# Apache Spark Dataframe 

In this exercise, we are going to get some insights on stock market data. We use `walmart_stock.csv` file as our dataset to analyse the data.

### 1- Create an Apache Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.master("local").appName("Test Spark").getOrCreate()
sc = spark.sparkContext   

In [3]:
spark

### 2- Load the `walmart_stock.csv` file into a dataframe and infer the data schema

In [4]:
import os
MAIN_DIRECTORY = os.getcwd()
MAIN_DIRECTORY

'C:\\Users\\Syaidatul Syafira\\OneDrive - studentupmedumy.onmicrosoft.com\\Desktop\\DA\\Big Data Analytics with Apache Spark\\Apache Spark SC'

In [5]:
file_path = MAIN_DIRECTORY + "/walmart_stock.csv"

turn into dataframe spark.read

In [6]:
df1 = spark.read.csv("walmart_stock.csv", header = True, inferSchema = True)

### 3- Display the column names and print the dataframe schema

In [7]:
df1.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [8]:
df1.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



### 4- Print out the first five rows of the data

df.show will give you in tabular format, df.head not in tabular format

In [9]:
df1.show(5)

+-------------------+------------------+---------+---------+------------------+--------+------------------+
|               Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+-------------------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03 00:00:00|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04 00:00:00|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05 00:00:00|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06 00:00:00|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09 00:00:00|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+-------------------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



### 5- Use `describe()` method to get statistical information on the data 

In [10]:
df1.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

### 6- Use `format_number` function to format the numbers for just showing up to two decimal places. 
[format_number() documentation](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=format_number#pyspark.sql.functions.format_number)

In [11]:
from pyspark.sql.functions import format_number

des_result = df1.describe()

#we want to fix at describe table straight away

In [12]:
des_result.select(des_result["summary"], 
                format_number(des_result["Open"].cast("float"),2).alias("Open"),
                format_number(des_result["High"].cast("float"),2).alias("High"),
                format_number(des_result["Low"].cast("float"),2).alias("Low"),
                format_number(des_result["Close"].cast("float"),2).alias("Close"),
                format_number(des_result["Volume"].cast("int"),2).alias("Volume")).show()
                            

+-------+--------+--------+--------+--------+-------------+
|summary|    Open|    High|     Low|   Close|       Volume|
+-------+--------+--------+--------+--------+-------------+
|  count|1,258.00|1,258.00|1,258.00|1,258.00|     1,258.00|
|   mean|   72.36|   72.84|   71.92|   72.39| 8,222,093.00|
| stddev|    6.77|    6.77|    6.74|    6.76| 4,519,780.00|
|    min|   56.39|   57.06|   56.30|   56.42| 2,094,900.00|
|    max|   90.80|   90.97|   89.25|   90.47|80,898,100.00|
+-------+--------+--------+--------+--------+-------------+



### 7- Create a new coulmn called HV Ratio on a new dataframe that returns the ratio of the High Price versus volume of stock traded for a day.

In [13]:

hv_df = df1.withColumn('HV_Ratio', df1["High"]/ df1["Volume"])

In [14]:
hv_df.select('HV_Ratio').show(5)

+--------------------+
|            HV_Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
+--------------------+
only showing top 5 rows



In [15]:
hv_df.show(5)

+-------------------+------------------+---------+---------+------------------+--------+------------------+--------------------+
|               Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|            HV_Ratio|
+-------------------+------------------+---------+---------+------------------+--------+------------------+--------------------+
|2012-01-03 00:00:00|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|4.819714653321546E-6|
|2012-01-04 00:00:00|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|6.290848613094555E-6|
|2012-01-05 00:00:00|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|4.669412994783916E-6|
|2012-01-06 00:00:00|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|7.367338463826307E-6|
|2012-01-09 00:00:00|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215

### 8- What day had the Peak High in Price?

just use order by on df1, use [0] to show result of the first column index (date) 

In [16]:
df1.orderBy(df1['High'].desc()).head()[0]

datetime.datetime(2015, 1, 13, 0, 0)

### 9-What is the mean of the Close column?

In [17]:
from pyspark.sql.functions import mean

df1.agg({'Close':'mean'}).show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



### 10- How many days was the Close lower than 70 USD?

In [18]:
df1.filter(df1['Close'] < 70).count()

397

### 11-What percentage of the time was the High greater than 80 USD ?
#### In other words, (Number of High Days>80)/(Total Days in the dataframe)

In [20]:
(df1.filter(df1['High'] > 80).count() / df1.count()) * 100

9.141494435612083

### 12-What is the correlation between High and Volume?

In [21]:
df1.stat.corr('High', 'Volume')

-0.3384326061737161

### 13- What is the max High per year (use GroupBy)?

to use "Year" function, it must be imported first

In [28]:
from pyspark.sql.functions import max, year

In [30]:
year_df = df1.withColumn('Year', year(df1['Date']))

In [31]:
year_df.show(5)

+-------------------+------------------+---------+---------+------------------+--------+------------------+----+
|               Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|Year|
+-------------------+------------------+---------+---------+------------------+--------+------------------+----+
|2012-01-03 00:00:00|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|2012|
|2012-01-04 00:00:00|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|2012|
|2012-01-05 00:00:00|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|2012|
|2012-01-06 00:00:00|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|2012|
|2012-01-09 00:00:00|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|2012|
+-------------------+------------------+---------+---------+------------------+--------+--------

creating maximum dataframe for each columns and group by year

In [33]:
max_df = year_df.groupBy('Year').max()

In [35]:
max_df.show()

+----+-----------------+---------+---------+----------+-----------+-----------------+---------+
|Year|        max(Open)|max(High)| max(Low)|max(Close)|max(Volume)|   max(Adj Close)|max(Year)|
+----+-----------------+---------+---------+----------+-----------+-----------------+---------+
|2015|        90.800003|90.970001|    89.25| 90.470001|   80898100|84.91421600000001|     2015|
|2013|        81.209999|81.370003|    80.82| 81.209999|   25683700|        73.929868|     2013|
|2014|87.08000200000001|88.089996|86.480003| 87.540001|   22812400|81.70768000000001|     2014|
|2012|        77.599998|77.599998|76.690002| 77.150002|   38007300|        68.568371|     2012|
|2016|             74.5|75.190002|73.629997| 74.300003|   35076700|        73.233524|     2016|
+----+-----------------+---------+---------+----------+-----------+-----------------+---------+



In [36]:
max_df.select('Year', 'max(Year)').show()

+----+---------+
|Year|max(Year)|
+----+---------+
|2015|     2015|
|2013|     2013|
|2014|     2014|
|2012|     2012|
|2016|     2016|
+----+---------+



### 14- What is the average Close for each Calendar Month (close price for Jan,Feb, Mar, etc)?


In [37]:
from pyspark.sql.functions import max, month

In [40]:
month_df = df1.withColumn("Month", month(df1["Date"]))

In [42]:
month_df.show(5)

+-------------------+------------------+---------+---------+------------------+--------+------------------+-----+
|               Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|Month|
+-------------------+------------------+---------+---------+------------------+--------+------------------+-----+
|2012-01-03 00:00:00|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|    1|
|2012-01-04 00:00:00|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|    1|
|2012-01-05 00:00:00|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|    1|
|2012-01-06 00:00:00|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|    1|
|2012-01-09 00:00:00|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|    1|
+-------------------+------------------+---------+---------+------------------+--------+

In [43]:
from pyspark.sql.functions import mean

In [44]:
mean_df = month_df.groupBy("Month").mean()

In [45]:
mean_df.show()

+-----+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+----------+
|Month|        avg(Open)|        avg(High)|         avg(Low)|       avg(Close)|      avg(Volume)|   avg(Adj Close)|avg(Month)|
+-----+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+----------+
|   12|72.87952850943395|73.35566025471698|72.44481152830188|72.84792478301885|7967959.433962264|68.46031040566042|      12.0|
|    1|71.40811884158416|71.97009924752473|70.90425712871289|71.44801958415842|8761851.485148516|65.56887865346533|       1.0|
|    6| 72.5100938962264| 72.9262265471698|72.12198099056603| 72.4953774245283|8303756.603773585|67.43827906603772|       6.0|
|    3|71.69046716822429|72.20289709345795|71.31878489719628|71.77794377570092|7721836.448598131| 66.2763403084112|       3.0|
|    5|72.24349083962262|72.71783049056604|71.85292466981134|72.30971688679247|8632350.943396226|       67.0655

In [46]:
mean_df.select('Month', 'avg(Close)').show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|   12|72.84792478301885|
|    1|71.44801958415842|
|    6| 72.4953774245283|
|    3|71.77794377570092|
|    5|72.30971688679247|
|    9|72.18411785294116|
|    4|72.97361900952382|
|    8|73.02981855454546|
|    7|74.43971943925233|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|    2|  71.306804443299|
+-----+-----------------+



In [47]:
mean_df.select('Month', 'avg(Close)').orderBy("Month").show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+



#### Well Done!