## Purpose of script:
#### Reviewing Spark date and time operations
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('d_t')
    .getOrCreate()
)

In [3]:
# Read the stock CSV: the first row supplies column names and column types are inferred.
df = spark.read.csv(
    '../Datasets/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first five rows to check the parsed columns.
df.show(n=5)

+-------------------+----------+----------+------------------+------------------+---------+------------------+
|               Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows

In [16]:
from pyspark.sql.functions import (dayofmonth, hour,
                                   dayofyear, month,
                                   year, weekofyear,
                                   format_number)

In [7]:
# Extract the day-of-month component from each row's Date.
df.select(dayofmonth(df.Date)).show(5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
+----------------+
only showing top 5 rows



In [8]:
# Extract the hour component from Date; the rows shown are all stamped 00:00:00, so this is 0.
df.select(hour(df.Date)).show(5)

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 5 rows



In [9]:
# Extract the month number from each row's Date.
df.select(month(df.Date)).show(5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



In [12]:
# Goal: average closing price per year.
#
# Step 1: derive a 'Year' column from 'Date' so rows can be grouped by it.
year_df = df.withColumn('Year', year(df.Date))

In [15]:
# Step 2: average the closing price within each year.
# Passing 'Close' to mean() aggregates only that column instead of averaging
# every numeric column and discarding the rest afterwards. The result has
# exactly two columns, 'Year' and 'avg(Close)', which later cells rely on.
new_df = year_df.groupBy('Year').mean('Close')

In [21]:
# Step 3: format the yearly mean to two decimals under a friendlier column name.
# format_number yields a string column, so this is for display only.
# NOTE: this cell overwrites new_df and drops 'avg(Close)', so running it a
# second time without re-running the aggregation cell will fail.
new_df = new_df.select(
    'Year',
    format_number('avg(Close)', 2).alias('Average Close'),
)

In [23]:
# Step 4: sort chronologically, then display the per-year averages.
new_df = new_df.orderBy('Year', ascending=True)

new_df.show()

+----+-------------+
|Year|Average Close|
+----+-------------+
|2010|       259.84|
|2011|       364.00|
|2012|       576.05|
|2013|       472.63|
|2014|       295.40|
|2015|       120.04|
|2016|       104.60|
+----+-------------+



In [24]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()