# Spark DataFrame
## Dates and Timestamps

In [2]:
import findspark

In [3]:
import os

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine's directory layout; fall back to the path
# this notebook was originally written against when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.0.0-bin-hadoop2.7'))

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Create the Spark session for this notebook (or reuse an existing one),
# registered under the application name 'dates'.
spark = (
    SparkSession.builder
    .appName('dates')
    .getOrCreate()
)

In [6]:
# Read the appl_stock.csv dataset into a DataFrame.
# header=True treats the first line as column names; inferSchema=True asks Spark
# to infer each column's type instead of reading everything as strings.
df = spark.read.csv(
    '../Dataset/Spark_DataFrames/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Peek at the first record; with an explicit count, head() returns a list of Row objects.
df.head(n=1)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [10]:
# Print the inferred schema (column names, types, nullability) to stdout ...
df.printSchema()
# ... then leave the list of column names as the cell's displayed result.
df.columns

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [12]:
# Show only the Date and Open columns, limited to the first three rows.
df.select('Date', 'Open').show(n=3)

+--------------------+----------+
|                Date|      Open|
+--------------------+----------+
|2010-01-04 00:00:...|213.429998|
|2010-01-05 00:00:...|214.599998|
|2010-01-06 00:00:...|214.379993|
+--------------------+----------+
only showing top 3 rows



In [14]:
from pyspark.sql.functions import dayofmonth,hour,dayofyear,month,year,weekofyear,format_number,date_format

In [15]:
# Extract the day-of-month component from each Date value and display it
# (show() with no argument prints the first 20 rows).
df.select(dayofmonth(df['Date'])).show()

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [18]:
# Extract the month number from each Date value and display it
# (show() with no argument prints the first 20 rows).
df.select(month(df['Date'])).show()

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



In [36]:
# Average closing price per calendar month: group rows by the month extracted
# from Date (across all years in the data), then take the mean of Close.
close_per_month = (
    df.groupBy(month("Date"))
      .mean("Close")
)

In [37]:
# Display the per-month averages. Note the rows come back in no particular
# month order (the output below starts with 12, 1, 6, ...).
close_per_month.show()

+-----------+------------------+
|month(Date)|        avg(Close)|
+-----------+------------------+
|         12|302.35053626845644|
|          1|322.20971425714276|
|          6|      288.12546566|
|          3|332.91156731372547|
|          5|351.62102085714304|
|          9| 301.0763195902777|
|          4|340.51041081506827|
|          8| 300.4385809612901|
|          7|281.72216211486483|
|         10|308.30552563157886|
|         11| 306.2725174895104|
|          2| 321.3595563037038|
+-----------+------------------+



In [40]:
# Derive a new DataFrame with an extra "Year" column holding the year
# extracted from Date; the original df is left untouched.
newdf = df.withColumn(
    "Year",
    year(df['Date']),
)

In [44]:
# Average closing price per year.
# Aggregate only the Close column: calling mean() with no arguments averages
# every numeric column, all but one of which were then discarded by a select.
# The result has exactly two columns, "Year" and "avg(Close)".
close_per_year = newdf.groupBy("Year").mean("Close")

In [48]:
# Replace the auto-generated aggregate column name with a readable one.
new_close = close_per_year.withColumnRenamed(
    "avg(Close)",
    "AvgClosingPrice",
)

In [50]:
# Display each year next to its average closing price, formatted to three
# decimal places and shown under the alias 'AvgClosingPrice 3 digits'.
new_close.select(
    'Year',
    format_number('AvgClosingPrice', 3).alias('AvgClosingPrice 3 digits'),
).show()

+----+------------------------+
|Year|AvgClosingPrice 3 digits|
+----+------------------------+
|2015|                 120.040|
|2013|                 472.635|
|2014|                 295.402|
|2012|                 576.050|
|2016|                 104.604|
|2010|                 259.842|
|2011|                 364.004|
+----+------------------------+

