In [1]:
from pyspark.sql import SparkSession, types, functions
# Obtain the Spark session for this notebook: getOrCreate() returns an
# existing session when there is one, otherwise builds one named "dates".
spark = (
    SparkSession.builder
    .appName("dates")
    .getOrCreate()
)

In [2]:
# Explicit schema for the stock CSV: one (column name, Spark type) pair per
# column, in file order. Supplying it up front means the reader uses these
# types instead of its own defaults.
# NOTE(review): IntegerType is 32-bit - confirm Volume never exceeds
# 2,147,483,647 in the data, otherwise LongType would be the safer choice.
schema_apple = types.StructType([
    types.StructField(column_name, column_type)
    for column_name, column_type in [
        ('Date', types.TimestampType()),
        ('Open', types.DoubleType()),
        ('High', types.DoubleType()),
        ('Low', types.DoubleType()),
        ('Close', types.DoubleType()),
        ('Volume', types.IntegerType()),
        ('Adj Close', types.DoubleType()),
    ]
])

# Read the CSV (first line is the header) using the schema declared above.
csv_reader = spark.read.format("CSV").option("header", "true").schema(schema_apple)
df_dates = csv_reader.load("../Arquivos/appl_stock.csv")

In [3]:
# Preview the parsed timestamps next to the opening price.
df_dates.select("Date", "Open").show()

+-------------------+------------------+
|               Date|              Open|
+-------------------+------------------+
|2010-01-04 00:00:00|        213.429998|
|2010-01-05 00:00:00|        214.599998|
|2010-01-06 00:00:00|        214.379993|
|2010-01-07 00:00:00|            211.75|
|2010-01-08 00:00:00|        210.299994|
|2010-01-11 00:00:00|212.79999700000002|
|2010-01-12 00:00:00|209.18999499999998|
|2010-01-13 00:00:00|        207.870005|
|2010-01-14 00:00:00|210.11000299999998|
|2010-01-15 00:00:00|210.92999500000002|
|2010-01-19 00:00:00|        208.330002|
|2010-01-20 00:00:00|        214.910006|
|2010-01-21 00:00:00|        212.079994|
|2010-01-22 00:00:00|206.78000600000001|
|2010-01-25 00:00:00|202.51000200000001|
|2010-01-26 00:00:00|205.95000100000001|
|2010-01-27 00:00:00|        206.849995|
|2010-01-28 00:00:00|        204.930004|
|2010-01-29 00:00:00|        201.079996|
|2010-02-01 00:00:00|192.36999699999998|
+-------------------+------------------+
only showing top 20 rows

In [4]:
# Extract the day-of-month component from the Date timestamp.
(
    df_dates
    .select(functions.dayofmonth(df_dates.Date))
    .show()
)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [5]:
# Extract the month component from the Date timestamp.
(
    df_dates
    .select(functions.month(df_dates.Date))
    .show()
)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



In [6]:
# Extract the year component from the Date timestamp.
(
    df_dates
    .select(functions.year(df_dates.Date))
    .show()
)

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 20 rows



In [7]:
# Derive a "Year" column from the Date timestamp so rows can be grouped by
# year; df_dates itself is left untouched.
new_df_dates = df_dates.withColumn(
    "Year",
    functions.year(df_dates.Date),
)

In [8]:
# Average closing price per year.
# mean() with no arguments asks for the average of every numeric column
# (including "Year" itself) only for all but one to be discarded by the
# select. Naming "Close" restricts the aggregation to the column that is
# actually used. The resulting frame still has exactly the columns "Year"
# and "avg(Close)", so code that reads "avg(Close)" keeps working.
result = (
    new_df_dates
    .groupBy("Year")
    .mean("Close")
    .select(["Year", "avg(Close)"])
)

In [9]:
# Give the aggregated column a readable name in place of the generated
# "avg(Close)" label.
new = result.withColumnRenamed(
    existing="avg(Close)",
    new="Average Closing Price",
)

In [10]:
# Display the yearly average close formatted to two decimal places under the
# shorter header "Avg Close".
# NOTE(review): 'year' differs in case from the "Year" column of `new`; this
# presumably relies on Spark's default case-insensitive column resolution -
# consider using "Year" for consistency. Rows are shown in whatever order the
# aggregation produced; no ordering is applied here.
(
    new
    .select(
        'year',
        functions.format_number("Average Closing Price", 2).alias("Avg Close"),
    )
    .show()
)

+----+---------+
|year|Avg Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+

