## Import statements

In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

## Create a dataframe

In [0]:
df = spark.read.format("csv").option("header", True).load("/FileStore/tables/store_sales/train.csv")

In [0]:
df.show(5)

+---+----------+---------+----------+-----+-----------+
| id|      date|store_nbr|    family|sales|onpromotion|
+---+----------+---------+----------+-----+-----------+
|  0|2013-01-01|        1|AUTOMOTIVE|  0.0|          0|
|  1|2013-01-01|        1| BABY CARE|  0.0|          0|
|  2|2013-01-01|        1|    BEAUTY|  0.0|          0|
|  3|2013-01-01|        1| BEVERAGES|  0.0|          0|
|  4|2013-01-01|        1|     BOOKS|  0.0|          0|
+---+----------+---------+----------+-----+-----------+
only showing top 5 rows



In [0]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- sales: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
df = df.withColumn("date", f.to_date("date", "yyyy-MM-dd")).withColumn("id", df["id"].cast(IntegerType())).withColumn("store_nbr", df["store_nbr"].cast(IntegerType())).withColumn("sales", df["sales"].cast(FloatType())).withColumn("onpromotion",df["onpromotion"].cast(IntegerType()))

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- store_nbr: integer (nullable = true)
 |-- family: string (nullable = true)
 |-- sales: float (nullable = true)
 |-- onpromotion: integer (nullable = true)



## Aggregates
Functions commonly used are count, sum, avg, min and max

In [0]:
# Here count is the method availabel on spark df. Ref document: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.count.html#pyspark.sql.DataFrame.count
# This is an action so result will be calculated by calling the method. 
df.count()

Out[10]: 3000888

In [0]:
# This is a function and a transformation. An action is required to trigger execution. Doc: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.count.html#pyspark.sql.functions.count
df.select(f.count("*")).show()

+--------+
|count(1)|
+--------+
| 3000888|
+--------+



In [0]:
df.select("family", "sales").groupby("family").agg(f.count("*")).show(10)

+-------------------+--------+
|             family|count(1)|
+-------------------+--------+
|     PREPARED FOODS|   90936|
|HOME AND KITCHEN II|   90936|
|         LADIESWEAR|   90936|
|    LAWN AND GARDEN|   90936|
|          GROCERY I|   90936|
|          BABY CARE|   90936|
|            PRODUCE|   90936|
|         AUTOMOTIVE|   90936|
|          BEVERAGES|   90936|
|          HOME CARE|   90936|
+-------------------+--------+
only showing top 10 rows



In [0]:
df.select("family", "sales").groupby("family").agg(f.sum("sales")).show(10)

+-------------------+--------------------+
|             family|          sum(sales)|
+-------------------+--------------------+
|     PREPARED FOODS|    8799895.11675644|
|HOME AND KITCHEN II|           1520670.0|
|         LADIESWEAR|            651159.0|
|    LAWN AND GARDEN|            548842.0|
|          GROCERY I| 3.434627348690796E8|
|          BABY CARE|             10051.0|
|            PRODUCE|1.2270468468069077E8|
|         AUTOMOTIVE|            554822.0|
|          BEVERAGES|        2.16954486E8|
|          HOME CARE|         1.6022744E7|
+-------------------+--------------------+
only showing top 10 rows



In [0]:
df.select("family", "sales").groupby("family").agg(f.sum("sales").alias("total_sales")).withColumn("total_sales",f.round("total_sales", 2)).show(10)

+-------------------+--------------+
|             family|   total_sales|
+-------------------+--------------+
|     PREPARED FOODS|    8799895.12|
|HOME AND KITCHEN II|     1520670.0|
|         LADIESWEAR|      651159.0|
|    LAWN AND GARDEN|      548842.0|
|          GROCERY I|3.4346273487E8|
|          BABY CARE|       10051.0|
|            PRODUCE|1.2270468468E8|
|         AUTOMOTIVE|      554822.0|
|          BEVERAGES|  2.16954486E8|
|          HOME CARE|   1.6022744E7|
+-------------------+--------------+
only showing top 10 rows



In [0]:
df.select("family", "sales").groupby("family").agg(f.sum("sales").alias("total_sales")).withColumn("total_sales",f.format_number(f.round("total_sales", 2), 20)).sort("total_sales", ascending=False).show(10)

+--------------------+--------------------+
|              family|         total_sales|
+--------------------+--------------------+
|            CLEANING|97,521,289.000000...|
|      PREPARED FOODS|8,799,895.1200000...|
|         CELEBRATION|761,177.000000000...|
|    LIQUOR,WINE,BEER|7,746,640.0000000...|
|            LINGERIE|653,114.000000000...|
|          LADIESWEAR|651,159.000000000...|
|               DAIRY|64,487,709.000000...|
|               BOOKS|6,438.00000000000...|
|PLAYERS AND ELECT...|562,608.000000000...|
|          AUTOMOTIVE|554,822.000000000...|
+--------------------+--------------------+
only showing top 10 rows



In [0]:
df.groupBy().min().show()

+-------+--------------+----------+----------------+
|min(id)|min(store_nbr)|min(sales)|min(onpromotion)|
+-------+--------------+----------+----------------+
|      0|             1|       0.0|               0|
+-------+--------------+----------+----------------+



In [0]:
df.groupBy().max().show()

+-------+--------------+----------+----------------+
|max(id)|max(store_nbr)|max(sales)|max(onpromotion)|
+-------+--------------+----------+----------------+
|3000887|            54|  124717.0|             741|
+-------+--------------+----------+----------------+



In [0]:
df.columns

Out[47]: ['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion']

In [0]:
df.groupBy("store_nbr","family").max("onpromotion").sort("max(onpromotion)", ascending=False).show(10)

+---------+---------+----------------+
|store_nbr|   family|max(onpromotion)|
+---------+---------+----------------+
|       53|GROCERY I|             741|
|       36|GROCERY I|             697|
|       33|GROCERY I|             591|
|       54|GROCERY I|             551|
|       16|GROCERY I|             519|
|       35|GROCERY I|             425|
|       53|BEVERAGES|             342|
|       36|BEVERAGES|             330|
|       53| CLEANING|             285|
|       33|BEVERAGES|             263|
+---------+---------+----------------+
only showing top 10 rows



In [0]:
df.groupby("family").sum("sales", "onpromotion").show(10)

+-------------------+--------------------+----------------+
|             family|          sum(sales)|sum(onpromotion)|
+-------------------+--------------------+----------------+
|     PREPARED FOODS|    8799895.11675644|           37524|
|HOME AND KITCHEN II|           1520670.0|           32814|
|         LADIESWEAR|            651159.0|            1680|
|    LAWN AND GARDEN|            548842.0|           11208|
|          GROCERY I| 3.434627348690796E8|         1914801|
|          BABY CARE|             10051.0|              53|
|            PRODUCE|1.2270468468069077E8|         1117921|
|         AUTOMOTIVE|            554822.0|            4783|
|          BEVERAGES|        2.16954486E8|          906958|
|          HOME CARE|         1.6022744E7|          197230|
+-------------------+--------------------+----------------+
only showing top 10 rows



In [0]:
df.groupby("family").agg(f.sum("sales").alias("total_sales"), f.sum("onpromotion").alias("onpromo_total"), f.avg("sales").alias("avg_sales")).sort("avg_sales", ascending=False).show(10)

+-------------+--------------------+-------------+------------------+
|       family|         total_sales|onpromo_total|         avg_sales|
+-------------+--------------------+-------------+------------------+
|    GROCERY I| 3.434627348690796E8|      1914801|3776.9720998183293|
|    BEVERAGES|        2.16954486E8|       906958|2385.7931512272366|
|      PRODUCE|1.2270468468069077E8|      1117921|1349.3521232591138|
|     CLEANING|         9.7521289E7|       661157|1072.4167436438813|
|        DAIRY|         6.4487709E7|       728707| 709.1548891528108|
| BREAD/BAKERY| 4.213394557421875E7|       331289| 463.3362537852858|
|      POULTRY| 3.187600447331786E7|       226421| 350.5322916481686|
|        MEATS|3.1086468399745286E7|       304028| 341.8499648076151|
|PERSONAL CARE|         2.4592051E7|       246928|270.43251297615905|
|         DELI|2.4110322464111328E7|       583316| 265.1350671253555|
+-------------+--------------------+-------------+------------------+
only showing top 10 

In [0]:
df.groupby("family").agg(f.sum("sales").alias("total_sales"), f.sum("onpromotion").alias("onpromo_total"), f.avg("sales").alias("avg_sales")).show(10)

+-------------------+--------------------+-------------+-------------------+
|             family|         total_sales|onpromo_total|          avg_sales|
+-------------------+--------------------+-------------+-------------------+
|     PREPARED FOODS|    8799895.11675644|        37524|  96.77020230443871|
|HOME AND KITCHEN II|           1520670.0|        32814| 16.722420163631565|
|         LADIESWEAR|            651159.0|         1680|   7.16062945368171|
|    LAWN AND GARDEN|            548842.0|        11208|  6.035475499252222|
|          GROCERY I| 3.434627348690796E8|      1914801| 3776.9720998183293|
|          BABY CARE|             10051.0|           53|0.11052828362804609|
|            PRODUCE|1.2270468468069077E8|      1117921| 1349.3521232591138|
|         AUTOMOTIVE|            554822.0|         4783|  6.101236034133897|
|          BEVERAGES|        2.16954486E8|       906958| 2385.7931512272366|
|          HOME CARE|         1.6022744E7|       197230| 176.19802938330255|

In [0]:
df.groupby("family").agg({'sales': 'sum', 'onpromotion' : 'avg'}).show(10)

+-------------------+--------------------+--------------------+
|             family|    avg(onpromotion)|          sum(sales)|
+-------------------+--------------------+--------------------+
|     PREPARED FOODS|   0.412641858010029|    8799895.11675644|
|HOME AND KITCHEN II| 0.36084718923198733|           1520670.0|
|         LADIESWEAR|0.018474531538664556|            651159.0|
|    LAWN AND GARDEN| 0.12325151755080496|            548842.0|
|          GROCERY I|   21.05657825283716| 3.434627348690796E8|
|          BABY CARE|5.828274830650127E-4|             10051.0|
|            PRODUCE|  12.293492126330607|1.2270468468069077E8|
|         AUTOMOTIVE| 0.05259743116037653|            554822.0|
|          BEVERAGES|   9.973585818597694|        2.16954486E8|
|          HOME CARE|   2.168888009149292|         1.6022744E7|
+-------------------+--------------------+--------------------+
only showing top 10 rows

