In [21]:
from pyspark.sql import SparkSession

In [22]:
# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark_session = (
    SparkSession.builder
    .appName("pyspark_dataframes_group_by_aggregation")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark_session

In [23]:
# Read the vendor CSV into a DataFrame. header=True takes the column names
# from the first line; inferSchema=True lets Spark work out the column types.
# NOTE(review): the path is absolute and machine-specific -- adjust it when
# running this notebook elsewhere.
spark_df = spark_session.read.csv(
    "file:///mnt/92D26AE0D26AC7D5/Python/pyspark/ararental_2.csv",
    header=True,
    inferSchema=True,
)
spark_df.show()

+----------------+---------------+--------------+--------------+-----+-----+
|     Vendor_Name|        Address|          City|           Fax|Years|Sales|
+----------------+---------------+--------------+--------------+-----+-----+
|American Rental |100 Grantley Ct| Sandy Springs|800  -714-7422|    3| 4530|
|    Attema Sales|  117 E 13th St|         Pella|641  -628-4983| null| 2300|
|     B & S Sales|  218 Maquan St|        Hanson|          null|    2| 8000|
|            null|    PO Box 3374|South Pasadena|818  -276-8409|    2| null|
|            null|           null|   Lees Summit|816  -524-6983|    5| 5000|
|    Attema Sales|   100 D Meg St|        Hanson|641  -628-4983| null| 2000|
|    Lovi Rental |         Meg Ct| Sandy Springs|800  -700-8422|    4| 6000|
+----------------+---------------+--------------+--------------+-----+-----+



In [29]:
# Discard every row that has a null in "Years" or in "Sales".
# This rebinds spark_df, so all later cells work on the filtered rows.
spark_df = spark_df.dropna(subset=["Years", "Sales"])

# groupBy() does not give back a DataFrame: it returns a GroupedData object,
# which has no show() method -- an aggregation has to be applied first.
type(spark_df.groupBy("City"))

pyspark.sql.group.GroupedData

In [33]:
# From here on the data is grouped and an aggregate function
# (sum(), count(), mean(), ...) is applied to each group.
# Calling sum() with no arguments totals every numeric column per "City".
(
    spark_df
    .groupBy("City")
    .sum()
    .show()
)

+-------------+----------+----------+
|         City|sum(Years)|sum(Sales)|
+-------------+----------+----------+
|Sandy Springs|         7|     10530|
|       Hanson|         2|      8000|
|  Lees Summit|         5|      5000|
+-------------+----------+----------+



In [34]:
# Restrict the aggregation to chosen columns by naming them in sum():
# total "Sales" for each "City".
(
    spark_df
    .groupBy("City")
    .sum("Sales")
    .show()
)

+-------------+----------+
|         City|sum(Sales)|
+-------------+----------+
|Sandy Springs|     10530|
|       Hanson|      8000|
|  Lees Summit|      5000|
+-------------+----------+



In [35]:
# Average "Sales" for each "City"; Spark labels the result column avg(Sales).
(
    spark_df
    .groupBy("City")
    .mean("Sales")
    .show()
)

+-------------+----------+
|         City|avg(Sales)|
+-------------+----------+
|Sandy Springs|    5265.0|
|       Hanson|    8000.0|
|  Lees Summit|    5000.0|
+-------------+----------+



In [36]:
# Number of rows in each "City" group. count() counts rows rather than
# values of a particular column, so no column selection is needed.
(
    spark_df
    .groupBy("City")
    .count()
    .show()
)

+-------------+-----+
|         City|count|
+-------------+-----+
|Sandy Springs|    2|
|       Hanson|    1|
|  Lees Summit|    1|
+-------------+-----+



In [42]:
# Aggregates can also be applied to the whole DataFrame without grouping:
# the dict maps a column name to the aggregate function to run on it.
(
    spark_df
    .agg({"Sales": "sum"})
    .show()
)

+----------+
|sum(Sales)|
+----------+
|     23530|
+----------+

