## Setting Environment Variables

In [1]:
import os
import sys
# Make both the Spark driver and its Python workers use the interpreter
# that is running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

## Creating a DataFrame

In [2]:
from pyspark.sql import SparkSession
# Get the active session if there is one, otherwise create one
# with the application name "Group By".
spark = (
    SparkSession.builder
    .appName("Group By")
    .getOrCreate()
)

In [3]:
# Sample rows, one tuple per employee:
# (employee_name, department, state, salary, age, bonus)
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Only column names are given here, so the column types are inferred
# from the row values (see the printed schema).
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
df = spark.createDataFrame(simpleData, schema)
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)



## Group By

### With a Single Column

In [4]:
# Number of employees per department, largest department first.
(
    df.groupBy("department")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+----------+-----+
|department|count|
+----------+-----+
|   Finance|    4|
|     Sales|    3|
| Marketing|    2|
+----------+-----+



In [5]:
from pyspark.sql.functions import col
# Average bonus per department; the grouping key is passed as a Column
# object instead of a plain string.
(
    df.groupBy(col("department"))
    .mean("bonus")
    .show()
)

+----------+------------------+
|department|        avg(bonus)|
+----------+------------------+
|     Sales|17666.666666666668|
|   Finance|           20250.0|
| Marketing|           19500.0|
+----------+------------------+



In [28]:
# The grouped max() shortcut produces a column named "max(bonus)".
# To give it a friendlier name, rename it afterwards with withColumnRenamed().
df.groupBy("department").max("bonus").withColumnRenamed("max(bonus)", "max_Bonus").show()

+----------+---------+
|department|max_Bonus|
+----------+---------+
|     Sales|    23000|
|   Finance|    24000|
| Marketing|    21000|
+----------+---------+



### With Multiple Columns

In [7]:
# Total salary and total bonus for every (department, state) pair.
(
    df.groupBy("department", "state")
    .sum("salary", "bonus")
    .show()
)

+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|     Sales|   NY|     176000|     30000|
|     Sales|   CA|      81000|     23000|
|   Finance|   CA|     189000|     47000|
|   Finance|   NY|     162000|     34000|
| Marketing|   NY|      91000|     21000|
| Marketing|   CA|      80000|     18000|
+----------+-----+-----------+----------+



In [9]:
# Same totals as above, but the grouping keys are given as Column objects
# and both the key order and the aggregate order are swapped.
(
    df.groupBy(df["state"], df["department"])
    .sum("bonus", "salary")
    .show()
)

+-----+----------+----------+-----------+
|state|department|sum(bonus)|sum(salary)|
+-----+----------+----------+-----------+
|   NY|     Sales|     30000|     176000|
|   CA|     Sales|     23000|      81000|
|   CA|   Finance|     47000|     189000|
|   NY|   Finance|     34000|     162000|
|   CA| Marketing|     18000|      80000|
|   NY| Marketing|     21000|      91000|
+-----+----------+----------+-----------+



### Multiple Aggregates at the Same Time

In [12]:
# NOTE: importing `sum` and `max` here replaces Python's built-in sum()/max()
# with the Spark column functions for every cell that runs after this one.
from pyspark.sql.functions import avg, sum, max

# Several aggregates per state in one pass; agg() lets each one carry its own alias.
(
    df.groupBy("state")
    .agg(
        sum("salary").alias("sum_salary"),
        avg("salary").alias("avg_salary"),
        sum("bonus").alias("sum_bonus"),
        max("bonus").alias("max_bonus"),
    )
    .show()
)

+-----+----------+----------+---------+---------+
|state|sum_salary|avg_salary|sum_bonus|max_bonus|
+-----+----------+----------+---------+---------+
|   NY|    429000|   85800.0|    85000|    21000|
|   CA|    350000|   87500.0|    88000|    24000|
+-----+----------+----------+---------+---------+



### Filtering the Aggregated Data

In [37]:
# Aggregate per department, then keep only the departments whose total salary
# is at most 300000. The filter runs on the aggregated column alias.
# `sum`, `avg` and `col` are the Spark functions imported in earlier cells.
(
    df.groupBy("department")
    .agg(
        sum("salary").alias("sum_salary"),
        avg("salary").alias("avg_salary"),
        sum("bonus").alias("sum_bonus"),
        avg("bonus").alias("avg_bonus"),
    )
    .filter(col("sum_salary") <= 300_000)
    .show()
)

+----------+----------+-----------------+---------+------------------+
|department|sum_salary|       avg_salary|sum_bonus|         avg_bonus|
+----------+----------+-----------------+---------+------------------+
|     Sales|    257000|85666.66666666667|    53000|17666.666666666668|
| Marketing|    171000|          85500.0|    39000|           19500.0|
+----------+----------+-----------------+---------+------------------+

