# Aggregate Function in DataFrame – Part 2

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the Spark session used by every cell below. getOrCreate() reuses a
# session that is already running and only builds a new one (with this app
# name) when none exists.
spark = (
    SparkSession.builder
    .appName('AggregationExamples')
    .getOrCreate()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 21, Finished, Available, Finished)

## Sample Data Creation

In [20]:
# Five sample employees, one tuple per row, laid out in schema order:
# (department, salary, bonus, employee_name).
data = [
    ('HR', 10000, 500, 'John'),
    ('Finance', 20000, 1500, 'Doe'),
    ('HR', 15000, 1000, 'Alice'),
    ('Finance', 25000, 2000, 'Eve'),
    ('HR', 20000, 1500, 'Mark'),
]

# Explicit schema: two string columns and two integer columns, all nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('department', StringType()),
        ('salary', IntegerType()),
        ('bonus', IntegerType()),
        ('employee_name', StringType()),
    )
])

# Build the DataFrame that every aggregation example below reads from,
# then print it so the input rows are visible.
df = spark.createDataFrame(data, schema=schema)
df.show()

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 22, Finished, Available, Finished)

+----------+------+-----+-------------+
|department|salary|bonus|employee_name|
+----------+------+-----+-------------+
|        HR| 10000|  500|         John|
|   Finance| 20000| 1500|          Doe|
|        HR| 15000| 1000|        Alice|
|   Finance| 25000| 2000|          Eve|
|        HR| 20000| 1500|         Mark|
+----------+------+-----+-------------+



## 1. Grouped Aggregation
- **sum()**: Adds values in a group.
- **avg()**: Computes the average of the values in a group.
- **max()**, **min()**: Find the maximum/minimum value in a group.

In [21]:
# One summary row per department: salary total and mean, plus the largest
# and smallest bonus.
# sum/max/min here are the pyspark.sql.functions versions brought in by the
# wildcard import, not Python's builtins of the same name.
(
    df
    .groupBy('department')
    .agg(
        sum('salary').alias('total_salary'),
        avg('salary').alias('avg_salary'),
        max('bonus').alias('max_bonus'),
        min('bonus').alias('min_bonus'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 23, Finished, Available, Finished)

+----------+------------+----------+---------+---------+
|department|total_salary|avg_salary|max_bonus|min_bonus|
+----------+------------+----------+---------+---------+
|        HR|       45000|   15000.0|     1500|      500|
|   Finance|       45000|   22500.0|     2000|     1500|
+----------+------------+----------+---------+---------+



## 2. Multiple Aggregations
Perform multiple aggregations in a single step.

In [22]:
# Several different aggregate functions over different columns in one agg()
# call: how many salary values each department has, its mean bonus, and its
# highest salary.
(
    df
    .groupBy('department')
    .agg(
        count('salary').alias('count_salary'),
        avg('bonus').alias('avg_bonus'),
        max('salary').alias('max_salary'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 24, Finished, Available, Finished)

+----------+------------+---------+----------+
|department|count_salary|avg_bonus|max_salary|
+----------+------------+---------+----------+
|        HR|           3|   1000.0|     20000|
|   Finance|           2|   1750.0|     25000|
+----------+------------+---------+----------+



## 3. Concatenate Strings

In [23]:
# Gather each department's employee names into an array with collect_list,
# then join the array into a single ', '-separated string with concat_ws.
# NOTE(review): the order of names follows the row order Spark happens to
# see after grouping, which is not guaranteed - confirm before relying on it.
(
    df
    .groupBy('department')
    .agg(
        concat_ws(', ', collect_list('employee_name')).alias('employees'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 25, Finished, Available, Finished)

+----------+-----------------+
|department|        employees|
+----------+-----------------+
|        HR|John, Alice, Mark|
|   Finance|         Doe, Eve|
+----------+-----------------+



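## 4. First and Last Values

`first()` and `last()` return the first and last value Spark encounters in each group. Their result depends on row order, which is not guaranteed after the grouping shuffle.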

In [24]:
# Pick the first and the last employee name encountered in each department.
# NOTE(review): no ordering is applied before grouping, so which rows count
# as "first" and "last" depends on the row order Spark produces - treat the
# result as illustrative, not stable.
(
    df
    .groupBy('department')
    .agg(
        first('employee_name').alias('first_employee'),
        last('employee_name').alias('last_employee'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 26, Finished, Available, Finished)

+----------+--------------+-------------+
|department|first_employee|last_employee|
+----------+--------------+-------------+
|   Finance|           Doe|          Eve|
|        HR|          John|         Mark|
+----------+--------------+-------------+



## 5. Standard Deviation and Variance

In [25]:
# Whole-table (ungrouped) spread of the salary column.
# The recorded output matches the sample statistics (divide by n - 1):
# variance 3.25E7 and its square root as the standard deviation.
(
    df
    .select(
        stddev('salary').alias('stddev_salary'),
        variance('salary').alias('variance_salary'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 27, Finished, Available, Finished)

+----------------+---------------+
|   stddev_salary|variance_salary|
+----------------+---------------+
|5700.87712549569|         3.25E7|
+----------------+---------------+



## 6. Aggregation with Alias

In [26]:
# alias() sets the name of the aggregated output column; here the
# per-department salary total is shown under the heading 'Total Salary'.
(
    df
    .groupBy('department')
    .agg(
        sum('salary').alias('Total Salary'),
    )
    .show()
)

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 28, Finished, Available, Finished)

+----------+------------+
|department|Total Salary|
+----------+------------+
|        HR|       45000|
|   Finance|       45000|
+----------+------------+



## 7. Sum of Distinct Values

In [27]:
# Add up each distinct salary exactly once, so a salary shared by several
# employees is not counted repeatedly (10000 + 20000 + 15000 + 25000 = 70000).
# sum_distinct is the current snake_case name; the camelCase sumDistinct
# spelling is deprecated since Spark 3.2 and emits a warning.
df.select(
    sum_distinct('salary').alias('sum_distinct_salary')
).show()

StatementMeta(, cf10214c-5cdc-4826-b2f6-fa2456a7be3f, 29, Finished, Available, Finished)

+-------------------+
|sum_distinct_salary|
+-------------------+
|              70000|
+-------------------+

