In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql.functions import count
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder\
        .appName("PySpark_Intermediate")\
        .getOrCreate()

In [3]:
people_schema = StructType([
    StructField("Name",StringType(),True),
    StructField("Age",IntegerType(),True),
    StructField("City",StringType(),True),
    StructField("Salary",IntegerType(),True),
    StructField("Gender",StringType(),True)
])

In [4]:
df = spark.read.csv("People_1.csv", header=True, inferSchema=False, schema=people_schema)

In [5]:
df.show()

+-------+---+-------+------+------+
|   Name|Age|   City|Salary|Gender|
+-------+---+-------+------+------+
|  Alice| 29| Mumbai| 50000|     F|
|    Bob| 34|   Pune| 60000|     M|
|Charlie| 25|   Pune| 45000|     M|
|  David| 42| Mumbai| 80000|     M|
|    Eva| 30|Chennai| 62000|     F|
|  Frank| 38|Chennai|  null|     M|
|  Ritch| 28| Mumbai| 68000|     M|
|   Lory| 30|   Pune| 81000|     F|
|   Karl| 26|   Pune| 40000|     M|
+-------+---+-------+------+------+



For each city, calculate:

Total number of people

Average salary

Maximum salary

In [6]:
## Fill Null with 0 is not best practice

df.fillna(0).show()

df.groupBy("City").agg(count("*").alias("total_people"),
                      avg("Salary").alias("avg_salary"),
                      max("Salary").alias("max_salary")).show()

+-------+---+-------+------+------+
|   Name|Age|   City|Salary|Gender|
+-------+---+-------+------+------+
|  Alice| 29| Mumbai| 50000|     F|
|    Bob| 34|   Pune| 60000|     M|
|Charlie| 25|   Pune| 45000|     M|
|  David| 42| Mumbai| 80000|     M|
|    Eva| 30|Chennai| 62000|     F|
|  Frank| 38|Chennai|     0|     M|
|  Ritch| 28| Mumbai| 68000|     M|
|   Lory| 30|   Pune| 81000|     F|
|   Karl| 26|   Pune| 40000|     M|
+-------+---+-------+------+------+

+-------+------------+----------+----------+
|   City|total_people|avg_salary|max_salary|
+-------+------------+----------+----------+
|Chennai|           2|   62000.0|     62000|
| Mumbai|           3|   66000.0|     80000|
|   Pune|           4|   56500.0|     81000|
+-------+------------+----------+----------+



In [7]:
# Total people count

people_count_df = df.groupBy("City").agg(count("*").alias("total_people"))

In [8]:
# Calculating average salary and max salary by filtering null values

salary_df = df.filter(col("Salary").isNotNull())\
.groupBy("City").agg(avg("Salary").alias("avg_salary"),
                      max("Salary").alias("max_salary"))

In [9]:
final_df = people_count_df.join(salary_df,on="City",how="inner").show()

+-------+------------+----------+----------+
|   City|total_people|avg_salary|max_salary|
+-------+------------+----------+----------+
|Chennai|           2|   62000.0|     62000|
| Mumbai|           3|   66000.0|     80000|
|   Pune|           4|   56500.0|     81000|
+-------+------------+----------+----------+



Filter out only male employees

Group them by city

Calculate:

Number of males

Average salary of males (ignore nulls)

In [10]:
df.show()

+-------+---+-------+------+------+
|   Name|Age|   City|Salary|Gender|
+-------+---+-------+------+------+
|  Alice| 29| Mumbai| 50000|     F|
|    Bob| 34|   Pune| 60000|     M|
|Charlie| 25|   Pune| 45000|     M|
|  David| 42| Mumbai| 80000|     M|
|    Eva| 30|Chennai| 62000|     F|
|  Frank| 38|Chennai|  null|     M|
|  Ritch| 28| Mumbai| 68000|     M|
|   Lory| 30|   Pune| 81000|     F|
|   Karl| 26|   Pune| 40000|     M|
+-------+---+-------+------+------+



In [11]:
male_count = df.filter(df.Gender=="M")\
.groupBy("City").agg(count("*").alias("Male_Count"))

In [12]:
avg_salary = df.filter(df.Gender=='M')\
.filter((df.Salary.isNotNull()))\
.groupBy("City").agg(avg("Salary").alias("Avg_male_salary"))

In [13]:
male_city_count_avg_salry =  male_count.join(avg_salary,how='inner',on="City").show()

+------+----------+------------------+
|  City|Male_Count|   Avg_male_salary|
+------+----------+------------------+
|Mumbai|         2|           74000.0|
|  Pune|         3|48333.333333333336|
+------+----------+------------------+



Create a new column salary_status:

If salary ≥ 60000 → "High"

If salary between 40000 and 59999 → "Medium"

If salary < 40000 → "Low"

If salary is null → "Unknown"

Display the columns: name, city, salary, salary_status

In [14]:
df = df.withColumn("Salary_Status",when(df.Salary>=60000,"High")
                              .when((df.Salary>=40000) & (df.Salary<=59999),"Medium")
                              .when((df.Salary<40000),"Low")
                              .when((df.Salary.isNull()),"Unknown"))
add_salary_status_column = df.select("Name","City","Salary","Salary_Status")

Group by both City and Gender

Calculate:

Total number of people

Average salary per group (excluding nulls)

Final output columns:

nginx
Copy
Edit
City | Gender | People_Count | Avg_Salary

In [15]:
city_gender_wise_cont = df.groupBy("City","Gender").agg(count("*").alias("People_Count"))

In [16]:
city_gender_wise_avg_salary_without_null = df.filter(df.Salary.isNotNull())\
.groupBy("City","Gender").agg(avg("Salary").alias("Avg_Salary"))

In [17]:
city_gender_wise_avg_salary = city_gender_wise_cont.join(city_gender_wise_avg_salary_without_null,how='left',on=['City','Gender'])

In [18]:
city_gender_wise_avg_salary = city_gender_wise_avg_salary.fillna({"Avg_Salary":0})

In [19]:
city_gender_wise_avg_salary.show()

+-------+------+------------+------------------+
|   City|Gender|People_Count|        Avg_Salary|
+-------+------+------------+------------------+
|Chennai|     F|           1|           62000.0|
| Mumbai|     F|           1|           50000.0|
|   Pune|     M|           3|48333.333333333336|
|   Pune|     F|           1|           81000.0|
|Chennai|     M|           1|               0.0|
| Mumbai|     M|           2|           74000.0|
+-------+------+------------+------------------+



In [20]:
add_salary_status_column.show()

+-------+-------+------+-------------+
|   Name|   City|Salary|Salary_Status|
+-------+-------+------+-------------+
|  Alice| Mumbai| 50000|       Medium|
|    Bob|   Pune| 60000|         High|
|Charlie|   Pune| 45000|       Medium|
|  David| Mumbai| 80000|         High|
|    Eva|Chennai| 62000|         High|
|  Frank|Chennai|  null|      Unknown|
|  Ritch| Mumbai| 68000|         High|
|   Lory|   Pune| 81000|         High|
|   Karl|   Pune| 40000|       Medium|
+-------+-------+------+-------------+



In [21]:
#city wise count
city_wise_count = add_salary_status_column.groupBy("City").agg(count("*").alias("Peoples_In_City"))

In [22]:
people_by_salary_status_count = add_salary_status_column.groupBy("City","Salary_Status").agg(count("*").alias("Total_People"))

In [23]:
people_by_salary_status = people_by_salary_status_count.join(city_wise_count,how='left',on=['City'])

In [26]:
people_by_salary_status_percentage = people_by_salary_status.withColumn("%_Percentage",round((col("Total_People")/col("Peoples_In_City"))*100,2))

In [28]:
people_by_salary_status_percentage.show()

+-------+-------------+------------+---------------+------------+
|   City|Salary_Status|Total_People|Peoples_In_City|%_Percentage|
+-------+-------------+------------+---------------+------------+
|Chennai|      Unknown|           1|              2|        50.0|
| Mumbai|       Medium|           1|              3|       33.33|
|   Pune|         High|           2|              4|        50.0|
| Mumbai|         High|           2|              3|       66.67|
|   Pune|       Medium|           2|              4|        50.0|
|Chennai|         High|           1|              2|        50.0|
+-------+-------------+------------+---------------+------------+



In [36]:
temp_df = people_by_salary_status_percentage.groupBy("City").pivot("Salary_Status").sum("%_Percentage")

In [37]:
temp_df.show()

+-------+-----+------+-------+
|   City| High|Medium|Unknown|
+-------+-----+------+-------+
|Chennai| 50.0|  null|   50.0|
| Mumbai|66.67| 33.33|   null|
|   Pune| 50.0|  50.0|   null|
+-------+-----+------+-------+

