In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder\
        .appName("Advanced_Spark")\
        .getOrCreate()

# Task:
Calculate the age group for each person:

"Young" if age < 30

"Adult" if age is between 30 and 45

"Senior" if age > 45

For each combination of City, Gender, and Age Group:

Count number of people

Compute average non-null salary

Categorize the average salary as:

"High" if avg salary ≥ 70000

"Medium" if avg salary between 50000 and 69999

"Low" if avg salary < 50000

"Unavailable" if average salary is null (e.g., all were null in that group)

In [4]:
people_schema = StructType([
    StructField("name",StringType(),True),
    StructField("age",IntegerType(),True),
    StructField("city",StringType(),True),
    StructField("salary",FloatType(),True),
    StructField("gender",StringType(),True)
])

In [5]:
df = spark.read.csv("People_1.csv",header=True,inferSchema=False,schema=people_schema)

In [6]:
df.show()

+-------+---+-------+-------+------+
|   name|age|   city| salary|gender|
+-------+---+-------+-------+------+
|  Alice| 29| Mumbai|50000.0|     F|
|    Bob| 34|   Pune|60000.0|     M|
|Charlie| 25|   Pune|45000.0|     M|
|  David| 42| Mumbai|80000.0|     M|
|    Eva| 30|Chennai|62000.0|     F|
|  Frank| 38|Chennai|   null|     M|
|  Ritch| 28| Mumbai|68000.0|     M|
|   Lory| 30|   Pune|81000.0|     F|
|   Karl| 26|   Pune|40000.0|     M|
+-------+---+-------+-------+------+



In [7]:
df_with_age_group = df.withColumn("age_group",when(df.age<30,"Young")
                         .when((df.age>=30) & (df.age<=45),"Adult")
                         .when(df.age>45,"Senior"))

In [8]:
df_with_age_group.show()

+-------+---+-------+-------+------+---------+
|   name|age|   city| salary|gender|age_group|
+-------+---+-------+-------+------+---------+
|  Alice| 29| Mumbai|50000.0|     F|    Young|
|    Bob| 34|   Pune|60000.0|     M|    Adult|
|Charlie| 25|   Pune|45000.0|     M|    Young|
|  David| 42| Mumbai|80000.0|     M|    Adult|
|    Eva| 30|Chennai|62000.0|     F|    Adult|
|  Frank| 38|Chennai|   null|     M|    Adult|
|  Ritch| 28| Mumbai|68000.0|     M|    Young|
|   Lory| 30|   Pune|81000.0|     F|    Adult|
|   Karl| 26|   Pune|40000.0|     M|    Young|
+-------+---+-------+-------+------+---------+



In [9]:
number_of_people = df_with_age_group.groupBy("city","gender","age_group").agg(count("*").alias("people_count"))

In [10]:
avg_salary_column_added = df_with_age_group.filter(df.salary.isNotNull())\
.groupBy("city","gender","age_group").agg(avg("salary").alias("avg_salary"))

In [11]:
number_people_with_avg_salary = number_of_people.join(avg_salary_column_added,how='left',on=["city","gender","age_group"])

In [12]:
number_people_with_avg_salary = number_people_with_avg_salary.fillna({'avg_salary':0})

In [13]:
salary_level_column_added = number_people_with_avg_salary.withColumn("salary_level",when((number_people_with_avg_salary.avg_salary) >= 70000,"High")
                                                  .when((number_people_with_avg_salary.avg_salary >=50000) & (number_people_with_avg_salary.avg_salary <69999),"Medium")
                                                  .when((number_people_with_avg_salary.avg_salary < 50000),"Low")
                                                  .when((number_people_with_avg_salary.avg_salary.isNull()),"Unavailable"))
#                                                    .otherwise("Unavailable"))

In [14]:
salary_level_column_added.show()

+-------+------+---------+------------+----------+------------+
|   city|gender|age_group|people_count|avg_salary|salary_level|
+-------+------+---------+------------+----------+------------+
|   Pune|     M|    Young|           2|   42500.0|         Low|
|Chennai|     F|    Adult|           1|   62000.0|      Medium|
|   Pune|     M|    Adult|           1|   60000.0|      Medium|
|   Pune|     F|    Adult|           1|   81000.0|        High|
| Mumbai|     M|    Young|           1|   68000.0|      Medium|
| Mumbai|     M|    Adult|           1|   80000.0|        High|
|Chennai|     M|    Adult|           1|       0.0|         Low|
| Mumbai|     F|    Young|           1|   50000.0|      Medium|
+-------+------+---------+------------+----------+------------+



# Advanced Level – Question 2:

Create a report that shows for each city:

Total number of people

Number of people in each salary level ("High", "Medium", "Low", "Unavailable")

Percentage distribution of these salary levels

In [15]:
df = df.withColumn("Salary_Status",when(df.salary>=60000,"High")
                              .when((df.salary>=40000) & (df.salary<=59999),"Medium")
                              .when((df.salary<40000),"Low")
                              .when((df.salary.isNull()),"Unknown"))
add_salary_status_column = df.select("name","city","salary","salary_status")

In [16]:
city_gender_wise_cont = df.groupBy("city","gender").agg(count("*").alias("people_count"))

In [17]:
city_gender_wise_avg_salary_without_null = df.filter(df.salary.isNotNull())\
.groupBy("city","gender").agg(avg("salary").alias("avg_salary"))

In [18]:
city_gender_wise_avg_salary = city_gender_wise_cont.join(city_gender_wise_avg_salary_without_null,how='left',on=['city','gender'])

In [19]:
city_gender_wise_avg_salary = city_gender_wise_avg_salary.fillna({"avg_salary":0})

In [20]:
city_gender_wise_avg_salary.show()

+-------+------+------------+------------------+
|   city|gender|people_count|        avg_salary|
+-------+------+------------+------------------+
|Chennai|     F|           1|           62000.0|
| Mumbai|     F|           1|           50000.0|
|   Pune|     M|           3|48333.333333333336|
|   Pune|     F|           1|           81000.0|
|Chennai|     M|           1|               0.0|
| Mumbai|     M|           2|           74000.0|
+-------+------+------------+------------------+



In [21]:
#city wise count
city_wise_count = add_salary_status_column.groupBy("city").agg(count("*").alias("peoples_in_city"))

In [22]:
people_by_salary_status_count = add_salary_status_column.groupBy("city","salary_status").agg(count("*").alias("total_people"))

In [23]:
people_by_salary_status = people_by_salary_status_count.join(city_wise_count,how='left',on=['city'])

In [24]:
people_by_salary_status_percentage = people_by_salary_status.withColumn("%_percentage",round((col("total_people")/col("peoples_in_city"))*100,2))

In [25]:
people_by_salary_status_percentage.show()

+-------+-------------+------------+---------------+------------+
|   city|salary_status|total_people|peoples_in_city|%_percentage|
+-------+-------------+------------+---------------+------------+
|Chennai|      Unknown|           1|              2|        50.0|
| Mumbai|       Medium|           1|              3|       33.33|
|   Pune|         High|           2|              4|        50.0|
| Mumbai|         High|           2|              3|       66.67|
|   Pune|       Medium|           2|              4|        50.0|
|Chennai|         High|           1|              2|        50.0|
+-------+-------------+------------+---------------+------------+



In [26]:
salary_status_counts = people_by_salary_status_count.groupBy("city").pivot("salary_status").sum("total_people")

In [27]:
salary_status_counts = salary_status_counts.fillna(0)

In [28]:
salary_status_counts = salary_status_counts.withColumnRenamed("High","high_count")\
                    .withColumnRenamed("Medium","medium_count")\
                    .withColumnRenamed("Unknown","unavailable_count")\
                    .withColumnRenamed("Low","low_count")

In [37]:
# temp_df = people_by_salary_status_percentage.groupBy("city","total_people","peoples_in_city").pivot("salary_status").sum("%_percentage")
temp_df = people_by_salary_status_percentage.groupBy("city").pivot("salary_status").sum("%_percentage")

In [38]:
temp_df = temp_df.withColumnRenamed("High","high_%")\
                .withColumnRenamed("Medium","medium_%")\
                .withColumnRenamed("Low","low_%")\
                .withColumnRenamed("Unknown","unknown_%")

In [39]:
temp_df = temp_df.fillna(0)

In [40]:
temp_df.show()

+-------+------+--------+---------+
|   city|high_%|medium_%|unknown_%|
+-------+------+--------+---------+
|Chennai|  50.0|     0.0|     50.0|
| Mumbai| 66.67|   33.33|      0.0|
|   Pune|  50.0|    50.0|      0.0|
+-------+------+--------+---------+



In [41]:
final_df = temp_df.join(city_wise_count, on="city", how="left")
final_df = final_df.join(salary_status_counts, on="city", how="left")
final_df = final_df.join(city_wise_count)

In [42]:
final_df.show()

+-------+------+--------+---------+---------------+----------+------------+-----------------+-------+---------------+
|   city|high_%|medium_%|unknown_%|peoples_in_city|high_count|medium_count|unavailable_count|   city|peoples_in_city|
+-------+------+--------+---------+---------------+----------+------------+-----------------+-------+---------------+
|Chennai|  50.0|     0.0|     50.0|              2|         1|           0|                1|Chennai|              2|
|Chennai|  50.0|     0.0|     50.0|              2|         1|           0|                1| Mumbai|              3|
|Chennai|  50.0|     0.0|     50.0|              2|         1|           0|                1|   Pune|              4|
| Mumbai| 66.67|   33.33|      0.0|              3|         2|           1|                0|Chennai|              2|
| Mumbai| 66.67|   33.33|      0.0|              3|         2|           1|                0| Mumbai|              3|
| Mumbai| 66.67|   33.33|      0.0|              3|     

##  Tasks

Drop rows where either UnitsSold or UnitPrice is null.

Add a new column: TotalSale = UnitsSold * UnitPrice.

Find average TotalSale per Region and Category.

Use pivot() to transform Region-wise average sales, with Category as columns.

Fill any nulls in the pivoted result with 0.

In [55]:
df_schema = StructType([
    StructField("productid",IntegerType(),True),
    StructField("producname",StringType(),True),
    StructField("category",StringType(),True),
    StructField("regionid",StringType(),True),
    StructField("unitsold",IntegerType(),True),
    StructField("unitprice",IntegerType(),True)
])

In [57]:
sales_df = spark.read.csv("sales.csv",inferSchema=False,header=True,schema=df_schema)

In [58]:
sales_df.show()

+---------+----------+-----------+--------+--------+---------+
|productid|producname|   category|regionid|unitsold|unitprice|
+---------+----------+-----------+--------+--------+---------+
|      101|    Laptop|Electronics|    West|      10|    50000|
|      102|    Laptop|Electronics|   North|       5|    52000|
|      103|     Chair|  Furniture|    West|       7|     1500|
|      104|     Table|  Furniture|    East|       3|     3000|
|      105|    Mobile|Electronics|   South|    null|    20000|
|      106|      Sofa|  Furniture|   South|       2|     null|
|      107|    Laptop|Electronics|    West|       4|    51000|
+---------+----------+-----------+--------+--------+---------+



In [61]:
sales_without_null = sales_df.na.drop(subset=["unitsold","unitprice"])

In [66]:
sale_with_total_sale = sales_without_null.withColumn("totalsale",(sales_without_null.unitsold * sales_without_null.unitprice))

In [68]:
avg_sale_region_category = sale_with_total_sale.groupBy("category","regionid").agg(avg("totalsale").alias("avg_sale_category_region"))

In [71]:
avg_sale_region_category.show()

+-----------+--------+------------------------+
|   category|regionid|avg_sale_category_region|
+-----------+--------+------------------------+
|  Furniture|    East|                  9000.0|
|Electronics|   North|                260000.0|
|  Furniture|    West|                 10500.0|
|Electronics|    West|                352000.0|
+-----------+--------+------------------------+



In [73]:
category_region_sales = avg_sale_region_category.groupBy("category").pivot("regionid").sum("avg_sale_category_region")

In [74]:
category_region_sales = category_region_sales.fillna(0)

In [75]:
category_region_sales.show()

+-----------+------+--------+--------+
|   category|  East|   North|    West|
+-----------+------+--------+--------+
|Electronics|   0.0|260000.0|352000.0|
|  Furniture|9000.0|     0.0| 10500.0|
+-----------+------+--------+--------+



## Your Tasks

Remove rows where either units_sold or revenue is null.

Add a rank column within each category based on the revenue, in descending order.

Filter the top 2 products per category based on the revenue.

Show final columns: category, product_name, revenue, rank

In [77]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [78]:
sale_with_total_sale.show()

+---------+----------+-----------+--------+--------+---------+---------+
|productid|producname|   category|regionid|unitsold|unitprice|totalsale|
+---------+----------+-----------+--------+--------+---------+---------+
|      101|    Laptop|Electronics|    West|      10|    50000|   500000|
|      102|    Laptop|Electronics|   North|       5|    52000|   260000|
|      103|     Chair|  Furniture|    West|       7|     1500|    10500|
|      104|     Table|  Furniture|    East|       3|     3000|     9000|
|      107|    Laptop|Electronics|    West|       4|    51000|   204000|
+---------+----------+-----------+--------+--------+---------+---------+



In [83]:
window_spec = Window.partitionBy("category").orderBy(col("totalsale").desc())

In [87]:
sales_with_rank = sale_with_total_sale.withColumn("rnk_number",row_number().over(window_spec))

In [89]:
top_2_products = sales_with_rank.filter(col("rnk_number")<=2)

In [90]:
top_2_products.select("category","producname","totalsale","rnk_number").show()

+-----------+----------+---------+----------+
|   category|producname|totalsale|rnk_number|
+-----------+----------+---------+----------+
|Electronics|    Laptop|   260000|         2|
|Electronics|    Laptop|   204000|         3|
|  Furniture|     Table|     9000|         2|
+-----------+----------+---------+----------+

