In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder\
        .appName("Advanced_Spark")\
        .getOrCreate()

In [29]:
people_schema = StructType([
    StructField("name",StringType(),True),
    StructField("age",IntegerType(),True),
    StructField("city",StringType(),True),
    StructField("salary",FloatType(),True),
    StructField("gender",StringType(),True)
])

In [30]:
df = spark.read.csv("People_1.csv",header=True,inferSchema=False,schema=people_schema)

In [31]:
df.show()

+-------+---+-------+-------+------+
|   name|age|   city| salary|gender|
+-------+---+-------+-------+------+
|  Alice| 29| Mumbai|50000.0|     F|
|    Bob| 34|   Pune|60000.0|     M|
|Charlie| 25|   Pune|45000.0|     M|
|  David| 42| Mumbai|80000.0|     M|
|    Eva| 30|Chennai|62000.0|     F|
|  Frank| 38|Chennai|   null|     M|
+-------+---+-------+-------+------+



In [32]:
df_with_age_group = df.withColumn("age_group",when(df.age<30,"Young")
                         .when((df.age>=30) & (df.age<=45),"Adult")
                         .when(df.age>45,"Senior"))

In [33]:
df_with_age_group.show()

+-------+---+-------+-------+------+---------+
|   name|age|   city| salary|gender|age_group|
+-------+---+-------+-------+------+---------+
|  Alice| 29| Mumbai|50000.0|     F|    Young|
|    Bob| 34|   Pune|60000.0|     M|    Adult|
|Charlie| 25|   Pune|45000.0|     M|    Young|
|  David| 42| Mumbai|80000.0|     M|    Adult|
|    Eva| 30|Chennai|62000.0|     F|    Adult|
|  Frank| 38|Chennai|   null|     M|    Adult|
+-------+---+-------+-------+------+---------+



In [77]:
number_of_people = df_with_age_group.groupBy("city","gender","age_group").agg(count("*").alias("people_count"))

In [78]:
avg_salary_column_added = df_with_age_group.filter(df.salary.isNotNull())\
.groupBy("city","gender","age_group").agg(avg("salary").alias("avg_salary"))

In [79]:
number_people_with_avg_salary = number_of_people.join(avg_salary_column_added,how='left',on=["city","gender","age_group"])

In [80]:
number_people_with_avg_salary = number_people_with_avg_salary.fillna({'avg_salary':0})

In [87]:
salary_level_column_added = number_people_with_avg_salary.withColumn("salary_level",when((number_people_with_avg_salary.avg_salary) >= 70000,"High")
                                                  .when((number_people_with_avg_salary.avg_salary >=50000) & (number_people_with_avg_salary.avg_salary <69999),"Medium")
                                                  .when((number_people_with_avg_salary.avg_salary < 50000),"Low")
                                                  .when((number_people_with_avg_salary.avg_salary.isNull()),"Unavailable"))
#                                                    .otherwise("Unavailable"))

In [88]:
salary_level_column_added.show()

+-------+------+---------+------------+----------+------------+
|   city|gender|age_group|people_count|avg_salary|salary_level|
+-------+------+---------+------------+----------+------------+
|   Pune|     M|    Young|           1|   45000.0|         Low|
|Chennai|     F|    Adult|           1|   62000.0|      Medium|
|   Pune|     M|    Adult|           1|   60000.0|      Medium|
| Mumbai|     M|    Adult|           1|   80000.0|        High|
|Chennai|     M|    Adult|           1|       0.0|         Low|
| Mumbai|     F|    Young|           1|   50000.0|      Medium|
+-------+------+---------+------------+----------+------------+

