In [1]:
# pip install pyspark

In [2]:
# pip install pyspark findspark

In [3]:
# findspark.init() is called before the pyspark imports below on purpose:
# findspark's job is to make a local Spark installation importable
# (it adds pyspark to sys.path), so it has to run first.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql.functions import count
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("Day1_Spark_Basics")
    .getOrCreate()
)

In [5]:
# Explicit schema for people.csv: Name and City are strings, Age is an
# integer, and all three columns are nullable.
people_schema = StructType(
    [
        StructField("Name", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
        StructField("City", StringType(), nullable=True),
    ]
)

In [6]:
# Load people.csv with the explicit schema above. The first line of the file
# is treated as a header and schema inference stays switched off.
df = (
    spark.read
    .schema(people_schema)
    .options(header=True, inferSchema=False)
    .csv("people.csv")
)

In [7]:
# Print the rows loaded from people.csv to check how the file was parsed.
df.show()

+-------+----+---------+
|   Name| Age|     City|
+-------+----+---------+
|  Alice|  29|   Mumbai|
|    Bob|  34|     Pune|
|Charlie|  25|    Delhi|
|  David|  42|   Mumbai|
|    Eva|  30|  Chennai|
|    Tom|null|Ahmedabad|
+-------+----+---------+



In [8]:
# Number of rows per city. count("*") counts every row in the group,
# including rows whose Age is null.
(
    df.groupBy("City")
    .agg(count("*").alias("total_people"))
    .show(truncate=False)
)

+---------+------------+
|City     |total_people|
+---------+------------+
|Chennai  |1           |
|Mumbai   |2           |
|Ahmedabad|1           |
|Pune     |1           |
|Delhi    |1           |
+---------+------------+



In [9]:
# Keep the grouped DataFrame in city_wise_count so later cells can reuse it.
# show() only prints and returns None, so chaining it into the assignment
# would bind the variable to None instead of the result.
city_wise_count = df.groupBy("City").count()
city_wise_count.show(truncate=False)

+---------+-----+
|City     |count|
+---------+-----+
|Chennai  |1    |
|Mumbai   |2    |
|Ahmedabad|1    |
|Pune     |1    |
|Delhi    |1    |
+---------+-----+



In [10]:
# Project only the Name and Age columns.
df.select("Name", "Age").show()

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|  29|
|    Bob|  34|
|Charlie|  25|
|  David|  42|
|    Eva|  30|
|    Tom|null|
+-------+----+



In [11]:
# Keep people aged 30 or older. A row with a null Age does not satisfy the
# comparison, so it is filtered out as well.
filtered_df = df.where(col("Age") >= 30)
filtered_df.select("Name", "Age").show()

+-----+---+
| Name|Age|
+-----+---+
|  Bob| 34|
|David| 42|
|  Eva| 30|
+-----+---+



#### Add a new column called age_group based on the following logic:
#### "Young" if age < 30
#### "Adult" if age is between 30 and 40 (inclusive)
#### "Senior" if age > 40

In [12]:
# Bucket each person by Age: under 30 -> "Young", 30 to 40 inclusive ->
# "Adult", over 40 -> "Senior". There is no otherwise() branch, so a null Age
# matches none of the conditions and age_group stays null.
adding_age_group = df.withColumn(
    "age_group",
    when(df["Age"] < 30, "Young")
    .when(df["Age"].between(30, 40), "Adult")
    .when(df["Age"] > 40, "Senior"),
)
adding_age_group.show()

+-------+----+---------+---------+
|   Name| Age|     City|age_group|
+-------+----+---------+---------+
|  Alice|  29|   Mumbai|    Young|
|    Bob|  34|     Pune|    Adult|
|Charlie|  25|    Delhi|    Young|
|  David|  42|   Mumbai|   Senior|
|    Eva|  30|  Chennai|    Adult|
|    Tom|null|Ahmedabad|     null|
+-------+----+---------+---------+



#### Find the average age of people per city.
#### Show the output as:
#### city | avg_age

In [13]:
# Mean Age per city. A city whose only Age value is null gets a null average.
# NOTE(review): the task text asks for an "avg_age" column, but the alias here
# is "CitywiseAverageAge" -- confirm which name is wanted before renaming.
CitywiseAverageAge = (
    df.groupBy("City")
    .agg(avg(col("Age")).alias("CitywiseAverageAge"))
)

In [14]:
# Print the per-city average ages computed in the previous cell.
CitywiseAverageAge.show()

+---------+------------------+
|     City|CitywiseAverageAge|
+---------+------------------+
|  Chennai|              30.0|
|   Mumbai|              35.5|
|Ahmedabad|              null|
|     Pune|              34.0|
|    Delhi|              25.0|
+---------+------------------+



In [15]:
# Re-display df: the cells above built new DataFrames and never reassigned
# df, so it still holds the rows as loaded.
df.show()

+-------+----+---------+
|   Name| Age|     City|
+-------+----+---------+
|  Alice|  29|   Mumbai|
|    Bob|  34|     Pune|
|Charlie|  25|    Delhi|
|  David|  42|   Mumbai|
|    Eva|  30|  Chennai|
|    Tom|null|Ahmedabad|
+-------+----+---------+



#### Remove all rows where the Age is null or missing

In [18]:
# Drop only the rows whose Age is null. A bare na.drop() removes a row when
# ANY column is null, so it would also discard people with a missing Name or
# City, which the task does not ask for.
remove_null = df.na.drop(subset=["Age"])

In [19]:
# Print the rows that remain after the null rows were dropped.
remove_null.show()

+-------+---+-------+
|   Name|Age|   City|
+-------+---+-------+
|  Alice| 29| Mumbai|
|    Bob| 34|   Pune|
|Charlie| 25|  Delhi|
|  David| 42| Mumbai|
|    Eva| 30|Chennai|
+-------+---+-------+



In [25]:
# Oldest first. With a descending sort the null Age row ends up last.
df.orderBy(col("Age").desc()).show()

+-------+----+---------+
|   Name| Age|     City|
+-------+----+---------+
|  David|  42|   Mumbai|
|    Bob|  34|     Pune|
|    Eva|  30|  Chennai|
|  Alice|  29|   Mumbai|
|Charlie|  25|    Delhi|
|    Tom|null|Ahmedabad|
+-------+----+---------+



In [30]:
# Three oldest people: sort by Age descending, then keep the first three rows.
(
    df.select("Name", "Age", "City")
    .sort(col("Age").desc())
    .limit(3)
    .show()
)

+-----+---+-------+
| Name|Age|   City|
+-----+---+-------+
|David| 42| Mumbai|
|  Bob| 34|   Pune|
|  Eva| 30|Chennai|
+-----+---+-------+



#### Read both CSVs (people_mumbai.csv and people_pune.csv) into separate DataFrames

#### Combine them into one DataFrame

#### Remove any duplicate rows

#### Show the result

In [41]:
# Schema for people_mumbai.csv: string Name, integer Age, string City,
# all nullable.
mumbai_people_schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("City", StringType(), True)
)

In [42]:
# Load people_mumbai.csv with its explicit schema; the first line is a header.
mumbai_people_df = spark.read.csv(
    "people_mumbai.csv",
    schema=mumbai_people_schema,
    header=True,
    inferSchema=False,
)

In [43]:
# Schema for people_pune.csv: string Name, integer Age, string City,
# all nullable.
# The variable name keeps its original spelling ("scheam") because the cell
# that reads people_pune.csv refers to it by this name.
pune_scheam = StructType(
    [
        StructField("Name", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
        StructField("City", StringType(), nullable=True),
    ]
)

In [44]:
# Load people_pune.csv with its explicit schema; the first line is a header.
pune_people_df = (
    spark.read
    .schema(pune_scheam)
    .options(header=True, inferSchema=False)
    .csv("people_pune.csv")
)

In [46]:
# Print the rows read from people_pune.csv.
pune_people_df.show()

+-------+---+------+
|   Name|Age|  City|
+-------+---+------+
|    Bob| 34|  Pune|
|Charlie| 25|  Pune|
|    Eva| 30|Mumbai|
+-------+---+------+



In [50]:
# Stack the Mumbai and Pune DataFrames, then drop rows that are exact
# duplicates. union() pairs columns by position; both inputs are read with
# the same Name/Age/City column order.
union_two_df = (
    mumbai_people_df
    .union(pune_people_df)
    .distinct()
)

In [51]:
# Print the combined, de-duplicated rows.
union_two_df.show()

+-------+---+------+
|   Name|Age|  City|
+-------+---+------+
|    Eva| 30|Mumbai|
|  David| 42|Mumbai|
|  Alice| 29|Mumbai|
|Charlie| 25|  Pune|
|    Bob| 34|  Pune|
+-------+---+------+

