Module 1: Setup & SparkSession Initialization

In [1]:
!pip install -q pyspark

In [2]:
# Import & Start Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BotCampus PySpark Practice") \
    .master("local[*]") \
    .getOrCreate()

# Create DataFrame
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)

In [3]:
# Show schema and data
df.printSchema()
df.show()

# Convert to RDD and show outputs
rdd = df.rdd
print(rdd.collect())
print(rdd.map(lambda x: (x.name, x.age)).collect())

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
[('Anjali', 24), ('Ravi', 28), ('Kavya', 22), ('Meena', 25), ('Arjun', 30)]


Module 2: RDDs & Transformations

In [4]:
feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback"
])


In [14]:
#Split each line into words (flatMap)
words = feedback.flatMap(lambda line: line.lower().split())
words.collect()

['ravi',
 'from',
 'bangalore',
 'loved',
 'the',
 'delivery',
 'meena',
 'from',
 'hyderabad',
 'had',
 'a',
 'late',
 'order',
 'ajay',
 'from',
 'pune',
 'liked',
 'the',
 'service',
 'anjali',
 'from',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'from',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [15]:
# Remove stop words (from, the, a, etc.)
stop_words = {"from", "the", "a", "an", "had"}
filtered_words = words.filter(lambda word: word not in stop_words)
filtered_words.collect()


['ravi',
 'bangalore',
 'loved',
 'delivery',
 'meena',
 'hyderabad',
 'late',
 'order',
 'ajay',
 'pune',
 'liked',
 'service',
 'anjali',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'mumbai',
 'gave',
 'positive',
 'feedback']

In [16]:
# Count each word frequency using reduceByKey .
word_counts = filtered_words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
word_counts.collect()


[('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

In [17]:
top_3_words = word_counts.takeOrdered(3, key=lambda x: -x[1])
top_3_words


[('loved', 1), ('liked', 1), ('service', 1)]

Module 3: DataFrames & Transformation (With Joins)

In [9]:
from pyspark.sql.functions import col, when

students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
df_students = spark.createDataFrame(students, ["name", "section", "marks"])
df_attendance = spark.createDataFrame(attendance, ["name", "days_present"])

In [19]:
#Join both DataFrames on name .
df_joined = df_students.join(df_attendance, "name")
df_joined.show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [25]:
#Create a new column: attendance_rate = days_present / 25 .
df_final = df_joined.withColumn("attendance_rate", col("days_present") / 25)
df_final.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [23]:
#Grade students using when :A: >90, B: 80–90, C: <80.
df_final = df_final.withColumn("grade", when(col("marks") > 90, "A")
                                .when((col("marks") <= 90) & (col("marks") >= 80), "B")
                                .otherwise("C"))
df_final.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+



In [22]:
#Filter students with good grades but poor attendance (<80%).
df_filtered = df_final.filter((col("grade").isin("A", "B")) & (col("attendance_rate") < 0.8))
df_filtered.show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           0.76|    B|
+-----+-------+-----+------------+---------------+-----+



Module 4: Ingest CSV & JSON, Save as Parquet

In [29]:
# Create CSV
with open("employees.csv", "w") as f:
    f.write("""emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
""")

In [26]:
# Create JSON
with open("employee.json", "w") as f:
    f.write("""
{
"id": 201,
"name": "Nandini",
"contact": {
    "email": "nandi@example.com",
    "city": "Hyderabad"
},
"skills": ["Python", "Spark", "SQL"]
}
""")

In [30]:
# Load CSV
df_csv = spark.read.option("header", True).csv("employees.csv", inferSchema=True)

# Load JSON
df_json = spark.read.option("multiline", True).json("employee.json")


In [31]:
# Flatten JSON
from pyspark.sql.functions import col, explode

df_flat = df_json.select(
    col("id"),
    col("name"),
    col("contact.city").alias("city"),
    explode(col("skills")).alias("skill")
)


In [32]:
# Save to Parquet partitioned by city
df_csv.write.mode("overwrite").partitionBy("city").parquet("/content/emp_csv_parquet")
df_flat.write.mode("overwrite").partitionBy("city").parquet("/content/emp_json_parquet")


Module 5: Spark SQL with Temp Views

In [33]:
df_students.createOrReplaceTempView("students_view")


In [34]:
# a) Average marks per section
spark.sql("SELECT section, AVG(marks) as avg_marks FROM students_view GROUP BY section").show()


+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+



In [35]:
# b) Top scorer per section
spark.sql("""SELECT section, name, marks FROM (SELECT *, RANK() OVER(PARTITION BY section ORDER BY marks DESC) as rank
    FROM students_view) WHERE rank = 1""").show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [38]:
# c) Count of students per grade
spark.sql("""SELECT CASE WHEN marks > 90 THEN 'A' WHEN marks BETWEEN 80 AND 90 THEN 'B'
        ELSE 'C' END as grade,COUNT(*) as count FROM students_view
        GROUP BY grade""").show()

+-----+-----+
|grade|count|
+-----+-----+
|    B|    3|
|    A|    1|
|    C|    1|
+-----+-----+



In [37]:
# d) Students above average
spark.sql("""WITH avg_table AS (SELECT AVG(marks) as avg_marks FROM students_view)
SELECT * FROM students_view, avg_table WHERE students_view.marks > avg_table.avg_marks""").show()

+-----+-------+-----+---------+
| name|section|marks|avg_marks|
+-----+-------+-----+---------+
| Amit|   10-A|   89|     84.8|
|Kavya|   10-B|   92|     84.8|
|Rohit|   10-B|   85|     84.8|
+-----+-------+-----+---------+



 Module 6: Partitioned Write & Incremental Load

In [39]:
# Full load
df_students.write.mode("overwrite").partitionBy("section").parquet("/content/output/students")

# Incremental load
df_inc = spark.createDataFrame([("Tejas", "10-A", 91)], ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("/content/output/students")

In [40]:
# List files
import os
print("Files in partitioned directory:")
print(os.listdir("/content/output/students"))

Files in partitioned directory:
['._SUCCESS.crc', 'section=10-A', 'section=10-B', '_SUCCESS', 'section=10-C']


In [41]:
df_10A = spark.read.parquet("/content/output/students/section=10-A")
df_10A.show()

+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



Module 7: ETL Pipeline – End to End

In [43]:
# Create CSV
with open("raw_emp.csv", "w") as f:
    f.write("""emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
""")
df_raw = spark.read.option("header", True).option("inferSchema", True).csv("raw_emp.csv")

In [45]:
#fill bonus
df_filled = df_raw.fillna({"bonus": 2000})
df_filled.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



In [46]:
# Add total_ctc
df_final = df_filled.withColumn("total_ctc", col("salary") + col("bonus"))
df_final.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [47]:
# Filter > 65k
df_filtered = df_final.filter(col("total_ctc") > 65000)
df_filtered.show()


+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [48]:
# Save to JSON
df_filtered.write.mode("overwrite").json("/content/etl_output/json")
# Save to Parquet partitioned by dept
df_filtered.write.mode("overwrite").partitionBy("dept").parquet("/content/etl_output/parquet")

In [53]:
from google.colab import files
files.download("employees.csv")
files.download("employee.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>