In [1]:
# Install Java & Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

# Initialize SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkOnColab").getOrCreate()
spark


In [2]:
from google.colab import files
files.upload()  # 🔍 Select the correct CSV file (probably `employees.csv`)


Saving fill.csv to fill.csv


{'fill.csv': b'EmployeeID,Name,Age,Department,Salary,JoiningDate\r\n101,Manav Rajput,21,Engineering,55000,2022-06-15\r\n102,Sneha Sharma,,Marketing,47000,2021-04-10\r\n103,,25,HR,50000,2023-01-20\r\n104,Anjali Verma,22,,52000,2022-11-05\r\n105,Amit Mishra,24,Engineering,,2020-08-12\r\n106,Nidhi Chauhan,26,Marketing,49000,\r\n107,Vikram Sinha,,HR,51000,2021-07-22\r\n108,Kajal Mehta,23,Finance,53000,2022-12-18\r\n109,,28,Engineering,60000,2019-09-10\r\n110,Shweta Dubey,22,Marketing,46000,2023-02-14\r\n'}

In [3]:
df = spark.read.csv("fill.csv", header=True, inferSchema=True)
df.show()
df.printSchema()


+----------+-------------+----+-----------+------+-----------+
|EmployeeID|         Name| Age| Department|Salary|JoiningDate|
+----------+-------------+----+-----------+------+-----------+
|       101| Manav Rajput|  21|Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma|null|  Marketing| 47000| 2021-04-10|
|       103|         null|  25|         HR| 50000| 2023-01-20|
|       104| Anjali Verma|  22|       null| 52000| 2022-11-05|
|       105|  Amit Mishra|  24|Engineering|  null| 2020-08-12|
|       106|Nidhi Chauhan|  26|  Marketing| 49000|       null|
|       107| Vikram Sinha|null|         HR| 51000| 2021-07-22|
|       108|  Kajal Mehta|  23|    Finance| 53000| 2022-12-18|
|       109|         null|  28|Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey|  22|  Marketing| 46000| 2023-02-14|
+----------+-------------+----+-----------+------+-----------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = tru

In [4]:
df.na.fill("Missing Values", ["department", "salary"]).show()

+----------+-------------+----+--------------+------+-----------+
|EmployeeID|         Name| Age|    Department|Salary|JoiningDate|
+----------+-------------+----+--------------+------+-----------+
|       101| Manav Rajput|  21|   Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma|null|     Marketing| 47000| 2021-04-10|
|       103|         null|  25|            HR| 50000| 2023-01-20|
|       104| Anjali Verma|  22|Missing Values| 52000| 2022-11-05|
|       105|  Amit Mishra|  24|   Engineering|  null| 2020-08-12|
|       106|Nidhi Chauhan|  26|     Marketing| 49000|       null|
|       107| Vikram Sinha|null|            HR| 51000| 2021-07-22|
|       108|  Kajal Mehta|  23|       Finance| 53000| 2022-12-18|
|       109|         null|  28|   Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey|  22|     Marketing| 46000| 2023-02-14|
+----------+-------------+----+--------------+------+-----------+



In [5]:
df.na.fill("Unknown").show()

+----------+-------------+----+-----------+------+-----------+
|EmployeeID|         Name| Age| Department|Salary|JoiningDate|
+----------+-------------+----+-----------+------+-----------+
|       101| Manav Rajput|  21|Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma|null|  Marketing| 47000| 2021-04-10|
|       103|      Unknown|  25|         HR| 50000| 2023-01-20|
|       104| Anjali Verma|  22|    Unknown| 52000| 2022-11-05|
|       105|  Amit Mishra|  24|Engineering|  null| 2020-08-12|
|       106|Nidhi Chauhan|  26|  Marketing| 49000|       null|
|       107| Vikram Sinha|null|         HR| 51000| 2021-07-22|
|       108|  Kajal Mehta|  23|    Finance| 53000| 2022-12-18|
|       109|      Unknown|  28|Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey|  22|  Marketing| 46000| 2023-02-14|
+----------+-------------+----+-----------+------+-----------+



In [6]:
df.na.fill({
    "Name": "Unknown",
    "Department": "Unknown",
    "Age": 0,
    "Salary": 0,
    "JoiningDate": "1900-01-01"
}).show()


+----------+-------------+---+-----------+------+-----------+
|EmployeeID|         Name|Age| Department|Salary|JoiningDate|
+----------+-------------+---+-----------+------+-----------+
|       101| Manav Rajput| 21|Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma|  0|  Marketing| 47000| 2021-04-10|
|       103|      Unknown| 25|         HR| 50000| 2023-01-20|
|       104| Anjali Verma| 22|    Unknown| 52000| 2022-11-05|
|       105|  Amit Mishra| 24|Engineering|     0| 2020-08-12|
|       106|Nidhi Chauhan| 26|  Marketing| 49000| 1900-01-01|
|       107| Vikram Sinha|  0|         HR| 51000| 2021-07-22|
|       108|  Kajal Mehta| 23|    Finance| 53000| 2022-12-18|
|       109|      Unknown| 28|Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey| 22|  Marketing| 46000| 2023-02-14|
+----------+-------------+---+-----------+------+-----------+



In [7]:
df.na.fill("", ['Department']).show()

+----------+-------------+----+-----------+------+-----------+
|EmployeeID|         Name| Age| Department|Salary|JoiningDate|
+----------+-------------+----+-----------+------+-----------+
|       101| Manav Rajput|  21|Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma|null|  Marketing| 47000| 2021-04-10|
|       103|         null|  25|         HR| 50000| 2023-01-20|
|       104| Anjali Verma|  22|           | 52000| 2022-11-05|
|       105|  Amit Mishra|  24|Engineering|  null| 2020-08-12|
|       106|Nidhi Chauhan|  26|  Marketing| 49000|       null|
|       107| Vikram Sinha|null|         HR| 51000| 2021-07-22|
|       108|  Kajal Mehta|  23|    Finance| 53000| 2022-12-18|
|       109|         null|  28|Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey|  22|  Marketing| 46000| 2023-02-14|
+----------+-------------+----+-----------+------+-----------+

