In [2]:
# Install Java & Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

# Initialize SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkOnColab").getOrCreate()
spark


In [3]:
from google.colab import files
files.upload()  # 🔍 Select the correct CSV file (probably `employees.csv`)



Saving Employee2.csv to Employee2.csv


{'Employee2.csv': b'EmployeeID,Name,Age,Department,Salary,JoiningDate\r\n101,Manav Rajput,21,Engineering,55000,2022-06-15\r\n102,Sneha Sharma,23,Marketing,47000,2021-04-10\r\n103,Rahul Yadav,25,HR,50000,2023-01-20\r\n104,Anjali Verma,22,Finance,52000,2022-11-05\r\n105,Amit Mishra,24,Engineering,58000,2020-08-12\r\n106,Nidhi Chauhan,26,Marketing,49000,2023-03-01\r\n107,Vikram Sinha,27,HR,51000,2021-07-22\r\n108,Kajal Mehta,23,Finance,53000,2022-12-18\r\n109,Rajeev Ranjan,28,Engineering,60000,2019-09-10\r\n110,Shweta Dubey,22,Marketing,46000,2023-02-14\r\n'}

In [4]:
Emp_df = spark.read.csv("Employee2.csv", header=True, inferSchema=True)
Emp_df.show()
Emp_df.printSchema()


+----------+-------------+---+-----------+------+-----------+
|EmployeeID|         Name|Age| Department|Salary|JoiningDate|
+----------+-------------+---+-----------+------+-----------+
|       101| Manav Rajput| 21|Engineering| 55000| 2022-06-15|
|       102| Sneha Sharma| 23|  Marketing| 47000| 2021-04-10|
|       103|  Rahul Yadav| 25|         HR| 50000| 2023-01-20|
|       104| Anjali Verma| 22|    Finance| 52000| 2022-11-05|
|       105|  Amit Mishra| 24|Engineering| 58000| 2020-08-12|
|       106|Nidhi Chauhan| 26|  Marketing| 49000| 2023-03-01|
|       107| Vikram Sinha| 27|         HR| 51000| 2021-07-22|
|       108|  Kajal Mehta| 23|    Finance| 53000| 2022-12-18|
|       109|Rajeev Ranjan| 28|Engineering| 60000| 2019-09-10|
|       110| Shweta Dubey| 22|  Marketing| 46000| 2023-02-14|
+----------+-------------+---+-----------+------+-----------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Depart

In [5]:
from google.colab import files
files.upload()  # 🔍 Select the correct CSV file (probably `employees.csv`)



Saving DEPT.CSV to DEPT.CSV


{'DEPT.CSV': b'DepartmentID,DepartmentName,Manager\r\n1,Engineering,Ashish Kumar\r\n2,Finance,Megha Kapoor\r\n3,HR,Deepak Rana\r\n4,Marketing,Riya Sinha\r\n5,Support,Aman Saxena\r\n'}

In [8]:
Dep_df = spark.read.csv("DEPT.CSV", header=True, inferSchema=True)
Dep_df.show()
Dep_df.printSchema()


+------------+--------------+------------+
|DepartmentID|DepartmentName|     Manager|
+------------+--------------+------------+
|           1|   Engineering|Ashish Kumar|
|           2|       Finance|Megha Kapoor|
|           3|            HR| Deepak Rana|
|           4|     Marketing|  Riya Sinha|
|           5|       Support| Aman Saxena|
+------------+--------------+------------+

root
 |-- DepartmentID: integer (nullable = true)
 |-- DepartmentName: string (nullable = true)
 |-- Manager: string (nullable = true)



In [15]:
Emp_df.join(Dep_df, Emp_df.Department == Dep_df.DepartmentName, "inner").show()


+----------+-------------+---+-----------+------+-----------+------------+--------------+------------+
|EmployeeID|         Name|Age| Department|Salary|JoiningDate|DepartmentID|DepartmentName|     Manager|
+----------+-------------+---+-----------+------+-----------+------------+--------------+------------+
|       101| Manav Rajput| 21|Engineering| 55000| 2022-06-15|           1|   Engineering|Ashish Kumar|
|       102| Sneha Sharma| 23|  Marketing| 47000| 2021-04-10|           4|     Marketing|  Riya Sinha|
|       103|  Rahul Yadav| 25|         HR| 50000| 2023-01-20|           3|            HR| Deepak Rana|
|       104| Anjali Verma| 22|    Finance| 52000| 2022-11-05|           2|       Finance|Megha Kapoor|
|       105|  Amit Mishra| 24|Engineering| 58000| 2020-08-12|           1|   Engineering|Ashish Kumar|
|       106|Nidhi Chauhan| 26|  Marketing| 49000| 2023-03-01|           4|     Marketing|  Riya Sinha|
|       107| Vikram Sinha| 27|         HR| 51000| 2021-07-22|           3

In [14]:
# Inner Join
Emp_df.join(Dep_df, Emp_df.EmployeeID == Dep_df.DepartmentID, "inner").show()

+----------+----+---+----------+------+-----------+------------+--------------+-------+
|EmployeeID|Name|Age|Department|Salary|JoiningDate|DepartmentID|DepartmentName|Manager|
+----------+----+---+----------+------+-----------+------------+--------------+-------+
+----------+----+---+----------+------+-----------+------------+--------------+-------+



In [12]:
Emp_df.join(Dep_df, Emp_df.EmployeeID == Dep_df.DepartmentID, "inner").show()


+----------+----+---+----------+------+-----------+------------+--------------+-------+
|EmployeeID|Name|Age|Department|Salary|JoiningDate|DepartmentID|DepartmentName|Manager|
+----------+----+---+----------+------+-----------+------------+--------------+-------+
+----------+----+---+----------+------+-----------+------------+--------------+-------+

