In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=7d2e15394cdb4bd34cee229a71aaf91e8882a056d33fc00bab682fd495359b3e
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Obtain the Spark session used by every later cell. getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Employee Data Analysis")
    .getOrCreate()
)
# Sample employee records, one tuple per employee.
# Field order: (EmployeeID, EmployeeName, Department, Salary).
data = [
    (1, "Arjun", "IT", 75000),
    (2, "Vijay", "Finance", 85000),
    (3, "Shalini", "IT", 90000),
    (4, "Sneha", "HR", 50000),
    (5, "Rahul", "Finance", 60000),
    (6, "Amit", "IT", 55000),
]


In [None]:
# Column names for the employee DataFrame, in the same order as the fields
# of each record tuple.
columns = [
    "EmployeeID",
    "EmployeeName",
    "Department",
    "Salary",
]


In [None]:
# Build the employee DataFrame from the record tuples, taking the field names
# from `columns`; column types are left for Spark to infer.
employee_df = spark.createDataFrame(data, schema=columns)

# Print the rows as a text table.
employee_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [None]:
# Keep only employees whose Salary is strictly above 60,000
# (an employee earning exactly 60,000 is excluded).
high_salary_df = employee_df.where(employee_df["Salary"] > 60000)
high_salary_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
+----------+------------+----------+------+



In [None]:
# Mean salary per department: one output row per distinct Department, with
# the aggregate exposed as the column `avg(Salary)`.
avg_salary_df = employee_df.groupBy("Department").agg({"Salary": "avg"})
avg_salary_df.show()


+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+



In [None]:
# Order employees from highest to lowest salary.
sorted_df = employee_df.orderBy("Salary", ascending=False)
sorted_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|        HR| 50000|
+----------+------------+----------+------+



In [None]:
# Order employees from highest to lowest salary.
# NOTE(review): this cell recomputes the same `sorted_df` as the preceding
# sort cell; it looks like an accidental copy and is a candidate for removal.
sorted_df = employee_df.sort(col("Salary").desc())
sorted_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|        HR| 50000|
+----------+------------+----------+------+



Sample data with null values

In [None]:
# Employee records containing missing values: EmployeeName is None for ID 3,
# Salary is None for ID 4 and Department is None for ID 5.
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, None, 'IT', 90000),
    (4, 'Sneha', 'HR', None),
    (5, 'Rahul', None, 60000),
    (6, 'Amit', 'IT', 55000),
]

# Define schema (columns): one name per tuple field. The comma after
# 'EmployeeName' matters -- without it Python joins the two adjacent string
# literals into a single 'EmployeeNameDepartment' entry.
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Create DataFrame. `employee_df` is deliberately rebound to the null-containing
# data so that the null-handling steps below (and in later cells) operate on
# it rather than on the earlier, complete DataFrame.
employee_df = spark.createDataFrame(data, columns)

# Show the DataFrame (nulls included)
employee_df.show()

# Fill null values in 'EmployeeName' and 'Department' with 'Unknown'
filled_df = employee_df.fillna({'EmployeeName': 'Unknown', 'Department': 'Unknown'})
filled_df.show()

# Drop rows where 'Salary' is null
dropped_null_salary_df = employee_df.dropna(subset=['Salary'])
dropped_null_salary_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+-----

In [None]:
# Fill null values in 'Salary' with 50000; non-null salaries are left as they
# are. (The replacement value previously read 5000, contradicting this
# stated intent.)
salary_filled_df = employee_df.fillna({'Salary': 50000})
salary_filled_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

