In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=91bf034e9d2335ae5d62742b71ea90d15526ce69921924d5b30a00b44f987cc4
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark entry point for this notebook (reuses an active session if
# one already exists, otherwise starts a new one).
# NOTE(review): the app name mentions product sales, but the cells below work
# on employee salary data -- confirm whether the name should be updated.
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)


In [4]:
# Column names, listed in the same order as the fields of each record below.
columns = ["EmployeeID", "EmployeeName", "Department", "Salary"]

# One tuple per employee: (id, name, department, salary).
data = [
    (1, "Arjun", "IT", 75000), (2, "Vijay", "Finance", 85000),
    (3, "Shalini", "IT", 90000), (4, "Sneha", "HR", 50000),
    (5, "Rahul", "Finance", 60000), (6, "Amit", "IT", 55000),
]

# Build the DataFrame from the in-memory records and print it for inspection.
employee_df = spark.createDataFrame(data, schema=columns)
employee_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



Task 1: Filter Employees by Salary

In [5]:
# Keep only the rows whose Salary is strictly greater than 60000.
filtered_employees = employee_df.filter(
    employee_df["Salary"] > 60000
)
filtered_employees.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
+----------+------------+----------+------+



Task 2: Calculate the Average Salary by Department

In [7]:
from pyspark.sql.functions import avg

# Mean salary per department.
# Group on the column name exactly as declared in the schema ("Department"):
# the lowercase spelling only resolves while Spark's analyzer is
# case-insensitive, and it also leaks into the result as a "department" header
# that no longer matches the source DataFrame.
avg_salary = (
    employee_df
    .groupBy("Department")
    .agg(avg("Salary").alias("Avg_Salary"))
)
avg_salary.show()


+----------+-----------------+
|department|       Avg_Salary|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+



Task 3: Sort Employees by Salary

In [8]:

# Order the employees from highest to lowest Salary and print the result.
sorted_employees = employee_df.orderBy("Salary", ascending=False)
sorted_employees.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|        HR| 50000|
+----------+------------+----------+------+



Task 4: Add a Bonus Column

In [9]:
# Append a "Bonus" column equal to 10% of each employee's Salary.
employee_with_bonus = employee_df.withColumn(
    "Bonus",
    0.10 * col("Salary"),
)
employee_with_bonus.show()

+----------+------------+----------+------+------+
|EmployeeID|EmployeeName|Department|Salary| Bonus|
+----------+------------+----------+------+------+
|         1|       Arjun|        IT| 75000|7500.0|
|         2|       Vijay|   Finance| 85000|8500.0|
|         3|     Shalini|        IT| 90000|9000.0|
|         4|       Sneha|        HR| 50000|5000.0|
|         5|       Rahul|   Finance| 60000|6000.0|
|         6|        Amit|        IT| 55000|5500.0|
+----------+------------+----------+------+------+

