In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d6e017084d436be2a724435feae46e31a5646027f407a0f7b16a70b6db30297f
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

# Start a Spark session for this notebook, or attach to the one that is
# already running in this process (getOrCreate reuses an existing session).
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)


In [3]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Six sample employees. Rows 3, 4 and 5 each have one missing (None) field:
# a name, a salary and a department respectively.
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, None, 'IT', 90000),
    (4, 'Sneha', 'HR', None),
    (5, 'Rahul', None, 60000),
    (6, 'Amit', 'IT', 55000),
]

# Build the DataFrame from the rows and column names, then display it.
employee_df = spark.createDataFrame(data, schema=columns)
employee_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [4]:
# Substitute the placeholder "Unknown" for missing values in the two text
# columns, "EmployeeName" and "Department". "Salary" is not touched.
text_defaults = {"EmployeeName": "Unknown", "Department": "Unknown"}
filled_df = employee_df.na.fill(text_defaults)
filled_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Unknown|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|   Unknown| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [5]:
# Keep only the rows that have a salary: any row whose "Salary" is null is
# removed. Nulls in other columns do not cause a row to be dropped.
dropped_null_salary_df = employee_df.na.drop(subset=["Salary"])
dropped_null_salary_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [6]:
# Fill null values in "Salary" with a default of 5000.
# The key is spelled exactly like the schema column ("Salary", not "salary")
# so the fill does not rely on Spark's case-insensitive column resolution.
salary_filled_df = employee_df.fillna({"Salary": 5000})
salary_filled_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         4|       Sneha|        HR|  5000|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [7]:
# Flag nulls cell by cell: every output column is true where the original
# value is null. The result is shown directly rather than assigned, because
# show() only prints and returns None.
employee_df.select([col(c).isNull().alias(c) for c in employee_df.columns]).show()

# Count the nulls in each column. when() without otherwise() yields null for
# rows that do not match, and count() skips nulls, so only null cells are counted.
null_counts = employee_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in employee_df.columns]
)
null_counts.show()




+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|     false|       false|     false| false|
|     false|       false|     false| false|
|     false|        true|     false| false|
|     false|       false|     false|  true|
|     false|       false|      true| false|
|     false|       false|     false| false|
+----------+------------+----------+------+



In [8]:
# Replace nulls with the string "N/A". A string fill value is applied to
# string columns only, so nulls in the numeric "Salary" column stay null.
na_filled_df = employee_df.fillna("N/A")
na_filled_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|         N/A|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|       N/A| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

