In [0]:
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Create (or reuse) the SparkSession for this notebook
spark = SparkSession.builder.getOrCreate()

# Define schema for employees dataframe.
# Every column is declared nullable; the string columns generated below
# (name, gender, department, address) can actually contain None.
schema = StructType([
    StructField("id", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
    StructField("gender", StringType(), nullable=True),
    StructField("department", StringType(), nullable=True),
    StructField("salary", IntegerType(), nullable=True),
    StructField("address", StringType(), nullable=True)
])

# Configuration for the synthetic data.
SEED = 42            # fixed seed so a fresh kernel regenerates the same rows
NUM_EMPLOYEES = 10   # number of rows to generate

# A private, seeded generator keeps the data reproducible across
# "Restart & Run All" without altering the global `random` module state.
rng = random.Random(SEED)

# Generate random data for employees dataframe.
# Each tuple lists its fields in the same order as `schema`.
data = []
for _ in range(NUM_EMPLOYEES):
    # Named `employee_id` (not `id`) so the builtin id() is not shadowed
    # for every later cell of the notebook.
    employee_id = rng.randint(1, 100)
    name = rng.choice(["John", "Jane", "Mike", "Emily", None])
    age = rng.randint(20, 60)
    gender = rng.choice(["Male", "Female", None])
    department = rng.choice(["Sales", "Marketing", "Finance", "HR", None])
    salary = rng.randint(50000, 100000)
    address = rng.choice(["New York", "Los Angeles", "Chicago", None])
    data.append((employee_id, name, age, gender, department, salary, address))

# Create employees dataframe
employees = spark.createDataFrame(data, schema)

# Show employees dataframe
employees.show()

+---+-----+---+------+----------+------+-----------+
| id| name|age|gender|department|salary|    address|
+---+-----+---+------+----------+------+-----------+
| 74|Emily| 21|  Male|      NULL| 51270|       NULL|
| 68| NULL| 56|Female|        HR| 89595|       NULL|
| 35| Mike| 44|  Male|        HR| 65925|    Chicago|
| 74|Emily| 29|  NULL| Marketing| 96364|    Chicago|
| 38|Emily| 31|Female|        HR| 76547|Los Angeles|
| 70| NULL| 33|Female|      NULL| 97048|       NULL|
| 74|Emily| 32|Female| Marketing| 56424|Los Angeles|
| 72| John| 53|  Male|        HR| 60073|       NULL|
| 76| John| 56|Female|      NULL| 67926|   New York|
| 22| Mike| 23|  NULL|      NULL| 50860|Los Angeles|
+---+-----+---+------+----------+------+-----------+



### Remove records in which any value is null

In [0]:
# Drop every row that has a null in at least one column.
# how="any" is dropna's default; it is spelled out here for clarity.
employees.dropna(how="any").show()

+---+-----+---+------+----------+------+-----------+
| id| name|age|gender|department|salary|    address|
+---+-----+---+------+----------+------+-----------+
| 35| Mike| 44|  Male|        HR| 65925|    Chicago|
| 38|Emily| 31|Female|        HR| 76547|Los Angeles|
| 74|Emily| 32|Female| Marketing| 56424|Los Angeles|
+---+-----+---+------+----------+------+-----------+



### Remove records in which any of a chosen subset of columns is null

In [0]:
# Only the listed column is inspected for nulls; nulls in any other
# column do not cause a row to be dropped.
employees.dropna(
    how="any",
    subset=["department"],
).show()

+---+-----+---+------+----------+------+-----------+
| id| name|age|gender|department|salary|    address|
+---+-----+---+------+----------+------+-----------+
| 68| NULL| 56|Female|        HR| 89595|       NULL|
| 35| Mike| 44|  Male|        HR| 65925|    Chicago|
| 74|Emily| 29|  NULL| Marketing| 96364|    Chicago|
| 38|Emily| 31|Female|        HR| 76547|Los Angeles|
| 74|Emily| 32|Female| Marketing| 56424|Los Angeles|
| 72| John| 53|  Male|        HR| 60073|       NULL|
+---+-----+---+------+----------+------+-----------+



### Define how records with nulls are dropped: `how='any'` drops a row if any selected column is null, `how='all'` only if all of them are null

In [0]:
# how="all": a row is dropped only when every column in `subset` is null,
# so rows with just one of department/gender missing are kept.
employees.dropna(
    how="all",
    subset=["department", "gender"],
).show()

+---+-----+---+------+----------+------+-----------+
| id| name|age|gender|department|salary|    address|
+---+-----+---+------+----------+------+-----------+
| 74|Emily| 21|  Male|      NULL| 51270|       NULL|
| 68| NULL| 56|Female|        HR| 89595|       NULL|
| 35| Mike| 44|  Male|        HR| 65925|    Chicago|
| 74|Emily| 29|  NULL| Marketing| 96364|    Chicago|
| 38|Emily| 31|Female|        HR| 76547|Los Angeles|
| 70| NULL| 33|Female|      NULL| 97048|       NULL|
| 74|Emily| 32|Female| Marketing| 56424|Los Angeles|
| 72| John| 53|  Male|        HR| 60073|       NULL|
| 76| John| 56|Female|      NULL| 67926|   New York|
+---+-----+---+------+----------+------+-----------+

