# Null Handling in PySpark

In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnull, coalesce, lit

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-active session if one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("NullHandling")
    .getOrCreate()
)

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 3, Finished, Available, Finished)

## Sample Sales Data with Null Values

In [2]:
# Column names for the sample sales records, in tuple order.
columns = ["Name", "Region", "UnitsSold", "Revenue"]

# Six sales records; None marks a missing value. The rows cover a null in each
# individual column, nulls in two columns at once, and a completely empty row.
data = [
    ("John", "North", 100, None),   # Revenue missing
    ("Doe", "East", None, 50),      # UnitsSold missing
    (None, "West", 150, 30),        # Name missing
    ("Alice", None, 200, 40),       # Region missing
    ("Bob", "South", None, None),   # UnitsSold and Revenue missing
    (None, None, None, None),       # every field missing
]

# Build the DataFrame from the records and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 4, Finished, Available, Finished)

+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
| NULL|  West|      150|     30|
|Alice|  NULL|      200|     40|
|  Bob| South|     NULL|   NULL|
| NULL|  NULL|     NULL|   NULL|
+-----+------+---------+-------+



## 1. Detecting Null Values

In [3]:
# Show every original column, followed by one boolean "<column>_is_null" flag
# per column that is true wherever that column's value is null.
df.select(
    *[col(name) for name in df.columns],
    *[col(name).isNull().alias(f"{name}_is_null") for name in df.columns]
).show()

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 5, Finished, Available, Finished)

+-----+------+---------+-------+------------+--------------+-----------------+---------------+
| Name|Region|UnitsSold|Revenue|Name_is_null|Region_is_null|UnitsSold_is_null|Revenue_is_null|
+-----+------+---------+-------+------------+--------------+-----------------+---------------+
| John| North|      100|   NULL|       false|         false|            false|           true|
|  Doe|  East|     NULL|     50|       false|         false|             true|          false|
| NULL|  West|      150|     30|        true|         false|            false|          false|
|Alice|  NULL|      200|     40|       false|          true|            false|          false|
|  Bob| South|     NULL|   NULL|       false|         false|             true|           true|
| NULL|  NULL|     NULL|   NULL|        true|          true|             true|           true|
+-----+------+---------+-------+------------+--------------+-----------------+---------------+



## 2. Dropping Rows with Null Values

In [4]:
# how="any" (the default): discard a row as soon as any one column is null
df_drop_any = df.na.drop(how="any")
df_drop_any.show()

# how="all": discard a row only when every column in it is null
df_drop_all = df.na.drop(how="all")
df_drop_all.show()

# subset=[...]: only nulls in the listed columns cause a row to be discarded
df_drop_subset = df.na.drop(subset=["Name", "UnitsSold"])
df_drop_subset.show()

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 6, Finished, Available, Finished)

+----+------+---------+-------+
|Name|Region|UnitsSold|Revenue|
+----+------+---------+-------+
+----+------+---------+-------+

+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|  Doe|  East|     NULL|     50|
| NULL|  West|      150|     30|
|Alice|  NULL|      200|     40|
|  Bob| South|     NULL|   NULL|
+-----+------+---------+-------+

+-----+------+---------+-------+
| Name|Region|UnitsSold|Revenue|
+-----+------+---------+-------+
| John| North|      100|   NULL|
|Alice|  NULL|      200|     40|
+-----+------+---------+-------+



## 3. Filling Null Values

In [5]:
# Per-column replacement values; a column not listed here (Name) keeps its nulls.
fill_values = {"Region": "Unknown", "UnitsSold": 0, "Revenue": 0}
df_fill = df.na.fill(fill_values)
df_fill.show()

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 7, Finished, Available, Finished)

+-----+-------+---------+-------+
| Name| Region|UnitsSold|Revenue|
+-----+-------+---------+-------+
| John|  North|      100|      0|
|  Doe|   East|        0|     50|
| NULL|   West|      150|     30|
|Alice|Unknown|      200|     40|
|  Bob|  South|        0|      0|
| NULL|Unknown|        0|      0|
+-----+-------+---------+-------+



## 4. Using Coalesce to Replace Nulls (e.g., Before Aggregations)

In [6]:
# Pass Name and Region through unchanged. For each numeric column, coalesce()
# yields the first non-null argument, so a null value falls back to the literal 0;
# the result is exposed under "<column>_Filled".
df_coalesce = df.select(
    "Name",
    "Region",
    *[
        coalesce(df[name], lit(0)).alias(f"{name}_Filled")
        for name in ("UnitsSold", "Revenue")
    ]
)
df_coalesce.show()

StatementMeta(, c1e48d9e-8391-4063-a5f4-93cc537fa3ac, 8, Finished, Available, Finished)

+-----+------+----------------+--------------+
| Name|Region|UnitsSold_Filled|Revenue_Filled|
+-----+------+----------------+--------------+
| John| North|             100|             0|
|  Doe|  East|               0|            50|
| NULL|  West|             150|            30|
|Alice|  NULL|             200|            40|
|  Bob| South|               0|             0|
| NULL|  NULL|               0|             0|
+-----+------+----------------+--------------+



### Summary of Null Handling in PySpark
1. **Detecting Nulls**: Use `isNull()` to identify missing values.
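2. **Dropping Nulls**: Use `dropna()` to remove rows containing nulls; `how='any'` (the default) drops a row with at least one null, `how='all'` drops only fully-null rows, and `subset=[...]` restricts the check to specific columns.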
3. **Filling Nulls**: Use `fillna()` to replace nulls with default values.
4. **Coalesce Function**: Use `coalesce()` to provide fallback values in case of nulls.
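5. **Handling Aggregations**: Spark aggregates such as `sum()` and `avg()` skip nulls; wrap a column in `coalesce()` before aggregating when nulls should instead count as a default value (e.g., `0`).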