## System Environment Variables

In [3]:
import os 
import sys
# Point both the PySpark worker interpreter and the driver interpreter at the
# Python executable running this kernel, so both sides use the same Python.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

## Creating SparkSession

In [4]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) the session for this notebook,
# registered under the application name 'fillna'.
spark = (
    SparkSession.builder
    .appName('fillna')
    .getOrCreate()
)
# Last expression of the cell: displays the Spark version string.
spark.version

'3.4.2'

## Read CSV

In [5]:
# Location of the sample zipcode CSV, relative to the working directory.
file = "resources/small_zipcode.csv"

# header="true"      -> take column names from the first line of the file
# inferSchema="true" -> derive column types from the data instead of reading
#                       every column as a string
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(file)
)

In [7]:
# Print the loaded rows as a text table; the null cells shown here are the
# ones the fill examples in the following cells replace.
df.show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



In [8]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- population: integer (nullable = true)



### Replacing nulls with a number (using fillna)

In [23]:
# A numeric fill value is applied to numeric columns only: the nulls in
# `population` become 0, while the nulls in the string columns `type` and
# `city` are left untouched.
df.fillna(0).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|         0|
+---+-------+--------+-------------------+-----+----------+



### Replacing with string (using na.fill)

In [24]:
# A string fill value is applied to string columns only. `subset` narrows the
# replacement further to the listed columns, so here only nulls in `type`
# become 'y'; the nulls in `city` stay as they are.
df.na.fill('y', ['type']).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|       y|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|       y|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



### Replacing null with a specific aggregate value

In [33]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`: that form rebinds the name `max`
# and shadows Python's built-in max() in every cell that runs afterwards.
from pyspark.sql import functions as F

# The aggregation yields a single row with a single column; [0][0] extracts
# that scalar (the largest non-null population).
max_population = df.agg(F.max("population")).collect()[0][0]
print(max_population)

# Replace nulls in `population` only, using the maximum computed above.
df.na.fill(value = max_population, subset = ["population"]).show()

84000
+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|     84000|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|     84000|
+---+-------+--------+-------------------+-----+----------+



### Replacing nulls with a different value per column (chained na.fill)

In [20]:
# Each na.fill call returns a new DataFrame, so calls can be chained to give
# different columns different replacement values: nulls in `type` become
# "UNIQUE", then nulls in `city` become "UNKNOWN".
(
    df.na.fill(value="UNIQUE", subset=["type"])
      .na.fill(value="UNKNOWN", subset=["city"])
      .show()
)

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|            UNKNOWN|   PR|     30100|
|  2|    704|  UNIQUE|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|  UNIQUE|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|            UNKNOWN|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



### Replacing nulls with a different value per column using a dictionary

In [22]:
# Column name -> replacement value. A dict lets one call fill several columns,
# each with its own value (strings for `type`/`city`, a number for `population`).
rep_dict = {"type": "UNIQUE", "city": "UNKNOWN", "population": 98090}

# Columns not named in the dict are left unchanged.
df.na.fill(rep_dict).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|            UNKNOWN|   PR|     30100|
|  2|    704|  UNIQUE|PASEO COSTA DEL SUR|   PR|     98090|
|  3|    709|  UNIQUE|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|            UNKNOWN|   TX|     98090|
+---+-------+--------+-------------------+-----+----------+

