# Pyspark Dataframe Part 2 : Handling Missing Values
We will look into 
- dropping columns
- dropping rows 
- the various parameters of the drop functions (`how`, `thresh`, `subset`)
- filling missing values with the mean, median or mode


#### Set up the Spark session

In [1]:
import pyspark
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession
# Create the Spark session for this notebook under the application name 'test01'.
spark = (
    SparkSession.builder
    .appName('test01')
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

#### Load the dataset

In [63]:
# Load customer.csv into a DataFrame.
# header=True      -> the first line of the file holds the column names
# inferSchema=True -> column types are inferred from the data
# Note the NULL entries in the table below: they are the missing values
# that the rest of this notebook deals with.

csv_reader = spark.read.options(header = True, inferSchema = True)
df_pyspark = csv_reader.csv('customer.csv')
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Zikri|  24|         1|  4500|
|Zakhwan|  25|         1|  4700|
|   Amir|  27|         2|  5000|
|  Ammar|  30|         6|  8500|
|  Haziq|  24|         2|  4700|
|  Irfan|  25|         1|  6000|
|  Fahmi|  27|         5|  7000|
|  Majid|  27|      NULL|  5500|
|   NULL|  25|         2|  5000|
|    Ali|  25|         2|  NULL|
|    Abu|NULL|      NULL|  NULL|
+-------+----+----------+------+



#### Delete column

In [64]:
# Drop the 'Name' column.
# The result is shown from a separate variable; df_pyspark is not reassigned.

df_without_name = df_pyspark.drop('Name')
df_without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  24|         1|  4500|
|  25|         1|  4700|
|  27|         2|  5000|
|  30|         6|  8500|
|  24|         2|  4700|
|  25|         1|  6000|
|  27|         5|  7000|
|  27|      NULL|  5500|
|  25|         2|  5000|
|  25|         2|  NULL|
|NULL|      NULL|  NULL|
+----+----------+------+



#### Delete row

In [65]:
# Drop rows that contain missing data.
# Called without arguments, na.drop() removes every row that has at least
# one NULL value, so only the fully populated rows remain.

rows_without_nulls = df_pyspark.na.drop()
rows_without_nulls.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Zikri| 24|         1|  4500|
|Zakhwan| 25|         1|  4700|
|   Amir| 27|         2|  5000|
|  Ammar| 30|         6|  8500|
|  Haziq| 24|         2|  4700|
|  Irfan| 25|         1|  6000|
|  Fahmi| 27|         5|  7000|
+-------+---+----------+------+



In [66]:
# Drop NULL rows - the `how` parameter
# how='all': a row is removed only when every one of its columns is NULL.
# No row in this dataset is entirely NULL (even 'Abu' still has a Name),
# so the output below is the full table.

rows_not_all_null = df_pyspark.na.drop(how = 'all')
rows_not_all_null.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Zikri|  24|         1|  4500|
|Zakhwan|  25|         1|  4700|
|   Amir|  27|         2|  5000|
|  Ammar|  30|         6|  8500|
|  Haziq|  24|         2|  4700|
|  Irfan|  25|         1|  6000|
|  Fahmi|  27|         5|  7000|
|  Majid|  27|      NULL|  5500|
|   NULL|  25|         2|  5000|
|    Ali|  25|         2|  NULL|
|    Abu|NULL|      NULL|  NULL|
+-------+----+----------+------+



In [67]:
# Drop NULL rows - the `how` parameter
# how='any': a row is removed as soon as any one of its columns is NULL.

rows_no_null_at_all = df_pyspark.na.drop(how = 'any')
rows_no_null_at_all.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Zikri| 24|         1|  4500|
|Zakhwan| 25|         1|  4700|
|   Amir| 27|         2|  5000|
|  Ammar| 30|         6|  8500|
|  Haziq| 24|         2|  4700|
|  Irfan| 25|         1|  6000|
|  Fahmi| 27|         5|  7000|
+-------+---+----------+------+



In [68]:
# Drop NULL rows - the `thresh` parameter
# thresh=2 keeps only the rows that have at least two non-NULL values;
# rows with fewer than two non-NULL values are dropped.
# When `thresh` is given it takes precedence over `how`, so `how` is not
# passed here (it would be ignored).
# In this 4-column table only 'Abu' (a single non-NULL value) is removed.

df_pyspark.na.drop(thresh = 2).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Zikri| 24|         1|  4500|
|Zakhwan| 25|         1|  4700|
|   Amir| 27|         2|  5000|
|  Ammar| 30|         6|  8500|
|  Haziq| 24|         2|  4700|
|  Irfan| 25|         1|  6000|
|  Fahmi| 27|         5|  7000|
|  Majid| 27|      NULL|  5500|
|   NULL| 25|         2|  5000|
|    Ali| 25|         2|  NULL|
+-------+---+----------+------+



In [69]:
# Drop NULL rows - the `subset` parameter
# Only the listed columns are checked for NULL: a row is removed when
# 'Experience' is NULL, while NULLs in other columns (Name, Salary) are kept.

required_cols = ['Experience']
df_pyspark.na.drop(how = 'any', subset = required_cols).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Zikri| 24|         1|  4500|
|Zakhwan| 25|         1|  4700|
|   Amir| 27|         2|  5000|
|  Ammar| 30|         6|  8500|
|  Haziq| 24|         2|  4700|
|  Irfan| 25|         1|  6000|
|  Fahmi| 27|         5|  7000|
|   NULL| 25|         2|  5000|
|    Ali| 25|         2|  NULL|
+-------+---+----------+------+



#### Fill missing data

In [70]:
# Fill missing values with a string.
# As the output shows, a string fill value only replaces the NULL in the
# string column (Name); NULLs in the numeric columns are left as they are.

name_filled = df_pyspark.na.fill('Missing Values')
name_filled.show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Zikri|  24|         1|  4500|
|       Zakhwan|  25|         1|  4700|
|          Amir|  27|         2|  5000|
|         Ammar|  30|         6|  8500|
|         Haziq|  24|         2|  4700|
|         Irfan|  25|         1|  6000|
|         Fahmi|  27|         5|  7000|
|         Majid|  27|      NULL|  5500|
|Missing Values|  25|         2|  5000|
|           Ali|  25|         2|  NULL|
|           Abu|NULL|      NULL|  NULL|
+--------------+----+----------+------+



In [71]:
# Fill missing values with an integer.
# As the output shows, a numeric fill value replaces the NULLs in the numeric
# columns (Age, Experience, Salary); the NULL in the Name column is untouched.

numeric_filled = df_pyspark.na.fill(0)
numeric_filled.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Zikri| 24|         1|  4500|
|Zakhwan| 25|         1|  4700|
|   Amir| 27|         2|  5000|
|  Ammar| 30|         6|  8500|
|  Haziq| 24|         2|  4700|
|  Irfan| 25|         1|  6000|
|  Fahmi| 27|         5|  7000|
|  Majid| 27|         0|  5500|
|   NULL| 25|         2|  5000|
|    Ali| 25|         2|     0|
|    Abu|  0|         0|     0|
+-------+---+----------+------+



In [72]:
# Build an Imputer (from pyspark.ml.feature, imported in the first cell) that
# fills NULLs in the numeric columns using the "mean" strategy.
# Every input column gets a companion '<column>_imputed' output column, so the
# original columns stay visible next to the imputed ones.

# Columns to impute, listed once and reused for both inputCols and outputCols.
impute_cols = ['Age', 'Experience', 'Salary']

imputer = Imputer(
    inputCols = impute_cols,
    outputCols = ["{}_imputed".format(c) for c in impute_cols]
).setStrategy("mean")

In [73]:
# Fit the imputer on the data, then apply the fitted model.
# The NULL values are replaced in the *_imputed columns according to the
# imputer's strategy; the original columns are shown alongside, unchanged.

imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Zikri|  24|         1|  4500|         24|                 1|          4500|
|Zakhwan|  25|         1|  4700|         25|                 1|          4700|
|   Amir|  27|         2|  5000|         27|                 2|          5000|
|  Ammar|  30|         6|  8500|         30|                 6|          8500|
|  Haziq|  24|         2|  4700|         24|                 2|          4700|
|  Irfan|  25|         1|  6000|         25|                 1|          6000|
|  Fahmi|  27|         5|  7000|         27|                 5|          7000|
|  Majid|  27|      NULL|  5500|         27|                 2|          5500|
|   NULL|  25|         2|  5000|         25|                 2|          5000|
|    Ali|  25|         2|  NULL|         25|        

In [74]:
# Same imputation as the previous cells, but with the "median" strategy.
# Imputer comes from pyspark.ml.feature and is imported in the first cell,
# so it is not imported again here.
# NOTE: this rebinds `imputer`; re-running the earlier fit/transform cell after
# this one would use the median strategy instead of the mean.

# Columns to impute, listed once and reused for both inputCols and outputCols.
impute_cols = ['Age', 'Experience', 'Salary']

imputer = Imputer(
    inputCols = impute_cols,
    outputCols = ["{}_imputed".format(c) for c in impute_cols]
).setStrategy("median")

# Fit on the data and add the *_imputed columns, in which every NULL is
# replaced by the median of its column.

imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Zikri|  24|         1|  4500|         24|                 1|          4500|
|Zakhwan|  25|         1|  4700|         25|                 1|          4700|
|   Amir|  27|         2|  5000|         27|                 2|          5000|
|  Ammar|  30|         6|  8500|         30|                 6|          8500|
|  Haziq|  24|         2|  4700|         24|                 2|          4700|
|  Irfan|  25|         1|  6000|         25|                 1|          6000|
|  Fahmi|  27|         5|  7000|         27|                 5|          7000|
|  Majid|  27|      NULL|  5500|         27|                 2|          5500|
|   NULL|  25|         2|  5000|         25|                 2|          5000|
|    Ali|  25|         2|  NULL|         25|        