PySpark Handling Missing Values:
1. Read file
2. Drop missing rows using dropna()
3. Fill missing values using fillna() + agg()
4. Fill missing values using pyspark.ml.feature.Imputer

In [1]:
from pyspark.sql import SparkSession


In [2]:
# Start (or reuse) the SparkSession that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Tuto 3 Handle Miss Value")
    .getOrCreate()
)

1. Read file

In [41]:
# Load the sample CSV: the first line supplies the column names and
# inferSchema asks Spark to derive the column types from the data.
demo_df = spark.read.csv(
    'data/test1.csv',
    header=True,
    inferSchema=True,
)

In [31]:
# Print the loaded rows as a text table to see where the NULLs are.
demo_df.show()

+---------+----+----------+-------+
|     Name| age|Experience| Salary|
+---------+----+----------+-------+
|    Krish|  31|      10.0|30000.0|
|Sudhanshu|  30|       8.0|25000.0|
|    Sunny|  29|       4.0|20000.0|
|     Paul|  24|       3.0|20000.0|
|   Harsha|  21|       1.0|15000.0|
|  Shubham|  23|       2.0|18000.0|
|   Mahesh|NULL|      NULL|40000.0|
|     NULL|  34|      10.0|38000.0|
|     NULL|  36|      NULL|   NULL|
+---------+----+----------+-------+



2. Drop rows having null values

dropna(how='any' | 'all',
    thresh=int, subset=str/tuple/list)

In [32]:
# Alias the loaded frame for the drop examples (plain assignment, no copy is made).
dropna_df = demo_df
# With no arguments, every row containing at least one null is removed.
# DataFrame.dropna() is the equivalent spelling of DataFrame.na.drop().
dropna_df.dropna().show()

+---------+---+----------+-------+
|     Name|age|Experience| Salary|
+---------+---+----------+-------+
|    Krish| 31|      10.0|30000.0|
|Sudhanshu| 30|       8.0|25000.0|
|    Sunny| 29|       4.0|20000.0|
|     Paul| 24|       3.0|20000.0|
|   Harsha| 21|       1.0|15000.0|
|  Shubham| 23|       2.0|18000.0|
+---------+---+----------+-------+



In [36]:
# thresh=x keeps rows that have at least x non-null values (rows with fewer are dropped).
# `how` is deliberately omitted: when `thresh` is given it overrides `how`,
# so passing how='any' alongside it has no effect and only misleads the reader.
dropna_df.na.drop(thresh=2).show()

+---------+----+----------+-------+
|     Name| age|Experience| Salary|
+---------+----+----------+-------+
|    Krish|  31|      10.0|30000.0|
|Sudhanshu|  30|       8.0|25000.0|
|    Sunny|  29|       4.0|20000.0|
|     Paul|  24|       3.0|20000.0|
|   Harsha|  21|       1.0|15000.0|
|  Shubham|  23|       2.0|18000.0|
|   Mahesh|NULL|      NULL|40000.0|
|     NULL|  34|      10.0|38000.0|
+---------+----+----------+-------+



3. Fill missing values
- using fillna() with agg()
- using pyspark.ml.feature.Imputer

DataFrame.fillna(value=int/float/str/bool/dict, subset=str/tuple/list)

In [42]:
# Alias demo_df for the fill examples; plain assignment, so both names refer to the same DataFrame.
fill_df = demo_df

In [None]:
# Replace nulls in the 'age' column only with the constant 18.
fill_df.fillna(18, subset=['age']).show()

pyspark.sql.functions includes aggregate functions such as min/max/median/mean/last...

In [45]:
from pyspark.sql import functions as pyf

In [76]:
# Aggregate the whole frame down to a single row holding the minimum of Age,
# then pull that one value back to the driver as a plain Python scalar.
age_agg_rows = demo_df.agg(pyf.min(demo_df['Age'])).collect()
min_age = age_agg_rows[0][0]

In [81]:
# Placeholder for missing names: the first non-null value found in the Name column.
# ignorenulls=True matters here: without it first() returns NULL whenever the
# first row it sees has no name, and a NULL placeholder is useless as a fill value.
# NOTE: first() depends on row order, which Spark does not guarantee after a shuffle.
first_name = demo_df\
    .agg(pyf.first(demo_df['Name'], ignorenulls=True))\
    .collect()[0][0]

In [None]:
# Fill only the Age column with the minimum age.
# Without `subset`, a numeric fill value is applied to every numeric column,
# so missing Experience and Salary cells would also be replaced by the minimum age.
fill_df.fillna(value=min_age, subset=['Age']).show()

In [None]:
# Per-column replacements: each key is a column name, each value is what
# that column's nulls are replaced with.
fill_df.fillna(
    value={
        'Age': min_age,
        'Name': first_name,
        'Experience': 5.0,
    }
).show()

4. pyspark.ml.feature.Imputer(
- strategy='mean/median/mode', missingValue: float,
- inputCols: [list/cols], outputCols: [list/cols],
- inputCol: 'col', outputCol: 'col'
- relativeError: float = 0.001 )

In [86]:
from pyspark.ml.feature import Imputer

In [88]:
# Configure a mean imputer over the numeric columns.
# For every input column the imputed values are written to a new
# '<column>_imputed' column; the original column is left as it is.

inputCols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    strategy='mean',
    inputCols=inputCols,
    outputCols=[f"{name}_imputed" for name in inputCols],
)

In [90]:
# fit() computes the mean of each input column and returns a fitted model.
# transform() then appends the outputCols, with every null replaced by
# the mean of its column.
imputer_model = imputer.fit(demo_df)
imputed_df = imputer_model.transform(demo_df)
imputed_df.show()

+---------+----+----------+-------+-----------+------------------+--------------+
|     Name| Age|Experience| Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+-------+-----------+------------------+--------------+
|    Krish|  31|      10.0|30000.0|         31|              10.0|       30000.0|
|Sudhanshu|  30|       8.0|25000.0|         30|               8.0|       25000.0|
|    Sunny|  29|       4.0|20000.0|         29|               4.0|       20000.0|
|     Paul|  24|       3.0|20000.0|         24|               3.0|       20000.0|
|   Harsha|  21|       1.0|15000.0|         21|               1.0|       15000.0|
|  Shubham|  23|       2.0|18000.0|         23|               2.0|       18000.0|
|   Mahesh|NULL|      NULL|40000.0|         28| 5.428571428571429|       40000.0|
|     NULL|  34|      10.0|38000.0|         34|              10.0|       38000.0|
|     NULL|  36|      NULL|   NULL|         36| 5.428571428571429|       25750.0|
+---------+----+----------+-------+-----------+------------------+--------------+

In [92]:
# The imputer above was only given numeric columns, so the missing names
# are filled separately with the placeholder name.
imputed_df = imputed_df.fillna(value={'Name': first_name})


In [95]:
# Keep only Name and the *_imputed columns by dropping the original inputs.
original_cols = ('Age', 'Experience', 'Salary')
imputed_df.drop(*original_cols).show()

+---------+-----------+------------------+--------------+
|     Name|Age_imputed|Experience_imputed|Salary_imputed|
+---------+-----------+------------------+--------------+
|    Krish|         31|              10.0|       30000.0|
|Sudhanshu|         30|               8.0|       25000.0|
|    Sunny|         29|               4.0|       20000.0|
|     Paul|         24|               3.0|       20000.0|
|   Harsha|         21|               1.0|       15000.0|
|  Shubham|         23|               2.0|       18000.0|
|   Mahesh|         28| 5.428571428571429|       40000.0|
|    Krish|         34|              10.0|       38000.0|
|    Krish|         36| 5.428571428571429|       25750.0|
+---------+-----------+------------------+--------------+

