In [1]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Handling-Missing-Values').getOrCreate()
spark

In [2]:
df_pyspark = spark.read.csv('test2.csv',inferSchema=True,header=True)

In [3]:
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| Vaishnav|  24|        10| 50000|
|  Vaisakh|  13|         8| 40000|
|    Deepa|  46|         6| 30000|
|Sreekumar|  54|         4| 20000|
|   Kannan|null|      null| 10000|
|     null|  25|         2| 15000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [4]:
#Drop rows which contain NA/null value in any of its data

df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| Vaishnav| 24|        10| 50000|
|  Vaisakh| 13|         8| 40000|
|    Deepa| 46|         6| 30000|
|Sreekumar| 54|         4| 20000|
+---------+---+----------+------+



In [5]:
#There are different arguments that can be passed onto drop function.
#how() -> Default option is any; ie, if a row contains atleast one NA or null value, it will remove that particular row.
#         Other option is all; ie , if a row has all its value as NA or null value, it will remove that particular row.

df_pyspark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| Vaishnav| 24|        10| 50000|
|  Vaisakh| 13|         8| 40000|
|    Deepa| 46|         6| 30000|
|Sreekumar| 54|         4| 20000|
+---------+---+----------+------+



In [6]:
#Second argument that can be passed to drop() is thresh
#This checks for how many non-null values should be present. ie thresh=2 means the row should contain atleast 2 non-null values

df_pyspark.na.drop(how='any',thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| Vaishnav|  24|        10| 50000|
|  Vaisakh|  13|         8| 40000|
|    Deepa|  46|         6| 30000|
|Sreekumar|  54|         4| 20000|
|   Kannan|null|      null| 10000|
|     null|  25|         2| 15000|
+---------+----+----------+------+



In [7]:
#Third argument that can be passed to drop() is subset
#This is kind of a filter. for eg: subset='column_name' means it will remove all rows which contains null value in the column_name column

df_pyspark.na.drop(how='any',subset='Experience').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| Vaishnav| 24|        10| 50000|
|  Vaisakh| 13|         8| 40000|
|    Deepa| 46|         6| 30000|
|Sreekumar| 54|         4| 20000|
|     null| 25|         2| 15000|
+---------+---+----------+------+



In [9]:
#Filling missing values

df_pyspark.na.fill(value='Missing',subset='Experience').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| Vaishnav|  24|        10| 50000|
|  Vaisakh|  13|         8| 40000|
|    Deepa|  46|         6| 30000|
|Sreekumar|  54|         4| 20000|
|   Kannan|null|      null| 10000|
|     null|  25|         2| 15000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [10]:
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| Vaishnav|  24|        10| 50000|
|  Vaisakh|  13|         8| 40000|
|    Deepa|  46|         6| 30000|
|Sreekumar|  54|         4| 20000|
|   Kannan|null|      null| 10000|
|     null|  25|         2| 15000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [17]:
#Filling null value with mean, median or mode.
#Here we will be using imputer functions

from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=['Age','Experience','Salary'],
                  outputCols= ['Imputed_{}'.format(c) for c in ['Age','Experience','Salary']]
                 ).setStrategy('median')

In [18]:
#Adding imputed columns to Dataframe

imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Imputed_Age|Imputed_Experience|Imputed_Salary|
+---------+----+----------+------+-----------+------------------+--------------+
| Vaishnav|  24|        10| 50000|         24|                10|         50000|
|  Vaisakh|  13|         8| 40000|         13|                 8|         40000|
|    Deepa|  46|         6| 30000|         46|                 6|         30000|
|Sreekumar|  54|         4| 20000|         54|                 4|         20000|
|   Kannan|null|      null| 10000|         25|                 6|         10000|
|     null|  25|         2| 15000|         25|                 2|         15000|
|     null|  36|      null|  null|         36|                 6|         20000|
+---------+----+----------+------+-----------+------------------+--------------+

