In [1]:
import pyspark
from pyspark.sql import SparkSession
# Start a Spark session named 'Practice', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)
# Leave the session as the last expression so the notebook renders its summary.
spark

In [2]:
# Load the sample CSV: the first line supplies the column names and the
# column types are inferred from the data instead of defaulting to string.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
# Print the loaded rows, including the NULLs the later cells work on.
df_pyspark.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  20|        10|  3000|
|   b|  30|         8|  2500|
|   c|  40|         4|  1000|
|   e|NULL|         2|  3000|
|NULL|  30|         2|  2000|
+----+----+----------+------+



## Dropping Null Values

In [3]:
# how='any' is the default: a row is dropped as soon as one of its columns is NULL.
df_pyspark.na.drop(how='any').show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 20|        10|  3000|
|   b| 30|         8|  2500|
|   c| 40|         4|  1000|
+----+---+----------+------+



In [4]:
# how='all' removes a row only when every value in that row is NULL.
# No row in this data is entirely NULL, so all five rows are kept.
(
    df_pyspark
    .na.drop(how='all')
    .show()
)

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  20|        10|  3000|
|   b|  30|         8|  2500|
|   c|  40|         4|  1000|
|   e|NULL|         2|  3000|
|NULL|  30|         2|  2000|
+----+----+----------+------+



In [7]:
# If a row has at least two non-null values it is kept; otherwise it is dropped.
# `thresh` overrides `how`, so `how` is left out to avoid implying both apply.
df_pyspark.na.drop(thresh=2).show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  20|        10|  3000|
|   b|  30|         8|  2500|
|   c|  40|         4|  1000|
|   e|NULL|         2|  3000|
|NULL|  30|         2|  2000|
+----+----+----------+------+



In [8]:
# Drop rows that have a NULL in any of the listed columns (here only 'Age').
# how='any' keeps that meaning if more columns are added to `subset`;
# how='all' would then drop a row only when every listed column is NULL.
df_pyspark.na.drop(how='any', subset=['Age']).show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 20|        10|  3000|
|   b| 30|         8|  2500|
|   c| 40|         4|  1000|
|NULL| 30|         2|  2000|
+----+---+----------+------+



## Filling Missing Values

In [11]:
# A string fill value only replaces NULLs in string columns, so Name is
# filled while the integer Age column keeps its NULL.
df_pyspark.na.fill(value='Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|             a|  20|        10|  3000|
|             b|  30|         8|  2500|
|             c|  40|         4|  1000|
|             e|NULL|         2|  3000|
|Missing Values|  30|         2|  2000|
+--------------+----+----------+------+



In [12]:
# Display the DataFrame repr to check the column names and inferred types;
# na.fill above returned a new DataFrame, so df_pyspark itself is unchanged.
df_pyspark

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

#### Replace the missing values with the mean or median

In [13]:
from pyspark.ml.feature import Imputer

# Configure an Imputer that fills NULLs in each numeric column with that
# column's mean and writes the result to a new "<column>_imputed" column.
# Use strategy="median" to impute with the median instead.
imputer = Imputer(
    strategy="mean",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[f"{name}_imputed" for name in ['Age', 'Experience', 'Salary']],
)

In [14]:
# Fit the imputer on the data to learn each column's fill value, then apply
# it to the same data; the original columns stay and *_imputed ones are added.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+----+----+----------+------+-----------+------------------+--------------+
|Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+----+----+----------+------+-----------+------------------+--------------+
|   a|  20|        10|  3000|         20|                10|          3000|
|   b|  30|         8|  2500|         30|                 8|          2500|
|   c|  40|         4|  1000|         40|                 4|          1000|
|   e|NULL|         2|  3000|         30|                 2|          3000|
|NULL|  30|         2|  2000|         30|                 2|          2000|
+----+----+----------+------+-----------+------------------+--------------+

