In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a SparkSession for this notebook under the application name 'missing'.
spark = (
    SparkSession.builder
    .appName('missing')
    .getOrCreate()
)

In [3]:
# Load the sample CSV: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "input_data/ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# head(n) returns the first n rows as a list of Row objects instead of
# printing a formatted table; use show() when a tabular view is wanted.
df.head(2)

[Row(Id='emp1', Name='John', Sales=None),
 Row(Id='emp2', Name=None, Sales=None)]

In [9]:
# With no arguments, drop() removes every row that has a null in any column.
rows_without_nulls = df.na.drop()
rows_without_nulls.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [11]:
# thresh=2 keeps only the rows that have at least two non-null values,
# so emp2 (where only Id is populated) is dropped.
rows_min_two_values = df.na.drop(thresh=2)
rows_min_two_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [13]:
# how='any' drops a row as soon as one of its columns is null;
# how='all' drops a row only when every column is null. No row in this
# data is entirely null (Id is always set), so the second table keeps
# all four rows.
for drop_mode in ('any', 'all'):
    df.na.drop(how=drop_mode).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
# Restrict the null check to the Sales column: a row is dropped only when
# Sales itself is null, regardless of nulls in the other columns.
rows_with_sales = df.na.drop(subset='Sales')
rows_with_sales.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [16]:
# fill() only touches columns whose type matches the fill value: a string
# value replaces nulls in string columns (Name here) and leaves the numeric
# Sales column untouched. Use df.printSchema() to check the column types.
filled_strings = df.na.fill('test')
filled_strings.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| test| null|
|emp3| test|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# A numeric fill value is applied to numeric columns only: nulls in Sales
# become 1.44 while nulls in the string column Name are left as they are.
filled_numbers = df.na.fill(1.44)
filled_numbers.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| 1.44|
|emp2| null| 1.44|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [18]:
# Limit the fill to a single column by naming it in `subset`.
names_filled = df.na.fill('Test', subset="Name")
names_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| Test| null|
|emp3| Test|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [20]:
# Fill null values with the mean of the existing (non-null) values
from pyspark.sql.functions import mean

In [24]:
# Whether to drop or fill missing values depends on the goal of the analysis.
# Here the mean of Sales is computed; collect() returns the aggregate
# as a list of Row objects.
sales_mean_col = mean(df['Sales'])
mean_val = df.select(sales_mean_col).collect()

In [22]:
# Take the first row of the collected result, then its first field,
# to get the mean as a plain Python value.
mean_row = mean_val[0]
mean_sales = mean_row[0]

In [23]:
# Replace the nulls in Sales with the previously computed column mean.
sales_mean_filled = df.na.fill(mean_sales, subset='Sales')
sales_mean_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

