# Spark DataFrames Missing Data

In [1]:
import findspark
# Runs before `import pyspark` on purpose: findspark.init() locates the
# local Spark installation and makes pyspark importable from it.
findspark.init()
import pyspark
# Show which Spark home directory findspark resolved.
findspark.find()

'D:\\Spark\\spark-3.2.1-bin-hadoop3.2'

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the Spark session that the later cells read data through.
spark = (
    SparkSession.builder
    .appName('missing data')
    .getOrCreate()
)

In [7]:
# Load the sample CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    './doc/Spark_DataFrames/ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Preview the loaded rows; missing entries are rendered as `null`.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



#### Drop rows with null values

In [16]:
# With no arguments, drop every row that contains at least one null.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [15]:
# thresh=2 keeps only rows that have at least 2 non-null values,
# so emp2 (only Id is set) is the one row that gets dropped.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# how='all' drops a row only when every column in it is null.
# Id is always set in this data, so no row is removed here.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



#### Fill null values

In [18]:
# A string fill value is applied only to string-typed columns: Name is
# filled, while the numeric Sales column keeps its nulls.
df.na.fill('FILL VALUE').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [19]:
# Fill nulls in the Sales column only; other columns are left untouched.
df.na.fill(0, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [20]:
from pyspark.sql.functions import mean

In [21]:
# Aggregate the Sales column down to its mean; collect() returns the
# one-row result as a list of Row objects.
mean_val = (
    df
    .select(mean(df['Sales']))
    .collect()
)

In [22]:
# First row, first column of the collected result: the mean itself.
# Null Sales values are ignored, so this is (345.0 + 456.0) / 2.
mean_val[0][0]

400.5

In [23]:
# Take the scalar mean out of the collected rows and use it to replace
# the missing Sales values.
mean_sales = mean_val[0][0]
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

