# Dealing with Missing Values

## Initialize the Spark Session

In [5]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists; otherwise start one named "missing_values".
spark = (
    SparkSession.builder
    .appName("missing_values")
    .getOrCreate()
)

## Import Data 

In [11]:
# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to deduce each column's type from the data.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("ContainsNull.csv")
)
data.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Drop Missing data 


In [13]:
# Default behaviour (how="any"): a row is dropped as soon as one of its columns is null.
data.dropna().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



With no arguments, `na.drop()` removes every row that contains at least one missing value.

## Drop Missing Data - Keep Only Rows with a Minimum Number of Non-Null Values

In [14]:
# thresh=2 keeps only the rows that have at least two non-null values.
data.dropna(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



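`thresh=2` keeps only the rows that have at least 2 non-null values. With three columns, this removes the rows with 2 or more missing values (here, `emp2`).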

In [17]:
# how="all" drops a row only when every one of its columns is null.
# No row in this data set is entirely null, so all rows are kept.
data.dropna(how="all").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Drop with a Subset Condition

In [19]:
# Only the Sales column is inspected for nulls; nulls in other columns are ignored.
data.dropna(subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



Remove rows that have a missing value in specific columns, chosen with the `subset` parameter.

## Fill Missing Values

In [23]:
# A string replacement is applied to string columns only, so the numeric
# Sales column keeps its nulls.
data.fillna("MissingValue").show()

+----+------------+-----+
|  Id|        Name|Sales|
+----+------------+-----+
|emp1|        John| null|
|emp2|MissingValue| null|
|emp3|MissingValue|345.0|
|emp4|       Cindy|456.0|
+----+------------+-----+



In [28]:
from pyspark.sql.functions import mean 

## Replacing Missing Values with the Mean

In [36]:
# Compute the mean of Sales (nulls are skipped) and bring the one-row result
# back to the driver.
mean_val = data.select(mean("Sales")).collect()
# The aggregate yields a single row with a single column: the mean itself.
mean_sales = mean_val[0][0]
# Fill nulls with the mean in the Sales column only.
data.fillna(mean_sales, subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [44]:
# Mean imputation written as a single expression, without intermediate variables.
data.fillna(
    data.agg(mean("Sales")).first()[0],  # scalar mean of the non-null Sales values
    subset=["Sales"],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

