## Purpose of script:
#### Reviewing PySpark and ways to deal with missing data: dropping rows that contain nulls (`df.na.drop`) and filling nulls with a value (`df.na.fill`)
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session named 'missing_data', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('missing_data')
    .getOrCreate()
)

In [3]:
# Load the sample CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../Datasets/ContainsNull.csv')
)

In [7]:
# Inspect the column types that were inferred when the CSV was read with inferSchema=True.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [4]:
# Display the raw data so the null entries are visible before any cleaning.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# Drop every row that contains at least one null value.
# The result is a new DataFrame; df itself is not modified, so later cells
# still start from the full data.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# thresh is the minimum number of non-null values a row must have to be kept.
# With thresh=2, a row with only one non-null value (emp2 here) is dropped.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [12]:
# how='any' drops a row if any of its columns is null. On this data the result
# matches the plain df.na.drop() call, since 'any' is the documented default.
# (The documented alternative, how='all', drops a row only when every column is null.)
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [13]:
# Only drop rows that have a null value in the Sales column;
# nulls in other columns (e.g. Name for emp3) do not cause a row to be dropped.
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [16]:
# Replace nulls in the Name column with a placeholder string; Sales is left untouched.
df.na.fill('Missing name', subset=['Name']).show()

+----+------------+-----+
|  Id|        Name|Sales|
+----+------------+-----+
|emp1|        John| null|
|emp2|Missing name| null|
|emp3|Missing name|345.0|
|emp4|       Cindy|456.0|
+----+------------+-----+



In [17]:
# Replace nulls in the Sales column with 0 (displayed as 0.0 because Sales is a double column).
df.na.fill(0, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [18]:
from pyspark.sql.functions import mean

In [19]:
# Filling missing numeric values with the column mean.
# Step 1: aggregate the mean of Sales. collect() returns the result as a list
# of Row objects -- here a single Row with a single field.
# Null Sales values are not counted: (345.0 + 456.0) / 2 = 400.5.
mean_val = df.select(mean(df['Sales'])).collect()

mean_val

[Row(avg(Sales)=400.5)]

In [22]:
# Step 2: unwrap the aggregate -- first (only) Row, first (only) field --
# to get the mean as a plain number.
mean_sales = mean_val[0][0]

mean_sales

400.5

In [23]:
# Step 3: fill the null Sales entries with the mean held in mean_sales.
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [24]:
# Shut down the Spark session now that the walkthrough is complete.
spark.stop()