# Handling Missing Data in PySpark HW Solutions

1. Drop the missing data points (including the entire row).
2. Fill them in with some other value.

Start your Spark session

In [1]:
from pyspark.sql import SparkSession
import numpy as np

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName('nulls').getOrCreate()

# Use the public SparkContext API instead of the private `_jsc` handle.
# The previous expression counted entries in getExecutorMemoryStatus(), i.e.
# executors/block managers, not cores. defaultParallelism reflects the number
# of cores Spark schedules on in local mode (on a cluster it is the total
# executor cores, with a floor of 2, unless spark.default.parallelism is set).
cores = spark.sparkContext.defaultParallelism

print("You are working with {} core(s)".format(cores))
# Leave the session as the last expression so the notebook renders its summary.
spark

You are working with 1 core(s)


## Read in the dataset for this Notebook

In [2]:
import pandas as pd

# Load the weather CSV from S3 into a pandas DataFrame.
# NOTE(review): the bucket portion of the path appears to be masked in this copy.
weather = pd.read_csv("s3://***************/Weather.csv")

In [3]:
# Convert every column to strings before handing the frame to Spark, so each
# column becomes a plain string column (missing values become the text "nan").
# A single frame-wide astype avoids a module-level loop variable named `col`,
# which would shadow pyspark.sql.functions.col if this cell is re-run after
# that function has been imported.
weather = weather.astype('str')

# Build the Spark DataFrame from the all-string pandas frame.
df = spark.createDataFrame(weather)

In [4]:
# Preview the first 8 rows, converted to pandas for a readable notebook table.
df.limit(8).toPandas()

Unnamed: 0,pickup_datetime,tempm,tempi,dewptm,dewpti,hum,wspdm,wspdi,wgustm,wgusti,...,precipm,precipi,conds,icon,fog,rain,snow,hail,thunder,tornado
0,2015-12-31 00:15:00,7.8,46.0,6.1,43.0,89.0,7.4,4.6,,,...,0.5,0.02,Light Rain,rain,0,1,0,0,0,0
1,2015-12-31 00:42:00,7.8,46.0,6.1,43.0,89.0,7.4,4.6,,,...,0.8,0.03,Overcast,cloudy,0,0,0,0,0,0
2,2015-12-31 00:51:00,7.8,46.0,6.1,43.0,89.0,5.6,3.5,,,...,0.8,0.03,Overcast,cloudy,0,0,0,0,0,0
3,2015-12-31 01:51:00,7.2,45.0,5.6,42.1,90.0,7.4,4.6,,,...,0.3,0.01,Overcast,cloudy,0,0,0,0,0,0
4,2015-12-31 02:51:00,7.2,45.0,5.6,42.1,90.0,0.0,0.0,,,...,,,Overcast,cloudy,0,0,0,0,0,0
5,2015-12-31 03:28:00,6.7,44.1,5.0,41.0,89.0,7.4,4.6,,,...,,,Overcast,cloudy,0,0,0,0,0,0
6,2015-12-31 03:40:00,7.2,45.0,5.0,41.0,86.0,0.0,0.0,,,...,,,Overcast,cloudy,0,0,0,0,0,0
7,2015-12-31 03:51:00,7.2,45.0,5.0,41.0,86.0,7.4,4.6,,,...,,,Overcast,cloudy,0,0,0,0,0,0


In [5]:
# Inspect the column names and types of the freshly created Spark DataFrame.
df.printSchema()

root
 |-- pickup_datetime: string (nullable = true)
 |-- tempm: string (nullable = true)
 |-- tempi: string (nullable = true)
 |-- dewptm: string (nullable = true)
 |-- dewpti: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- wspdm: string (nullable = true)
 |-- wspdi: string (nullable = true)
 |-- wgustm: string (nullable = true)
 |-- wgusti: string (nullable = true)
 |-- wdird: string (nullable = true)
 |-- wdire: string (nullable = true)
 |-- vism: string (nullable = true)
 |-- visi: string (nullable = true)
 |-- pressurem: string (nullable = true)
 |-- pressurei: string (nullable = true)
 |-- windchillm: string (nullable = true)
 |-- windchilli: string (nullable = true)
 |-- heatindexm: string (nullable = true)
 |-- heatindexi: string (nullable = true)
 |-- precipm: string (nullable = true)
 |-- precipi: string (nullable = true)
 |-- conds: string (nullable = true)
 |-- icon: string (nullable = true)
 |-- fog: string (nullable = true)
 |-- rain: string (nullable = t

In [6]:
from pyspark.sql.types import DoubleType, IntegerType, DatetimeConverter, DateType
from pyspark.sql.functions import col
# Target Spark type for each column that gets re-cast from string.
# Columns not listed here (e.g. wgusti, visi, wdire, conds, icon) are left as-is.
# NOTE(review): DateType keeps only the calendar date of pickup_datetime --
# confirm that dropping the time of day is intended.
# NOTE(review): wgustm/vism are cast but wgusti/visi are not -- confirm.
double_columns = [
    "tempm", "tempi", "dewptm", "dewpti", "hum",
    "wspdm", "wspdi", "wgustm", "vism",
    "pressurem", "pressurei",
    "windchillm", "windchilli",
    "heatindexm", "heatindexi",
    "precipm", "precipi",
]
integer_columns = ["wdird", "fog", "rain", "snow", "hail", "thunder", "tornado"]

column_types = {"pickup_datetime": DateType()}
column_types.update({name: DoubleType() for name in double_columns})
column_types.update({name: IntegerType() for name in integer_columns})

# withColumn on an existing name replaces that column in place, so the
# column order of df is unchanged.
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

In [10]:
from pyspark.sql.functions import regexp_replace

In [28]:
# Print the schema again to check the column types after the casts.
df.printSchema()

root
 |-- pickup_datetime: date (nullable = true)
 |-- tempm: double (nullable = true)
 |-- tempi: double (nullable = true)
 |-- dewptm: double (nullable = true)
 |-- dewpti: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- wspdm: double (nullable = true)
 |-- wspdi: double (nullable = true)
 |-- wgustm: double (nullable = true)
 |-- wgusti: string (nullable = true)
 |-- wdird: integer (nullable = true)
 |-- wdire: string (nullable = true)
 |-- vism: double (nullable = true)
 |-- visi: string (nullable = true)
 |-- pressurem: double (nullable = true)
 |-- pressurei: double (nullable = true)
 |-- windchillm: double (nullable = true)
 |-- windchilli: double (nullable = true)
 |-- heatindexm: double (nullable = true)
 |-- heatindexi: double (nullable = true)
 |-- precipm: double (nullable = true)
 |-- precipi: double (nullable = true)
 |-- conds: string (nullable = true)
 |-- icon: string (nullable = true)
 |-- fog: integer (nullable = true)
 |-- rain: integer (nullable = 

In [40]:
# Normalise both spellings of "missing" to real nulls so isNull() can find them:
#   1. the literal text "nan" (presumably left in string columns by the
#      earlier astype('str') conversion -- verify against the data), and
#   2. floating-point NaN values in numeric columns.
df = (
    df.replace("nan", None)
      .replace(float("NaN"), None)
)

## 1. How much missing data are we working with?

Get the count and percentage of missing values for each variable in the dataset to answer this question.

In [43]:
%%time
from pyspark.sql.functions import col

def null_value_calc(df):
    """Summarise which columns of a Spark DataFrame contain null values.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A list of ``(column_name, null_count, null_percent)`` tuples, one for
        each column that has at least one null, in ``df.columns`` order.
        ``null_percent`` is the share of rows that are null, in percent.
        Returns an empty list when ``df`` has no rows or no columns.
    """
    # Imported locally so the function does not rely on imports from other cells.
    from pyspark.sql.functions import col, count, when

    numRows = df.count()
    # Guard against division by zero (empty frame) and an empty select list.
    if numRows == 0 or not df.columns:
        return []

    # when() without otherwise() yields null where the condition is false, and
    # count() ignores nulls, so each expression counts the null rows of one
    # column. All columns are aggregated in a single Spark job instead of
    # launching one count() job per column.
    null_counts = df.select(
        [count(when(col(k).isNull(), 1)).alias(k) for k in df.columns]
    ).first()

    # The Row is positional, so pair it with df.columns to keep the order.
    return [
        (k, nullRows, (nullRows / numRows) * 100)
        for k, nullRows in zip(df.columns, null_counts)
        if nullRows > 0
    ]

# Compute the per-column null summary for the weather DataFrame.
null_columns_calc_list = null_value_calc(df)

# Spell out the schema instead of passing only column names: Spark cannot
# infer column types from an empty list, so the name-only form raises an error
# when no column has nulls. These types match what inference produces for
# (str, int, float) tuples, so the displayed table is unchanged.
null_summary_schema = (
    "Column_With_Null_Value string, "
    "Null_Values_Count long, "
    "Null_Value_Percent double"
)

spark.createDataFrame(null_columns_calc_list, schema=null_summary_schema).show()

+----------------------+-----------------+-------------------+
|Column_With_Null_Value|Null_Values_Count| Null_Value_Percent|
+----------------------+-----------------+-------------------+
|                 tempm|                5|0.04770537162484496|
|                 tempi|                5|0.04770537162484496|
|                dewptm|                5|0.04770537162484496|
|                dewpti|                5|0.04770537162484496|
|                   hum|                5|0.04770537162484496|
|                 wspdm|              737|  7.031771777502146|
|                 wspdi|              737|  7.031771777502146|
|                wgustm|             8605|  82.10094456635817|
|                wgusti|             8605|  82.10094456635817|
|                  vism|              245| 2.3375632096174033|
|                  visi|              245| 2.3375632096174033|
|             pressurem|              239| 2.2803167636675887|
|             pressurei|              239| 2.2803167636