In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
import pandas as pd
from pyspark.ml import Pipeline

In [2]:
sqlContext = SQLContext(sc)
dataDF = sqlContext.read.load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/daily_weather.csv', 
                          format='com.databricks.spark.csv', 
                          header='true',inferSchema='true')

In [3]:
dataDF = dataDF.drop('number') # row number

# Hands on 2: Data Preparation

## Handling Missing Values

## (1) Missing Values: Remove them

In [4]:
dataDF.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean| 918.8825513138097| 64.93300141287075|    142.23551070057584|  5.50828424225493|     148.9535179651692| 7.019513529175272|  0.2

In [5]:
#Remove 
dataDF_remove = dataDF.na.drop()

In [6]:
dataDF_remove.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1064|              1064|                  1064|              1064|                  1064|              1064|                 1064|              1064|                 1064|                 1064|
|   mean| 918.9031798641055| 65.02260949558739|    142.30675564934032| 5.485793050713691|    148.48042413321312|6.9997136588756925| 

## Missing values : Replace with Mean

In [7]:
# let's checkout the original : count leaves NULL values

# count = number of rows
dataDF.describe().show() 

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean| 918.8825513138097| 64.93300141287075|    142.23551070057584|  5.50828424225493|     148.9535179651692| 7.019513529175272|  0.2

In [8]:
from pyspark.sql.functions import avg

# function: fill(double value, scala.collection.Seq<String> cols)
R = dataDF # creat a copy

#go through each column and replace missing values with mean

for x in R.columns:
    meanValue = R.na.drop().agg(avg(x)).first()[0]
    R         = R.na.fill(meanValue, [x])

In [9]:
R.describe().show() 

# check out the counts
# mean has changed a little bit

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1095|              1095|                  1095|              1095|                  1095|              1095|                 1095|             1095|                 1095|                 1095|
|   mean| 918.8826078303855| 64.93324212458779|     142.2364253915363| 5.508212238967135|     148.9535144804564| 7.019460030404404|   0.