In [1]:
# Download JAR from https://spark-packages.org/package/databricks/spark-csv (1.5.0)

# Command to start: pyspark --packages com.databricks:spark-csv_2.11:1.5.0

from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
import pandas as pd
from pyspark.ml import Pipeline

In [2]:
sqlContext = SQLContext(sc)

In [3]:
dataDF = sqlContext.read.load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/daily_weather.csv', 
                          format='com.databricks.spark.csv', 
                          header='true',inferSchema='true')

In [4]:
dataDF.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [5]:
dataDF.describe().show()

+-------+------------------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|            number|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1095|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean|             547.0| 918.8825513138097| 64.933001412

In [6]:
dataDF = dataDF.drop('number') # row number
dataDF.columns

['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

# Hands On 1: Data Exploration

In [7]:
# Displat first few lines
dataDF.take(2)

[Row(air_pressure_9am=918.0600000000087, air_temp_9am=74.82200000000041, avg_wind_direction_9am=271.1, avg_wind_speed_9am=2.080354199999768, max_wind_direction_9am=295.39999999999986, max_wind_speed_9am=2.863283199999908, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=42.42000000000046, relative_humidity_3pm=36.160000000000494),
 Row(air_pressure_9am=917.3476881177097, air_temp_9am=71.40384263106537, avg_wind_direction_9am=101.93517935618371, avg_wind_speed_9am=2.4430092157340217, max_wind_direction_9am=140.47154847112498, max_wind_speed_9am=3.5333236016106238, rain_accumulation_9am=0.0, rain_duration_9am=0.0, relative_humidity_9am=24.328697291802207, relative_humidity_3pm=19.4265967985621)]

### Summary Statistics

In [8]:
#Columns in DataFrame
len(dataDF.columns)

10

In [9]:
#Rows in DataFrame
dataDF.count()

1095

In [10]:
# Show summary of one column
dataDF.describe("air_pressure_9am").show()

#Notice the Nan

+-------+------------------+
|summary|  air_pressure_9am|
+-------+------------------+
|  count|              1092|
|   mean| 918.8825513138097|
| stddev|3.1841611803868353|
|    min| 907.9900000000024|
|    max| 929.3200000000012|
+-------+------------------+



In [11]:
# Drop the rows with missing values on specific column and show statistics
M = dataDF.na.drop(subset=['air_pressure_9am'])

M.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1092|              1087|                  1088|              1089|                  1089|              1088|                 1086|              1089|                 1092|                 1092|
|   mean| 918.8825513138097| 64.96896753146554|    142.09831025625775| 5.504611894676135|    148.78720908602853| 7.014192386313907| 

In [12]:
# Drop the rows with missing values and check statistics
M.describe("air_pressure_9am").show()

+-------+------------------+
|summary|  air_pressure_9am|
+-------+------------------+
|  count|              1092|
|   mean| 918.8825513138097|
| stddev|3.1841611803868353|
|    min| 907.9900000000024|
|    max| 929.3200000000012|
+-------+------------------+



# Pairwise Correlation

In [13]:
# Correlation between two columns
dataDF.stat.corr("rain_accumulation_9am","rain_duration_9am")

0.7337968783310981

# Hands on 2: Data Preparation

## Handling Missing Values

## Missing Values: Remove them

In [14]:
dataDF.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean| 918.8825513138097| 64.93300141287075|    142.23551070057584|  5.50828424225493|     148.9535179651692| 7.019513529175272|  0.2

In [15]:
#Remove 
dataDF_remove = dataDF.na.drop()

In [16]:
dataDF_remove.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|              1064|              1064|                  1064|              1064|                  1064|              1064|                 1064|              1064|                 1064|                 1064|
|   mean| 918.9031798641055| 65.02260949558739|    142.30675564934032| 5.485793050713691|    148.48042413321312|6.9997136588756925| 

In [17]:
dataDF_replace = dataDF.na.fill(0)
dataDF_replace.describe().show()

+-------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|summary| air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am| rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+-----------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+------------------+---------------------+---------------------+
|  count|             1095|              1095|                  1095|              1095|                  1095|              1095|                 1095|              1095|                 1095|                 1095|
|   mean|916.3650648718541|  64.6365036895243|    141.71592892632717| 5.493193052550121|    148.54542613512763| 6.993871470621208|   0.2

In [18]:
dataDF.describe().show()

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean| 918.8825513138097| 64.93300141287075|    142.23551070057584|  5.50828424225493|     148.9535179651692| 7.019513529175272|  0.2

## Missing values : Replace with Mean

In [19]:
# let's checkout the original : count leaves NULL values
dataDF.describe().show() 

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1092|              1090|                  1091|              1092|                  1092|              1091|                 1089|             1092|                 1095|                 1095|
|   mean| 918.8825513138097| 64.93300141287075|    142.23551070057584|  5.50828424225493|     148.9535179651692| 7.019513529175272|  0.2

In [20]:
from pyspark.sql.functions import avg

# function: fill(double value, scala.collection.Seq<String> cols)
R = dataDF # creat a copy

#go through each column and replace missing values with mean

for x in R.columns:
    meanValue = R.na.drop().agg(avg(x)).first()[0]
    R         = R.na.fill(meanValue, [x])

In [21]:
R.describe().show() # mean has changed a little bit

+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|summary|  air_pressure_9am|      air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|
+-------+------------------+------------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+
|  count|              1095|              1095|                  1095|              1095|                  1095|              1095|                 1095|             1095|                 1095|                 1095|
|   mean| 918.8826078303855| 64.93324212458779|     142.2364253915363| 5.508212238967135|     148.9535144804564| 7.019460030404404|   0.

# Hands On 3: Classification

In [27]:
from pyspark.ml.classification import DecisionTreeClassifier

#Remove all rows with missing vaues
dataDF = dataDF.na.drop()

## Features Column

In [28]:
cols = ['air_pressure_9am','air_temp_9am','avg_wind_direction_9am','avg_wind_speed_9am','max_wind_direction_9am','max_wind_speed_9am','rain_accumulation_9am','rain_duration_9am']

#### Create a Categorical column: 0 means relative_humidity_3pm is less than 25


In [29]:
from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=24.99999, inputCol="relative_humidity_3pm", outputCol="label")
binarizedDataFrame = binarizer.transform(dataDF)

binarizedDataFrame.select("relative_humidity_3pm","label").show()

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
|    76.74000000000046|  1.0|
|   33.930000000000256|  1.0|
|   21.385656725200974|  0.0|
|    74.92000000000041|  1.0|
|   24.030000000000427|  0.0|
|     68.0500000000012|  1.0|
|    32.13000000000024|  1.0|
|     79.0900000000002|  1.0|
|    58.43000000000119|  1.0|
|   27.990000000000173|  1.0|
|   24.369999999999948|  0.0|
|   14.801705962979918|  0.0|
|    20.75568332171184|  0.0|
|    45.87000000000005|  1.0|
|    7.740000000000088|  0.0|
|   14.649909361535952|  0.0|
+---------------------+-----+
only showing top 20 rows



In [30]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

assembler = VectorAssembler(inputCols=cols, outputCol="features")
assembled = assembler.transform(binarizedDataFrame)

assembled

DataFrame[air_pressure_9am: double, air_temp_9am: double, avg_wind_direction_9am: double, avg_wind_speed_9am: double, max_wind_direction_9am: double, max_wind_speed_9am: double, rain_accumulation_9am: double, rain_duration_9am: double, relative_humidity_9am: double, relative_humidity_3pm: double, label: double, features: vector]

In [31]:
# delete before classification
dataDF = dataDF.drop('relative_humidity_9am')
dataDF = dataDF.drop('relative_humidity_3pm')

## Test and Training Data

In [32]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = assembled.randomSplit([0.7, 0.3], seed = 1234)

In [33]:
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(assembled)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer= VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(assembled)

## Decision Tree in Spark

In [1]:
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                  maxDepth=5,  minInstancesPerNode=20, 
                  impurity="gini")


# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

predictions.select("prediction", "indexedLabel", "label").toPandas().to_csv("predictions.csv", index=False )

NameError: name 'DecisionTreeClassifier' is not defined

# Hands On 4: Evaluation of Machine Learning Models

## Accuracy - Decision Tree

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="precision")

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

Accuracy = 0.765677 


## Confusion Matrix - Decision Tree

In [36]:
from sklearn.metrics import confusion_matrix

confusion_matrix(predictions.select('label').toPandas(), predictions.select('prediction').toPandas())

array([[104,  33],
       [ 38, 128]])