In [60]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
import pandas as pd

In [61]:
# Read the daily weather CSV through the spark-csv data source, treating the
# first line as a header and letting Spark infer each column's type.
# NOTE(review): `sqlContext` is not created in the visible cells; presumably
# it is supplied by the PySpark kernel -- confirm.
dataDFX = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/daily_weather.csv')
)

In [62]:
# Remove the 'number' column: it only holds the row number of each record.
dataDF = dataDFX.drop('number') # row number

# Hands On 3: Classification

## Features Column

In [63]:
# Names of the 9am measurement columns that are later assembled into the
# feature vector.
cols = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

#### Create a categorical label column: 0 means relative_humidity_3pm is less than 25, 1 otherwise


In [64]:
# Discard every row that contains at least one missing value.
dataDF = dataDF.dropna()

# Display (number of rows, number of columns) after the drop.
(dataDF.count(), len(dataDF.columns))

(1064, 10)

In [65]:
from pyspark.ml.feature import Binarizer

# Derive a binary "label" column from the 3pm relative humidity.
# Binarizer emits 1.0 for values strictly greater than the threshold and 0.0
# otherwise, so a threshold just below 25 labels humidity of 25 and above
# as 1.0 and anything lower as 0.0.
binarizer = Binarizer(
    inputCol="relative_humidity_3pm",
    outputCol="label",
    threshold=24.99999,
)
binarizedDataFrame = binarizer.transform(dataDF)

# Preview the raw humidity next to the label derived from it.
binarizedDataFrame.select("relative_humidity_3pm", "label").show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



In [66]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

# Pack the columns listed in `cols` into a single vector column named
# "features", which is the input column the classifier is configured to read.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=cols,
)
assembled = assembler.transform(binarizedDataFrame)

## Test and Training Data

In [67]:
# Split the data into training and test sets (20% held out for testing).
# The fixed seed makes the random split repeatable.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

# Samples in each set:
trainingData.count(), testData.count()

(854, 210)

## Decision Tree in Spark

In [68]:
# Configure a DecisionTree classifier that reads the assembled "features"
# vector and predicts the binary "label" column. Tree growth is limited to
# depth 5 and every node must keep at least 20 training rows; splits are
# scored with Gini impurity.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                            maxDepth=5, minInstancesPerNode=20,
                            impurity="gini")

# Pipeline: the classifier is the only stage (no indexers or other
# transformers are part of it).
pipeline = Pipeline(stages=[dt])

# Train the model on the training split.
model = pipeline.fit(trainingData)

# Make predictions on the held-out test split and preview a few rows.
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       1.0|  1.0|[908.970000000004...|
|       1.0|  1.0|[912.890000000011...|
|       1.0|  1.0|[912.990000000012...|
|       1.0|  1.0|[913.060000000003...|
|       1.0|  1.0|[913.070000000008...|
+----------+-----+--------------------+
only showing top 5 rows



# Hands On 4: Evaluation of Machine Learning Models

## Accuracy - Decision Tree

In [69]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare each prediction with its true label and compute the test-set
# accuracy (not the test error).
# The metric is requested under the name "precision"; the result is reported
# below as accuracy, i.e. the overall fraction of correctly classified rows.
# NOTE(review): newer Spark releases expose this metric as "accuracy" instead
# -- confirm against the Spark version in use before changing the name.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="precision")

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

Accuracy = 0.809524 


## Confusion Matrix - Decision Tree

In [70]:
from sklearn.metrics import confusion_matrix

# Collect labels and predictions with ONE Spark job. Fetching the two columns
# through separate toPandas() calls would evaluate the pipeline twice and rely
# on both jobs returning rows in the same order; a single collect keeps each
# label paired with its own prediction.
labelsAndPredictions = predictions.select('label', 'prediction').toPandas()

# confusion_matrix(y_true, y_pred): rows are true labels, columns are
# predicted labels.
confusion_matrix(labelsAndPredictions['label'], labelsAndPredictions['prediction'])

array([[87, 14],
       [26, 83]])