In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
import pandas as pd
from pyspark.ml import Pipeline

In [63]:
from pyspark.ml.classification import DecisionTreeClassifier
import pandas as pd

In [64]:
# Load the daily weather observations with pandas and discard the
# 'number' column in the same expression.
dataDFY = pd.read_csv("daily_weather.csv", sep=',').drop('number', axis=1)
dataDFY.head(2)

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597


In [65]:
#dataDFX = sqlContext.read.load('file:///home/cloudera/coursera/courseraDataSimulation/course4-ML/daily_weather.csv', 
#                          format='com.databricks.spark.csv', 
#                          header='true',inferSchema='true')

In [66]:
#dataDFX = dataDFX.drop('number') # row number
#dataDFX.columns

# Hands On 3: Classification

## Features Column

In [67]:
# Names of the 9am measurements that are assembled into the feature vector.
cols = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [68]:
# Work on an independent copy of the frame that pandas read from the CSV,
# so that dataDFY itself is left untouched (copy() is deep by default).
dataDFf = dataDFY.copy()

In [69]:
# Build a SQLContext on top of the SparkContext `sc` and convert the pandas
# frame into a Spark DataFrame. No rows are removed in this cell.
# NOTE(review): `sc` is not created anywhere in the visible notebook;
# presumably it is supplied by the PySpark kernel -- confirm.
sqlContext = SQLContext(sc)
dataDFq = sqlContext.createDataFrame(dataDFf) # Spark counterpart of the pandas frame dataDFf

#### Create a categorical label column: 0 means relative_humidity_3pm is below 25, 1 means it is 25 or above


In [70]:
# pandas route: discard every row that contains a missing value and keep
# the result as an independent copy; then show its (rows, columns) shape.
dataDFpandas = dataDFf.dropna().copy()
dataDFpandas.shape

(1064, 10)

In [71]:
# Spark route: discard every row that contains a missing value.
dataDFspark = dataDFq.na.drop()

# (row count, column count) -- for comparison with the pandas shape above
(dataDFspark.count(), len(dataDFspark.columns))

(1064, 10)

In [72]:
# Row values of the Spark-cleaned and the pandas-cleaned frames, each sorted
# by its index, as plain nested lists so that `B == K` yields a single bool.
#
# Do not use sort_index(inplace=True) here: it returns None, which would bind
# both names to None and make the equality check trivially True.
# Index labels are deliberately left out of the comparison: the pandas frame
# keeps the labels it had before dropna(), the frame coming back from Spark
# does not.
B = dataDFspark.toPandas().sort_index().values.tolist()
K = dataDFpandas.sort_index().values.tolist()

In [73]:
# Equality check between the two objects bound in the previous cell.
B == K

True

In [74]:
# Cross-check of the two cleaning routes: stack both frames on a fresh
# 0..n-1 index, group rows that are identical in every column, and keep the
# first index of each group that has exactly one member. An empty result
# means no row occurs only once in the stacked frame.
df = pd.concat([dataDFspark.toPandas(), dataDFpandas], ignore_index=True)
df_gpby = df.groupby(df.columns.tolist())
idx = [rows[0] for rows in df_gpby.groups.values() if len(rows) == 1]
df.reindex(idx)

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm


In [75]:
# Option 1 (active): continue with the Spark frame whose rows with missing
# values were removed through Spark's .na.drop().
dataDF = dataDFspark # Spark-cleaned frame

# Option 2 (disabled): build the Spark frame from the pandas-cleaned copy instead.
#sqlContext = SQLContext(sc)
#dataDF = sqlContext.createDataFrame(dataDFpandas)

In [76]:
# Sanity check on the frame used from here on: (number of columns, number of rows)
n_columns = len(dataDF.columns)
n_rows = dataDF.count()
(n_columns, n_rows)

(10, 1064)

In [77]:
from pyspark.ml.feature import Binarizer

# Derive the binary target column "label" from the 3pm relative humidity.
# The threshold sits just under 25 so that, as the preview shows, readings
# above it are labelled 1.0 and readings below it 0.0.
binarizer = Binarizer(
    threshold=24.99999,
    inputCol="relative_humidity_3pm",
    outputCol="label")
binarizedDataFrame = binarizer.transform(dataDF)

# Preview the source column next to the generated label
binarizedDataFrame.select("relative_humidity_3pm", "label").show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761847|  0.0|
+---------------------+-----+
only showing top 4 rows



In [78]:
# Remove both humidity columns from dataDF before classification.
# NOTE(review): binarizedDataFrame was derived from dataDF before this cell
# and is the frame passed to the assembler, so it is not changed by this
# drop -- its schema still lists both humidity columns.
dataDF = dataDF.drop('relative_humidity_9am').drop('relative_humidity_3pm')

In [79]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

# Pack the input columns listed in `cols` into one vector column, "features".
assembler = VectorAssembler(
    inputCols=cols,
    outputCol="features")
assembled = assembler.transform(binarizedDataFrame)

# Display the resulting schema
assembled

DataFrame[air_pressure_9am: double, air_temp_9am: double, avg_wind_direction_9am: double, avg_wind_speed_9am: double, max_wind_direction_9am: double, max_wind_speed_9am: double, rain_accumulation_9am: double, rain_duration_9am: double, relative_humidity_9am: double, relative_humidity_3pm: double, label: double, features: vector]

## Test and Training Data

In [80]:
# Seeded random partition of the assembled rows:
# 70% for training, 30% held out for testing.
trainingData, testData = assembled.randomSplit([0.7, 0.3], seed=1234)

In [81]:
# Index the "label" column into "indexedLabel". The indexer is fitted on the
# complete assembled frame, not only on the training split.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel").fit(assembled)

# Identify categorical features automatically and index them.
# With maxCategories=4, a feature with more than 4 distinct values is
# treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4).fit(assembled)

## Decision Tree in Spark

In [82]:
# Decision-tree estimator that reads the indexed label / feature columns.
dt = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    impurity="gini",
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=20,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10)

# Pipeline stages, in order: label indexer -> feature indexer -> decision tree
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Fit on the training split; the indexer stages are applied as part of the fit.
model = pipeline.fit(trainingData)

# Score the held-out split and preview the first few predictions.
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|[907.990000000002...|
|       1.0|         1.0|[908.420000000007...|
|       1.0|         1.0|[908.970000000004...|
|       1.0|         1.0|[913.060000000003...|
|       1.0|         0.0|[913.633267677041...|
+----------+------------+--------------------+
only showing top 5 rows



# Hands On 4: Evaluation of Machine Learning Models

## Accuracy - Decision Tree

In [83]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the predicted class with the indexed true label on the test
# predictions and report the score as accuracy.
# NOTE(review): the metric requested is "precision" while the printed label
# says accuracy; newer Spark releases reportedly no longer accept "precision"
# here and use "accuracy" instead -- confirm against the Spark version in use.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="precision")

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % accuracy)

Accuracy = 0.762376 


## Confusion Matrix - Decision Tree

In [84]:
from sklearn.metrics import confusion_matrix

# The tree is trained with labelCol="indexedLabel", so `prediction` is
# expressed in the StringIndexer's index space. Compare it with
# `indexedLabel` (the same column the evaluator uses) rather than with the
# raw `label`: the two only coincide when the indexer happens to map
# 0.0 -> 0 and 1.0 -> 1, which depends on the label frequencies.
# Both columns are collected in a single toPandas() call so that each row's
# label and prediction stay paired and only one Spark job is run.
results = predictions.select('indexedLabel', 'prediction').toPandas()

# Rows: true (indexed) class, columns: predicted class
confusion_matrix(results['indexedLabel'], results['prediction'])

array([[105,  36],
       [ 36, 126]])