In [51]:
# Data import using Spark ML
from pyspark.sql import SQLContext
#from pyspark.sql.types import *
import pandas as pd
from pyspark.ml import Pipeline
#from pyspark.ml.feature import StringIndexer, VectorIndexer

In [52]:
# Load the daily weather observations; the first CSV column becomes the row index.
# (A comma is read_csv's default separator, so it is not passed explicitly.)
f = pd.read_csv("daily_weather.csv", index_col=0)

# (number of samples, number of columns)
f.shape

(1095, 10)

In [53]:
# Remove the 9am humidity reading from `f` in place.
# This mutates the frame loaded above: re-running this cell without reloading
# `f` fails, because the column no longer exists.
del f['relative_humidity_9am']

In [54]:
def functionG(row, threshold=25):
    """Return the binary low-humidity label for one observation.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'relative_humidity_3pm'
        (e.g. a pandas row passed by ``DataFrame.apply(..., axis=1)``).
    threshold : number, optional
        Humidity value below which the observation counts as "low
        humidity". Defaults to 25, the cut-off this notebook uses.

    Returns
    -------
    int
        1 if the 3pm relative humidity is strictly below ``threshold``,
        otherwise 0.

    Notes
    -----
    A missing (NaN) humidity compares as not-below the threshold and is
    therefore labelled 0, exactly as any other non-low reading.
    """
    humidity = row['relative_humidity_3pm']
    return 1 if humidity < threshold else 0

In [55]:
# Build the target column: apply functionG to every row (axis=1) so each
# sample gets 1 for low humidity and 0 otherwise, then store it as 'label'.
f['label'] = f.apply(functionG, axis=1)

# Number of samples in class 1 (low humidity): the labels are 0/1, so
# their sum is the class-1 count.
f['label'].sum()

548

In [56]:
# Summary statistics for the two rain-related measurements only.
f.loc[:, ['rain_accumulation_9am', 'rain_duration_9am']].describe()

Unnamed: 0,rain_accumulation_9am,rain_duration_9am
count,1089.0,1092.0
mean,0.203079,294.108052
std,1.593952,1598.078779
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,24.02,17704.0


In [57]:
# Remove the 3pm humidity reading from `f` in place. The 'label' column was
# already derived from it via functionG; presumably it is dropped so the
# target's source value cannot end up among the features -- confirm intent.
# Re-running this cell without reloading `f` fails (column already gone).
del f['relative_humidity_3pm']

In [58]:
# Show the columns that remain after the two humidity readings were removed.
f.columns

Index([u'air_pressure_9am', u'air_temp_9am', u'avg_wind_direction_9am',
       u'avg_wind_speed_9am', u'max_wind_direction_9am', u'max_wind_speed_9am',
       u'rain_accumulation_9am', u'rain_duration_9am', u'label'],
      dtype='object')

In [59]:
#from pyspark.mllib.linalg import DenseVector 
# Names of the columns used as model features (everything except 'label').
cols = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [60]:
#from pyspark.mllib.linalg import Vectors, VectorUDT

#w = pd.DataFrame()
#w['fea'] = f[cols].apply(lambda x:','.join(x.map(str)), axis=1)

#w['fea'] = [ f[c] for c in cols ]
#w.head(10)

In [61]:
from pyspark.ml.classification import DecisionTreeClassifier

In [62]:
# Work on an independent (deep, which is the default) copy so that later
# steps do not touch `f`.
data = f.copy()

# Peek at the first two rows.
data.head(2)

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,label
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,0
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,1


In [63]:
# `sc` is not defined in the visible cells -- presumably the SparkContext
# supplied by the PySpark kernel; confirm before running elsewhere.
sqlContext = SQLContext(sc)

# Hand Spark only the rows with no missing value in any column
# (axis=0 / how='any' are dropna's defaults, spelled out for the reader).
dataDF = sqlContext.createDataFrame(data.dropna(axis=0, how='any'))

# First two rows, returned as a list of Row objects.
dataDF.take(2)

[Row(air_pressure_9am=918.0600000000087, air_temp_9am=74.82200000000041, avg_wind_direction_9am=271.1, avg_wind_speed_9am=2.080354199999768, max_wind_direction_9am=295.3999999999998, max_wind_speed_9am=2.863283199999908, rain_accumulation_9am=0.0, rain_duration_9am=0.0, label=0),
 Row(air_pressure_9am=917.3476881177097, air_temp_9am=71.40384263106537, avg_wind_direction_9am=101.93517935618372, avg_wind_speed_9am=2.4430092157340217, max_wind_direction_9am=140.471548471125, max_wind_speed_9am=3.533323601610624, rain_accumulation_9am=0.0, rain_duration_9am=0.0, label=1)]

# Features Column

In [64]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

# Pack the individual feature columns listed in `cols` into one vector
# column named "features".
assembler = VectorAssembler(
    inputCols=cols,
    outputCol="features",
)
assembled = assembler.transform(dataDF)

# Display the resulting schema.
assembled

DataFrame[air_pressure_9am: double, air_temp_9am: double, avg_wind_direction_9am: double, avg_wind_speed_9am: double, max_wind_direction_9am: double, max_wind_speed_9am: double, rain_accumulation_9am: double, rain_duration_9am: double, label: bigint, features: vector]

In [65]:
# Index the "label" column into "indexedLabel". The indexer is fitted on the
# full assembled dataset (before the train/test split below).
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexedLabel")
    .fit(assembled)
)

# Automatically identify categorical features and index them.
# maxCategories=4 means a feature with more than 4 distinct values is
# treated as continuous.
featureIndexer = (
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
    .fit(assembled)
)

# Test and Training Data

In [66]:
# Random 70% / 30% split into training and held-out test sets; the fixed seed
# is passed so the split can be repeated.
trainingData, testData = assembled.randomSplit([0.7, 0.3], seed=1234)

# Decision Tree in Spark

In [67]:
# Decision tree estimator operating on the indexed label / feature columns.
dt = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=20,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=10,
    impurity="gini",
)

# Pipeline: the two already-fitted indexers followed by the tree.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Fit the pipeline on the training split.
model = pipeline.fit(trainingData)

# Score the held-out test split.
predictions = model.transform(testData)

# To inspect a few scored rows:
# predictions.select("prediction", "indexedLabel", "features").show(5)

In [68]:
# Evaluate the predictions currently held in `predictions`.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the predicted class with the indexed true label.
# NOTE(review): the number printed as "Accuracy" is whatever the evaluator's
# "precision" metric returns -- confirm against the evaluator documentation
# for the Spark version in use that this is the overall accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexedLabel",
    metricName="precision",
)

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % accuracy)

Accuracy = 0.801325 


In [69]:
# The classifier is the last of the three pipeline stages
# (label indexer, feature indexer, classifier).
# NOTE(review): `model` is rebound by the Naive Bayes cell further down, so
# this only yields the decision tree when run right after the tree is trained.
treeModel = model.stages[-1]

# Summary only (depth and node count), not the full tree.
print(treeModel)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_41c4bdde28636fc5fc48) of depth 5 with 33 nodes


# Naive Bayes in Spark

In [70]:
# Naive Bayes
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Naive Bayes estimator on the same indexed label / feature columns that the
# decision tree uses.
dtnb = NaiveBayes(featuresCol="indexedFeatures", labelCol="indexedLabel")

# Chain the two fitted indexers and the Naive Bayes estimator in a Pipeline.
# NOTE(review): `pipeline`, `model`, `predictions`, `evaluator` and `accuracy`
# are rebound here, replacing the decision-tree objects of the same names.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dtnb])

# Fit on the training split.
model = pipeline.fit(trainingData)

# Score the held-out test split and show a few scored rows.
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)

# Same evaluation as for the decision tree: predicted class vs. indexed label.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexedLabel",
    metricName="precision",
)

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % accuracy)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|[907.990000000002...|
|       1.0|         1.0|[908.420000000007...|
|       1.0|         1.0|[908.970000000004...|
|       0.0|         1.0|[913.060000000003...|
|       0.0|         0.0|[913.633267677041...|
+----------+------------+--------------------+
only showing top 5 rows

Accuracy = 0.569536 


In [71]:
# Data cleaning (removing missing values) - check summary statistics

In [72]:
# Data Cleaning (replace with mean value) - check summary statistics

In [None]:
# Summary Statistics


In [130]:
# Plots:

# Histogram
# Scatter Plot
# Bar Plot
# Box Plot


In [None]:
# Data preparation (adding new columns)


In [None]:
# Declaring Features and Target Class


In [None]:
# Random Data splitting into test and train


In [None]:
# Training the Decision Tree Classifier


In [None]:
# Predicting 


In [None]:
# Check Accuracy and Confusion Matrix of Decision Tree

In [None]:
# Training the Naive Bayes Classifier


In [None]:
# Predicting

In [None]:
# Check Accuracy and Confusion Matrix of Naive Bayes