In [13]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [2]:
# Reuse the already-running SparkContext if there is one; otherwise start a new one.
sc = SparkContext.getOrCreate()

In [3]:
# Wrap the SparkContext in a SQLContext; its reader is used to load the CSV into a DataFrame.
sqlContext = SQLContext(sc)

In [4]:
# Load the daily weather CSV into a Spark DataFrame.
#   header='true'      -> the first line of the file supplies the column names
#   inferschema='true' -> column types are inferred from the data
# NOTE(review): the path below is absolute and machine-specific; point it at
# your own copy of daily_weather.csv before running.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('C:/Users/leicheng/Downloads/daily_weather.csv')
)

In [5]:
# List the column names of the loaded DataFrame.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [6]:
# Columns used as model inputs: the 9am measurements, excluding both humidity
# columns ('relative_humidity_9am' and 'relative_humidity_3pm').
featurecolumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [7]:
# Remove the 'number' column; it is neither one of the feature columns nor the prediction target.
df = df.drop('number')

In [8]:
# Discard every row that contains a missing value.
df = df.na.drop()

In [9]:
# Shape of the cleaned DataFrame as (row count, column count).
(df.count(), len(df.columns))

(1064, 10)

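Step 4. Create categorical variable. Let's create a categorical variable to denote if the humidity is not low. If the value is less than 25%, then we want the categorical value to be 0, otherwise the categorical value should be 1. We can create this categorical variable as a column in a DataFrame using Binarizer. Binarizer maps values strictly greater than the threshold to 1.0 and all other values to 0.0, so the threshold is set just below 25 (24.9999) to make a humidity of exactly 25% map to 1: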

In [15]:
# Binarizer outputs 1.0 for values strictly greater than the threshold and 0.0
# otherwise. The threshold sits just below 25 so that a humidity of exactly
# 25% is labelled 1.0 ("not low").
binarizer = Binarizer(
    threshold=24.9999,
    inputCol="relative_humidity_3pm",
    outputCol="label",
)

# Append the binary "label" column to the cleaned data.
binarizedDF = binarizer.transform(df)

In [18]:
# Show the first 4 rows of the 3pm humidity next to the derived label.
binarizedDF.select("relative_humidity_3pm", "label").show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



The first row's humidity value is greater than 25% and the label is 1. The other humidity values are less than 25% and have labels equal to 0.

Step 5. Aggregate features. Let's aggregate the features we will use to make predictions into a single column:

In [19]:
# Combine the individual feature columns into a single vector column named "features".
assember = VectorAssembler(
    inputCols=featurecolumns,
    outputCol="features",
)

# Add the "features" column to the labelled data.
assembled = assember.transform(binarizedDF)

Step 6. Split training and test data. We can split the data by calling randomSplit():

In [20]:
# Split into roughly 80% training and 20% test rows; the fixed seed keeps the split repeatable.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

#The first argument is a list of weights that determines how many parts the data is split into and the approximate relative size of each part. Here it specifies two sets of roughly 80% and 20%. Normally, the seed would not be specified, but we use a specific value here so that everyone will get the same split and therefore the same decision tree.

In [22]:
# Row counts of the two splits as (training rows, test rows).
(trainingData.count(), testData.count())

(846, 218)

NOTE: if you get values (859, 205), then your Cloudera VM is most likely configured to use only 1 CPU. You need to reconfigure the VM to use 2 CPUs as described in the reading Instructions for Changing the Number of Cloudera VM CPUs.

Step 7. Create and train decision tree. Let's create the decision tree:

In [24]:
# Configure the decision tree: predict "label" from "features", stopping at a
# depth of 5 or when a node would hold fewer than 20 samples, and using Gini
# impurity to choose splits.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini",
)

The labelCol argument is the column we are trying to predict, featuresCol specifies the aggregated features column, maxDepth is the stopping criterion for tree induction based on the maximum depth of the tree, minInstancesPerNode is the stopping criterion for tree induction based on the minimum number of samples in a node, and impurity is the impurity measure used to split nodes.

In [25]:
# Build a pipeline whose only stage is the decision tree.
pipeline = Pipeline(stages=[dt])
# Fit the pipeline on the training split to obtain the trained model.
model = pipeline.fit(trainingData)

In [26]:
# Apply the trained model to the held-out test split.
predictions = model.transform(testData)

In [30]:
# Compare the predicted class with the true label for the first 20 test rows.
predictions.select("prediction", "label").show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows

