# Spark ML Homework - Rain in Austrailia

In [1]:
# spark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType
from pyspark.sql.functions import *

# model building
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

# model tuning
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# model evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
schema = (StructType([
            StructField('Date', DateType(), True),
            StructField('Location', StringType(), True),
            StructField('MinTemp', DoubleType(), True),
            StructField('MaxTemp', DoubleType(), True),
            StructField('Rainfall', DoubleType(), True),
            StructField('Evaporation', DoubleType(), True),
            StructField('Sunshine', DoubleType(), True),
            StructField('WindGustDir', StringType(), True),
            StructField('WindGustSpeed', IntegerType(), True),
            StructField('WindDir9am', StringType(), True),
            StructField('WindDir3pm', StringType(), True),
            StructField('WindSpeed9am', IntegerType(), True),
            StructField('WindSpeed3pm', IntegerType(), True),
            StructField('Humidity9am', IntegerType(), True),
            StructField('Humidity3pm', IntegerType(), True),
            StructField('Pressure9am', DoubleType(), True),
            StructField('Pressure3pm', DoubleType(), True),
            StructField('Cloud9am', IntegerType(), True),
            StructField('Cloud3pm', IntegerType(), True),
            StructField('Temp9am', DoubleType(), True),
            StructField('Temp3pm', DoubleType(), True),
            StructField('RainToday', StringType(), True),
            StructField('RainTomorrow', StringType(), True)])
         )

In [4]:
rain_df = spark.read.csv('weatherAUS.csv', header=True, schema=schema)

## Remove Unnecessary Columns

In [5]:
# drop Date column
rain_df = rain_df.drop(col('Date'))

## Drop Null Values

In [6]:
# separate columns by datatype
num_cols = [colname for colname, dtype in rain_df.dtypes if dtype in ('int', 'double')]
categorical_cols = [colname for colname, dtype in rain_df.dtypes if dtype == 'string']
target_col = ['RainTomorrow']

In [7]:
# drop rows with null numerical col
rain_df = rain_df.dropna(subset=num_cols)

In [8]:
# drop NA in categorical col
for column in categorical_cols:
    rain_df = rain_df.filter(~(col(column) == 'NA'))

In [9]:
# drop NA in target col
rain_df = rain_df.filter(~(col('RainTomorrow') == 'NA'))

## Train Test Split

In [10]:
# split the data into train (80%) and test (20%)
train_rain, test_rain = rain_df.randomSplit([0.8, 0.2], seed=12345)

## One-Hot Encode Categorical Variables

In [11]:
# String index
indexOutput = [x + '_idx' for x in categorical_cols]
stringIndexer = StringIndexer(inputCols=categorical_cols, outputCols=indexOutput)

# One-hot encode
oheOutput = [x + '_ohe' for x in categorical_cols]
ohe = OneHotEncoder(inputCols=indexOutput, outputCols=oheOutput)

# create numerical version of target column
targetIndexer = StringIndexer(inputCol='RainTomorrow', outputCol='label')

## Model Building

In [12]:
# prepare feature vector with vector assembler
assemblerInput = oheOutput + num_cols
vecAssembler = VectorAssembler(inputCols=assemblerInput, outputCol='features')

In [13]:
# instantiate decision tree classifier
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label')

In [14]:
# create pipeline
pipeline = Pipeline(stages=[stringIndexer, ohe, targetIndexer, vecAssembler, dtc])

In [15]:
# evaluation metric for cross validation
binaryEval = BinaryClassificationEvaluator()

In [16]:
# create parameter grid for hyperparameter tuning 
paramGrid = (ParamGridBuilder()
             .addGrid(dtc.impurity, ['gini', 'entropy'])
             .addGrid(dtc.maxBins, [5, 10, 15])
             .addGrid(dtc.minInfoGain, [0.0, 0.2, 0.4])
             .addGrid(dtc.maxDepth, [3, 5, 7])
             .build()
            )

In [17]:
# cross validate with 3 folds
cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=binaryEval, 
                    numFolds=4, 
                    parallelism=4)
# fit the model
cvModel = cv.fit(train_rain)
# get best model 
bestModel = cvModel.bestModel

In [18]:
# get best model parameters 
bestImpurity = bestModel.stages[-1]._java_obj.getImpurity()
bestMaxBins = bestModel.stages[-1]._java_obj.getMaxBins()
bestMinInfoGain = bestModel.stages[-1]._java_obj.getMinInfoGain()
bestMaxDepth = bestModel.stages[-1]._java_obj.getMaxDepth()
# print best parameters
print(f'Best parameters:')
print(f'—— Impurity: \t{bestImpurity}')
print(f'—— MaxBins: \t{bestMaxBins}')
print(f'—— MinInfoGain: {bestMinInfoGain}')
print(f'—— MaxDepth: \t{bestMaxDepth}')

Best parameters:
—— Impurity: 	gini
—— MaxBins: 	5
—— MinInfoGain: 0.0
—— MaxDepth: 	3


In [19]:
# make predictions with CV model - TEST data
cvPredRainTest = cvModel.transform(test_rain)

# predictions - TRAIN data
cvPredRainTrain = cvModel.transform(train_rain)

## Model Evaluation

In [20]:
# Calculate and print the Area under ROC Curve
# and Area under Precision-Recall Curve scores
# for your training and test data sets

binaryEval.getMetricName()

'areaUnderROC'

In [21]:
binaryEval.setRawPredictionCol("rawPrediction")
trainAUROC = binaryEval.evaluate(cvPredRainTrain)
testAUROC = binaryEval.evaluate(cvPredRainTest)
print(f'Training auROC: {trainAUROC}\nTest auROC: {testAUROC}')

Training auROC: 1.0
Test auROC: 1.0


In [22]:
binaryEval.setMetricName('areaUnderPR')

BinaryClassificationEvaluator_6cde3c8b799a

In [23]:
trainAUPR = binaryEval.evaluate(cvPredRainTrain)
testAUPR = binaryEval.evaluate(cvPredRainTest)
print(f'Training auPR: {trainAUPR}\nTest auPR: {testAUPR}')

Training auPR: 1.0
Test auPR: 1.0
