In [1]:
# Import the SparkSession class
from pyspark.sql import SparkSession

# Create SparkSession object
spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('test') \
                    .getOrCreate()

# Read data from CSV file
flights = spark.read.csv("dataset/flights.csv",
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')


# View the first five records
flights.show(5)

# Check column data types
print(flights.dtypes)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

[('mon', 'int'), ('dom', 'int'), ('dow', 'int'), ('carrier', 'string'), ('flight', 'int'), ('org', 'string'), ('mile', 'int'), ('depart', 'double'), ('duration', 'int'), ('delay', 'int')]


# Flight duration model: Pipeline stages

You're going to create the stages for the flights duration model pipeline. You will use these in the next exercise to build a pipeline and to create a regression model.

In [2]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
# Convert categorical strings to index values
indexer = StringIndexer(inputCol="org", outputCol="org_idx")

# One-hot encode index values
onehot = OneHotEncoder(
    inputCols=['org_idx' , 'dow'],
    outputCols=['org_dummy' , 'dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy' , 'dow_dummy'], outputCol='features')

# A linear regression object
regression = LinearRegression(labelCol='duration')

# Flight duration model: Pipeline model

You're now ready to put those stages together in a pipeline.

You'll construct the pipeline and then train the pipeline on the training data. This will apply each of the individual stages in the pipeline to the training data in turn. None of the stages will be exposed to the testing data at all: there will be no leakage!

Once the entire pipeline has been trained it will then be used to make predictions on the testing data.

In [3]:
from pyspark.sql.functions import round
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)).drop('mile')
flights_train , flights_test = flights.randomSplit([0.8, 0.2], seed=42)

In [4]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

# SMS spam pipeline

You haven't looked at the SMS data for quite a while. Last time we did the following:

- split the text into tokens
- removed stop words
- applied the hashing trick
- converted the data from counts to IDF and
- trained a logistic regression model.

In [5]:
# from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
# from pyspark.ml.classification import LogisticRegression
# # Break text into tokens at non-word characters
# tokenizer = Tokenizer(inputCol='text', outputCol='words')

# # Remove stop words
# remover = StopWordsRemover(inputCol='words', outputCol='terms')

# # Apply the hashing trick and transform to TF-IDF
# hasher = HashingTF(inputCol='terms', outputCol="hash")
# idf = IDF(inputCol='hash', outputCol="features")

# # Create a logistic regression object and add everything to a pipeline
# logistic = LogisticRegression()
# pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

# Cross validating simple flight duration model

You've already built a few models for predicting flight duration and evaluated them with a simple train/test split. However, cross-validation provides a much better way to evaluate model performance.

In this exercise you're going to train a simple model for flight duration using cross-validation. Travel time is usually strongly correlated with distance, so using the km column alone should give a decent model.

In [6]:
flights_train.show()

+---+---+---+-------+------+---+------+--------+-----+------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|    km|
+---+---+---+-------+------+---+------+--------+-----+------+
|  0|  1|  2|     AA|     3|JFK|  12.0|     370|   11|3983.0|
|  0|  1|  2|     AA|    59|JFK|   7.0|     385|  -16|4162.0|
|  0|  1|  2|     AA|   150|SFO| 23.42|     325|   22|4352.0|
|  0|  1|  2|     AA|   154|ORD| 17.25|     135|   49|1395.0|
|  0|  1|  2|     AA|   181|JFK|  17.0|     379|  -10|3983.0|
|  0|  1|  2|     AA|   317|LGA|  9.92|     170|   -9|1180.0|
|  0|  1|  2|     AA|   325|ORD|  10.0|      70| NULL| 415.0|
|  0|  1|  2|     AA|   335|LGA| 14.58|     165|   -4|1180.0|
|  0|  1|  2|     AA|   336|ORD| 21.58|     115|   55|1180.0|
|  0|  1|  2|     AA|   346|ORD|  19.5|     130|   63|1180.0|
|  0|  1|  2|     AA|   366|ORD|  14.5|     135|   52|1180.0|
|  0|  1|  2|     AA|   386|ORD|  8.75|     120|    8|1180.0|
|  0|  1|  2|     AA|   392|ORD|  8.08|     120|    4|1180.0|
|  0|  1

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol="duration")
evaluator = RegressionEvaluator(labelCol="duration")

# Create a cross validator
cv = CrossValidator(estimator=Pipeline(stages=[indexer, onehot, assembler, regression]), estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

# Train and test model on multiple folds of the training data
cv = cv.fit(flights_train)

# NOTE: Since cross-valdiation builds multiple models, the fit() method can take a little while to complete.

# Cross validating flight duration model pipeline

The cross-validated model that you just built was simple, using km alone to predict duration.

Another important predictor of flight duration is the origin airport. Flights generally take longer to get into the air from busy airports. Let's see if adding this predictor improves the model!

In this exercise you'll add the org field to the model. However, since org is categorical, there's more work to be done before it can be included: it must first be transformed to an index and then one-hot encoded before being assembled with km and used to build the regression model. We'll wrap these operations up in a pipeline.

In [8]:
# Create an indexer for the org field
indexer = StringIndexer(inputCol="org", outputCol="org_idx")

# Create an one-hot encoder for the indexed org field
onehot = OneHotEncoder(inputCols=["org_idx"], outputCols=["org_dummy"])

# Assemble the km and one-hot encoded fields
assembler = VectorAssembler(inputCols=["km", "org_dummy"], outputCol="features")

# Create a pipeline and cross-validator.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(estimator=pipeline,
          estimatorParamMaps=params,
          evaluator=evaluator)

# Optimizing flights linear regression

Up until now you've been using the default hyper-parameters when building your models. In this exercise you'll use cross validation to choose an optimal (or close to optimal) set of model hyper-parameters.

In [9]:
# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0]) \
               .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])

# Build the parameter grid
params = params.build()
print('Number of models to be tested: ', len(params))

# Create cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

Number of models to be tested:  12


# Dissecting the best flight duration model

Now you're going to take a closer look at the pipeline, split out the stages and use it to make predictions on the testing data.

In [10]:
cv = cv.fit(flights_train)
predictions = cv.transform(flights_test)
# Get the best model from cross validation
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
best_model.stages[3].extractParamMap()

# Generate predictions on testing data using the best model then calculate RMSE
predictions = best_model.transform(flights_test)
print("RMSE =", evaluator.evaluate(predictions))

[StringIndexerModel: uid=StringIndexer_f74822d04096, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_d08bc69aee32, dropLast=true, handleInvalid=error, numInputCols=1, numOutputCols=1, VectorAssembler_cb88a8b41c43, LinearRegressionModel: uid=LinearRegression_8ac887bd3fa8, numFeatures=8]
RMSE = 11.06631665882969


# SMS spam optimised

The pipeline you built earlier for the SMS spam model used the default parameters for all of the elements in the pipeline. It's very unlikely that these parameters will give a particularly good model though. In this exercise you're going to run the pipeline for a selection of parameter values. We're going to do this in a systematic way: the values for each of the parameters will be laid out on a grid and then pipeline will systematically run across each point in the grid.

In this exercise you'll set up a parameter grid which can be used with cross validation to choose a good set of parameters for the SMS spam classifier.

In [11]:
# hasher = HashingTF()
# logistic = LogisticRegression()
# # Create parameter grid
# params = ParamGridBuilder()

# # Add grid for hashing trick parameters
# params = params.addGrid(hasher.numFeatures, [1024, 4096, 16384]) \
#                .addGrid(hasher.binary, [True, False])

# # Add grid for logistic regression parameters
# params = params.addGrid(logistic.regParam, [0.01, 0.1, 1.0 , 10.0]) \
#                .addGrid(logistic.elasticNetParam, [0.0, 0.5,  1.0 ])

# # Build parameter grid
# params = params.build()

# How many models for grid search?

```
params = ParamGridBuilder().addGrid(hasher.numFeatures, [1024, 4096, 16384]) \
                           .addGrid(hasher.binary, [True, False]) \
                           .addGrid(logistic.regParam, [0.01, 0.1, 1.0, 10.0]) \
                           .addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0]) \
                           .build()

cv = CrossValidator(..., estimatorParamMaps=params, numFolds=5)
```

- 3 * 2 * 4 * 3 * 5 (for 5 folds) = 360

# Delayed flights with Gradient-Boosted Trees

You've previously built a classifier for flights likely to be delayed using a Decision Tree. In this exercise you'll compare a Decision Tree model to a Gradient-Boosted Trees model.

In [12]:
indexer1 = StringIndexer(inputCol="org", outputCol="org_idx")
indexer2 = StringIndexer(inputCol="carrier", outputCol="carrier_idx")

# One-hot encode index values
onehot = OneHotEncoder(
    inputCols=['org_idx' , 'dow','carrier_idx'],
    outputCols=['org_dummy' , 'dow_dummy','carrier_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy' , 'dow_dummy','carrier_dummy'], outputCol='features')


In [13]:
from pyspark.sql.functions import when, col, isnull
condition = ((col('delay') < 10) | (isnull(col('delay'))))
flights = flights.withColumn('label', when(condition, 0).otherwise(1))
pipeline = Pipeline(stages=[indexer1, indexer2, onehot, assembler])
pipeline_model = pipeline.fit(flights)
flights = pipeline_model.transform(flights)
flights_train , flights_test = flights.randomSplit([0.8, 0.2], seed=42)
flights.show(3)


+---+---+---+-------+------+---+------+--------+-----+------+-----+-------+-----------+-------------+-------------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|    km|label|org_idx|carrier_idx|    org_dummy|    dow_dummy|carrier_dummy|            features|
+---+---+---+-------+------+---+------+--------+-----+------+-----+-------+-----------+-------------+-------------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| NULL|3465.0|    0|    2.0|        6.0|(7,[2],[1.0])|    (6,[],[])|(8,[6],[1.0])|(22,[0,3,20],[346...|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30| 509.0|    1|    0.0|        0.0|(7,[0],[1.0])|(6,[2],[1.0])|(8,[0],[1.0])|(22,[0,1,10,14],[...|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8| 542.0|    0|    1.0|        0.0|(7,[1],[1.0])|(6,[4],[1.0])|(8,[0],[1.0])|(22,[0,2,12,14],[...|
+---+---+---+-------+------+---+------+--------+-----+------+-----+-------+-----------+-------

In [14]:
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(tree.transform(flights_test)))
print(evaluator.evaluate(gbt.transform(flights_test)))

# Find the number of trees and the relative importance of features
print(gbt.getNumTrees)
print(gbt.featureImportances)

0.528298086526482
0.618553739070929
20
(22,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21],[0.3300171545838557,0.020393196663790546,0.028112408050702156,0.054261555010780116,0.046988953112125684,0.027490945231484036,0.02476292523639615,0.03793284453790415,0.038765884928593675,0.043579263909194005,0.015277357405699238,0.022537491409064116,0.03673663187204305,0.0667853205028239,0.012991268759016822,0.029851890027711932,0.03297375147574418,0.032638700570877026,0.01584451611134136,0.02184459942366945,0.039029031715426,0.02118430946175651])


# Delayed flights with a Random Forest

In this exercise you'll bring together cross validation and ensemble methods. You'll be training a Random Forest classifier to predict delayed flights, using cross validation to choose the best values for model parameters.

In [15]:
from pyspark.ml.classification import RandomForestClassifier
# Create a random forest classifier
forest = RandomForestClassifier()

# Create a parameter grid
params = ParamGridBuilder() \
            .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
            .addGrid(forest.maxDepth, [2, 5, 10]) \
            .build()

# Create a binary classification evaluator
evaluator = BinaryClassificationEvaluator()

# Create a cross-validator
cv = CrossValidator(estimator=forest, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

# Evaluating Random Forest

In this final exercise you'll be evaluating the results of cross-validation on a Random Forest model.

In [18]:
cv = cv.fit(flights_train)
# Average AUC for each parameter combination in grid
print(cv.avgMetrics)

# Average AUC for the best model
print(cv.avgMetrics)

# What's the optimal parameter value for maxDepth?
print(cv.bestModel.explainParam('maxDepth'))

# What's the optimal parameter value for featureSubsetStrategy?
print(cv.bestModel.explainParam('featureSubsetStrategy'))

# AUC for best model on testing data
print(evaluator.evaluate(cv.transform(flights_test)))


[0.5604117314968236, 0.5904127752907027, 0.6002863085788851, 0.580441259713781, 0.5914494930911941, 0.6021460908053026, 0.5836791565416339, 0.5934243222985572, 0.6019920635465991, 0.5836791565416339, 0.5934243222985572, 0.6019920635465991]
[0.5604117314968236, 0.5904127752907027, 0.6002863085788851, 0.580441259713781, 0.5914494930911941, 0.6021460908053026, 0.5836791565416339, 0.5934243222985572, 0.6019920635465991, 0.5836791565416339, 0.5934243222985572, 0.6019920635465991]
maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30]. (default: 5, current: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (u