Spark ML
-------

Read training and test data. In this case test data is labeled as well (we will generate our label based on the `arrdelay` field) 

In [15]:
training = sqlContext.read.parquet("data/training.parquet")
test = sqlContext.read.parquet("data/test.parquet")


In [16]:
test.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- deptime: integer (nullable = true)
 |-- crsdeptime: integer (nullable = true)
 |-- arrtime: integer (nullable = true)
 |-- crsarrtime: integer (nullable = true)
 |-- actualelapsetime: integer (nullable = true)
 |-- crselapsetime: integer (nullable = true)
 |-- airtime: integer (nullable = true)
 |-- arrdelay: integer (nullable = true)
 |-- depdelay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- taxiin: integer (nullable = true)
 |-- taxiout: integer (nullable = true)
 |-- cancelled: integer (nullable = true)
 |-- diverted: integer (nullable = true)
 |-- carrierdelay: integer (nullable = true)
 |-- weatherdelay: integer (nullable = true)
 |-- nasdelay: integer (nullable = true)
 |-- securitydelay: integer (nullable = true)
 |-- lateaircraftdelay: integer (nullable = true)



In [18]:
test.first()

Row(year=2006, month=2, dayofmonth=21, dayofweek=2, deptime=902, crsdeptime=905, arrtime=1027, crsarrtime=1030, actualelapsetime=205, crselapsetime=205, airtime=190, arrdelay=-3, depdelay=-3, distance=1162, taxiin=7, taxiout=8, cancelled=0, diverted=0, carrierdelay=0, weatherdelay=0, nasdelay=0, securitydelay=0, lateaircraftdelay=0)

Generate label column for the training data

In [19]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

is_late = udf(lambda delay: 1.0 if delay > 0 else 0.0, DoubleType())
training = training.withColumn("is_late",is_late(training.arrdelay))


Create and fit Spark ML model

In [21]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Create feature vectors. Ignore arr_delay and it's derivate, is_late
feature_assembler = VectorAssembler(
    inputCols=[x for x in training.columns if x not in ["is_late","arrdelay"]],
    outputCol="features")

reg = LogisticRegression().setParams(
    maxIter = 100,
    labelCol="is_late",
    predictionCol="prediction")

model = Pipeline(stages=[feature_assembler, reg]).fit(training)


In [20]:
[x for x in training.columns if x not in ["is_late","arrdelay"]]

['year',
 'month',
 'dayofmonth',
 'dayofweek',
 'deptime',
 'crsdeptime',
 'arrtime',
 'crsarrtime',
 'actualelapsetime',
 'crselapsetime',
 'airtime',
 'depdelay',
 'distance',
 'taxiin',
 'taxiout',
 'cancelled',
 'diverted',
 'carrierdelay',
 'weatherdelay',
 'nasdelay',
 'securitydelay',
 'lateaircraftdelay']

Predict whether the aircraft will be late

In [22]:
predicted = model.transform(test)

In [23]:
predicted.show()

+----+-----+----------+---------+-------+----------+-------+----------+----------------+-------------+-------+--------+--------+--------+------+-------+---------+--------+------------+------------+--------+-------------+-----------------+--------------------+--------------------+--------------------+----------+
|year|month|dayofmonth|dayofweek|deptime|crsdeptime|arrtime|crsarrtime|actualelapsetime|crselapsetime|airtime|arrdelay|depdelay|distance|taxiin|taxiout|cancelled|diverted|carrierdelay|weatherdelay|nasdelay|securitydelay|lateaircraftdelay|            features|       rawPrediction|         probability|prediction|
+----+-----+----------+---------+-------+----------+-------+----------+----------------+-------------+-------+--------+--------+--------+------+-------+---------+--------+------------+------------+--------+-------------+-----------------+--------------------+--------------------+--------------------+----------+
|2006|    2|        21|        2|    902|       905|   1027| 

In [24]:
predicted.select("is_late", "prediction").show()

AnalysisException: u"cannot resolve 'is_late' given input columns: [crselapsetime, dayofweek, taxiout, month, probability, carrierdelay, prediction, nasdelay, dayofmonth, lateaircraftdelay, rawPrediction, crsdeptime, airtime, year, securitydelay, cancelled, arrdelay, weatherdelay, actualelapsetime, arrtime, diverted, distance, features, depdelay, crsarrtime, deptime, taxiin];"

Check model performance

In [25]:
predicted = predicted.withColumn("is_late",is_late(predicted.arrdelay))
predicted.select("is_late", "prediction").show()

+-------+----------+
|is_late|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    1.0|       1.0|
|    1.0|       0.0|
|    1.0|       1.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
+-------+----------+
only showing top 20 rows



In [26]:
predicted.crosstab("is_late","prediction").show()

+------------------+----+----+
|is_late_prediction| 1.0| 0.0|
+------------------+----+----+
|               1.0|1448|1110|
|               0.0|  62|2805|
+------------------+----+----+

