In [0]:
from pyspark.sql.types import IntegerType, FloatType, DateType
import pyspark.sql.functions as F
from pyspark.mllib.linalg import Vectors
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
SEED = 42
DEBUG = True # uses smaller dataset for testing
DISPLAY_LIMIT = 10

In [0]:
airline_df = spark.read.format("delta").load("dbfs:/user/airline/table")

In [0]:
airline_df.printSchema()

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- CRS_ELAPSED_TIME: string (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: strin

In [0]:
airline_df.count()

Out[65]: 43051239

In [0]:
if DEBUG:
    airline_df = airline_df.limit(400_000)

In [0]:
# TODO: Figure out what to do with nulls in delay columns

In [0]:
# cast columns

def cast_types(airline_df):
     return airline_df.withColumn("FL_DATE", airline_df.FL_DATE.cast(DateType())) \
                   .withColumn("OP_CARRIER_FL_NUM", airline_df.OP_CARRIER_FL_NUM.cast(IntegerType())) \
                   .withColumn("CRS_DEP_TIME", airline_df.CRS_DEP_TIME.cast(IntegerType())) \
                   .withColumn("DEP_TIME", airline_df.DEP_TIME.cast(FloatType())) \
                   .withColumn("DEP_DELAY", airline_df.DEP_DELAY.cast(FloatType())) \
                   .withColumn("TAXI_OUT", airline_df.TAXI_OUT.cast(FloatType())) \
                   .withColumn("WHEELS_OFF", airline_df.WHEELS_OFF.cast(FloatType())) \
                   .withColumn("WHEELS_ON", airline_df.WHEELS_ON.cast(FloatType())) \
                   .withColumn("TAXI_IN", airline_df.TAXI_IN.cast(FloatType())) \
                   .withColumn("CRS_ARR_TIME", airline_df.CRS_ARR_TIME.cast(IntegerType())) \
                   .withColumn("ARR_TIME", airline_df.ARR_TIME.cast(FloatType())) \
                   .withColumn("ARR_DELAY", airline_df.ARR_DELAY.cast(FloatType())) \
                   .withColumn("CANCELLED", airline_df.CANCELLED.cast(FloatType())) \
                   .withColumn("DIVERTED", airline_df.DIVERTED.cast(FloatType())) \
                   .withColumn("CRS_ELAPSED_TIME", airline_df.CRS_ELAPSED_TIME.cast(FloatType())) \
                   .withColumn("ACTUAL_ELAPSED_TIME", airline_df.ACTUAL_ELAPSED_TIME.cast(FloatType())) \
                   .withColumn("AIR_TIME", airline_df.AIR_TIME.cast(FloatType())) \
                   .withColumn("DISTANCE", airline_df.DISTANCE.cast(FloatType())) \
                   .withColumn("CARRIER_DELAY", airline_df.CARRIER_DELAY.cast(FloatType())) \
                   .withColumn("WEATHER_DELAY", airline_df.WEATHER_DELAY.cast(FloatType())) \
                   .withColumn("NAS_DELAY", airline_df.NAS_DELAY.cast(FloatType())) \
                   .withColumn("SECURITY_DELAY", airline_df.SECURITY_DELAY.cast(FloatType())) \
                   .withColumn("LATE_AIRCRAFT_DELAY", airline_df.LATE_AIRCRAFT_DELAY.cast(FloatType()))
    
airline_df = cast_types(airline_df)

In [0]:
def feature_engineering(df):
    return (df
            .withColumn("FL_YEAR", F.year(F.col("FL_DATE")) - 2009)
            .withColumn("FL_MONTH", F.month(F.col("FL_DATE")))
            .withColumn("FL_DAYOFMONTH", F.dayofmonth(F.col("FL_DATE")))
            .withColumn("FL_DAYOFWEEK", F.dayofweek(F.col("FL_DATE")))
           )
    
airline_df = feature_engineering(airline_df)

In [0]:

# added only some for now until it is figured out how to handle nulls
numeric_features = ["OP_CARRIER_FL_NUM", "CRS_DEP_TIME"]

categorical_features = ["ORIGIN", "DEST", "OP_CARRIER"]

# month, dayofmonth and dayofweek could be numeric or categorical
date_columns = ["FL_MONTH", "FL_DAYOFMONTH", "FL_DAYOFWEEK"]
numeric_features += date_columns # adding to numeric for now

target_col = "CANCELLED"

airline_df = airline_df.withColumn("label", F.col(target_col))

In [0]:
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_INDEXED", handleInvalid="error")
    for col in categorical_features
]

encoders = [
    OneHotEncoder(inputCols=[indexer.getOutputCol()], outputCols=[f"{indexer.getOutputCol()}_ENCODED"], handleInvalid="error")
    for indexer in indexers
]

assembler = VectorAssembler(inputCols=[encoder.getOutputCols()[0] for encoder in encoders] + numeric_features, outputCol="features")

stages = indexers + encoders + [assembler]

In [0]:
# it makes sense to split the data before fitting the pipeline
# the other scenario would not be a good ML practice

(train_df, test_df) = airline_df.randomSplit([0.7, 0.3], seed=SEED)

In [0]:
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(train_df)

In [0]:
train_df = pipeline_model.transform(train_df)
test_df = pipeline_model.transform(test_df)

In [0]:
# low number of iterations for testing
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=3, regParam=0.1)

In [0]:
lr_model = lr.fit(train_df)

In [0]:
predictions = lr_model.transform(test_df)

In [0]:
display(predictions.limit(DISPLAY_LIMIT))

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FL_YEAR,FL_MONTH,FL_DAYOFMONTH,FL_DAYOFWEEK,label,ORIGIN_INDEXED,DEST_INDEXED,OP_CARRIER_INDEXED,ORIGIN_INDEXED_ENCODED,DEST_INDEXED_ENCODED,OP_CARRIER_INDEXED_ENCODED,features,rawPrediction,probability,prediction
2009-01-01,9E,2108,OKC,MSP,700,650.0,-10.0,15.0,705.0,841.0,19.0,915,900.0,-15.0,0.0,,0.0,135.0,130.0,96.0,695.0,,,,,,0,1,1,5,0.0,62.0,14.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(62), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(62, 292, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2108.0, 700.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(4.023346000160995, -4.023346000160995))","Map(vectorType -> dense, length -> 2, values -> List(0.9824215366495133, 0.01757846335048674))",0.0
2009-01-01,9E,2115,MSP,ALO,2245,2245.0,0.0,31.0,2316.0,2346.0,7.0,2343,2353.0,10.0,0.0,,0.0,58.0,68.0,30.0,166.0,,,,,,0,1,1,5,0.0,14.0,276.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(276), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(14, 554, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2115.0, 2245.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(2.4229784226192077, -2.4229784226192077))","Map(vectorType -> dense, length -> 2, values -> List(0.9185628238648527, 0.08143717613514734))",0.0
2009-01-01,9E,2120,STL,MSP,1610,1552.0,-18.0,7.0,1559.0,1705.0,16.0,1800,1721.0,-39.0,0.0,,0.0,110.0,89.0,66.0,449.0,,,,,,0,1,1,5,0.0,30.0,14.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(30), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(30, 292, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2120.0, 1610.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.91218282894715, -3.91218282894715))","Map(vectorType -> dense, length -> 2, values -> List(0.9803952289744075, 0.01960477102559255))",0.0
2009-01-01,9E,2122,CLE,MSP,1343,1338.0,-5.0,11.0,1349.0,1431.0,8.0,1458,1439.0,-19.0,0.0,,0.0,135.0,121.0,102.0,622.0,,,,,,0,1,1,5,0.0,34.0,14.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(34), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(34, 292, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2122.0, 1343.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.8885166096913455, -3.8885166096913455))","Map(vectorType -> dense, length -> 2, values -> List(0.979935144920702, 0.020064855079298005))",0.0
2009-01-01,9E,2125,MSP,CLE,1015,1014.0,-1.0,23.0,1037.0,1259.0,6.0,1312,1305.0,-7.0,0.0,,0.0,117.0,111.0,82.0,622.0,,,,,,0,1,1,5,0.0,14.0,34.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(34), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(14, 312, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2125.0, 1015.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.941171183405511, -3.941171183405511))","Map(vectorType -> dense, length -> 2, values -> List(0.9809447070017239, 0.01905529299827613))",0.0
2009-01-01,9E,2126,IND,FLL,715,712.0,-3.0,13.0,725.0,945.0,3.0,1005,948.0,-17.0,0.0,,0.0,170.0,156.0,140.0,1005.0,,,,,,0,1,1,5,0.0,48.0,27.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(48), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(27), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(48, 305, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2126.0, 715.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.995189587610272, -3.995189587610272))","Map(vectorType -> dense, length -> 2, values -> List(0.9819286278369985, 0.018071372163001476))",0.0
2009-01-01,9E,2127,FLL,IND,1115,1110.0,-5.0,17.0,1127.0,1346.0,3.0,1412,1349.0,-23.0,0.0,,0.0,177.0,159.0,139.0,1005.0,,,,,,0,1,1,5,0.0,28.0,47.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(28), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(47), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(28, 325, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2127.0, 1115.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(4.050870829010496, -4.050870829010496))","Map(vectorType -> dense, length -> 2, values -> List(0.9828906172723443, 0.01710938272765572))",0.0
2009-01-01,9E,2133,MSP,PIT,1320,1320.0,0.0,26.0,1346.0,1615.0,4.0,1628,1619.0,-9.0,0.0,,0.0,128.0,119.0,89.0,726.0,,,,,,0,1,1,5,0.0,14.0,49.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(14), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(49), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(14, 327, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2133.0, 1320.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.9967657758445942, -3.9967657758445942))","Map(vectorType -> dense, length -> 2, values -> List(0.9819565757430068, 0.018043424256993235))",0.0
2009-01-01,9E,2139,IND,SAT,1655,1655.0,0.0,21.0,1716.0,1840.0,7.0,1835,1847.0,12.0,0.0,,0.0,160.0,172.0,144.0,986.0,,,,,,0,1,1,5,0.0,48.0,44.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(48), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(44), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(48, 322, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2139.0, 1655.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.9630597040366866, -3.9630597040366866))","Map(vectorType -> dense, length -> 2, values -> List(0.9813495731041358, 0.01865042689586416))",0.0
2009-01-01,9E,2141,DCA,MSN,1105,1100.0,-5.0,10.0,1110.0,1204.0,4.0,1218,1208.0,-10.0,0.0,,0.0,133.0,128.0,114.0,707.0,,,,,,0,1,1,5,0.0,21.0,83.0,9.0,"Map(vectorType -> sparse, length -> 278, indices -> List(21), values -> List(1.0))","Map(vectorType -> sparse, length -> 278, indices -> List(83), values -> List(1.0))","Map(vectorType -> sparse, length -> 18, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 579, indices -> List(21, 361, 565, 574, 575, 576, 577, 578), values -> List(1.0, 1.0, 1.0, 2141.0, 1105.0, 1.0, 1.0, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(3.7357840024646625, -3.7357840024646625))","Map(vectorType -> dense, length -> 2, values -> List(0.9767013157930974, 0.02329868420690262))",0.0


In [0]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

Out[162]: 0.742386107443452