In [None]:
# Static configuration: name of the Cloud Storage bucket used in every gs:// path below.
BUCKET = "elite-caster-125113"

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
# Create the SparkSession for this notebook, or reuse the one that already exists.
spark = (
    SparkSession.builder
    .appName("Lgistic regression w/ Spark ML")
    .getOrCreate()
)

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.rdd import RDD
from pyspark.sql import Row

## Creating a Training Dataset

In [None]:
# Load the train-day lookup CSV (its first line is a header row) into a DataFrame.
traindays: DataFrame = (
    spark.read
    .option("header", "true")
    .csv('gs://{}/flights/trainday.csv'.format(BUCKET))
)

In [None]:
# Show the column names and types of the train-day DataFrame.
traindays.printSchema()

In [None]:
# Register the DataFrame as a temp view so Spark SQL queries can refer to it as 'traindays'.
traindays.createOrReplaceTempView('traindays')

In [None]:
# Peek at five rows of the view through Spark SQL.
spark.sql("SELECT * FROM traindays LIMIT 5").show()

In [None]:
from pyspark.sql.types import StringType, FloatType, StructType, StructField

# Column names, in order, that are applied to the header-less flights CSV files.
# Kept as one comma-separated string; it is split into a schema further down.
header = (
    'FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,'
    'ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,'
    'DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,'
    'DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,'
    'CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,'
    'CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,'
    'DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,'
    'ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'
)

print(header)

def get_structfield(colname: str) -> StructField:
    """Build the nullable schema field for one flights column.

    Args:
        colname: Name of the column.

    Returns:
        A nullable StructField named ``colname``: FloatType for ARR_DELAY,
        DEP_DELAY, DISTANCE and TAXI_OUT, StringType for every other column.
    """
    float_columns = ('ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'TAXI_OUT')
    field_type = FloatType() if colname in float_columns else StringType()
    return StructField(colname, field_type, True)


# One StructField per comma-separated name in `header`, in the same order.
schema = StructType(list(map(get_structfield, header.split(','))))

In [None]:
# Path pattern of the flights CSV files to read.
# The active pattern only matches files named flights-00000-*;
# switch to the commented-out pattern below to match every flights-* file.
inputs = 'gs://{}/flights/tzcorr/flights-00000-*'.format(BUCKET)
# inputs = 'gs://{}/flights/tzcorr/flights-*'.format(BUCKET)

In [None]:
# Read the flights CSV files with the explicit schema, then expose the
# resulting DataFrame to Spark SQL under the name 'flights'.
flights: DataFrame = (
    spark.read
    .schema(schema)
    .csv(inputs)
)

flights.createOrReplaceTempView('flights')

In [None]:
# Keep only the flights whose FL_DATE is marked as a training day.
# is_train_day is compared with the string 'True' because the traindays CSV
# is read without a schema, so its columns are strings.
trainquery: str = """
SELECT
  f.*
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
 t.is_train_day == 'True'
"""

traindata: DataFrame = spark.sql(trainquery)

## Dealing with Corner Cases

In [None]:
# Inspect two rows of the joined training data.
traindata.head(2)

In [None]:
# Summary statistics of the four model-relevant columns; differing counts reveal NULLs.
traindata.select("DEP_DELAY", "TAXI_OUT", "ARR_DELAY", "DISTANCE").describe().show()

In [None]:
# Revise the query to take NULL fields into account.
# NULLs come from flights that were scheduled but
#   never left the gate (DEP_DELAY is null), or
#   never took off (TAXI_OUT is null),
# and from flights that took off but were diverted and so have no ARR_DELAY (this includes TAXI_OUT).
# Only DEP_DELAY and ARR_DELAY are filtered explicitly; TAXI_OUT is not.
trainquery_revised: str = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
 t.is_train_day == 'True' AND
 f.DEP_DELAY IS NOT NULL AND
 f.ARR_DELAY IS NOT NULL
"""
    
# Replaces the earlier `traindata` with the NULL-filtered, four-column version.
traindata: DataFrame = spark.sql(trainquery_revised)
traindata.describe().show()


In [None]:
# I want to fix the root cause instead of fixing the symptom.
# Sanity check: among train-day flights that were neither cancelled nor diverted,
# are there still rows with a NULL DEP_DELAY or a NULL ARR_DELAY?
# The two NULL tests are grouped in one pair of parentheses because AND binds
# tighter than OR: ungrouped, any row with a NULL ARR_DELAY would match,
# regardless of the train-day / cancelled / diverted filters.
# CANCELLED and DIVERTED are string columns in the schema, hence the '0.00' literals.
trainquery_revised_test: str = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
 t.is_train_day == 'True' AND
 f.CANCELLED == '0.00' AND
 f.DIVERTED == '0.00' AND
 (f.DEP_DELAY IS NULL OR f.ARR_DELAY IS NULL)
"""

# An empty result means excluding cancelled and diverted flights removes every NULL delay.
traindata: DataFrame = spark.sql(trainquery_revised_test)
traindata.head(5)

In [None]:
# Looks like there are still NULLs although we have excluded CANCELLED and DIVERTED flights.
# Note: the book says the counts will be the same, but in my case they were not, so I still needed to exclude the NULLs.
# Final training query: train-day flights that were neither cancelled nor diverted
# and that have both a DEP_DELAY and an ARR_DELAY.
trainquery_revised_final: str = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
 t.is_train_day == 'True' AND
 f.CANCELLED == '0.00' AND
 f.DIVERTED == '0.00' AND
 f.DEP_DELAY IS NOT NULL AND
 f.ARR_DELAY IS NOT NULL
"""
    
# This is the `traindata` the training examples are built from.
traindata: DataFrame = spark.sql(trainquery_revised_final)
traindata.describe().show()


## Creating Training Examples

In [None]:
# To use Logistic Regression (https://bit.ly/3HGBYpw)
# I first need labeled training sets for binary outcomes.
# In this case, positive label (1) and negative label (0).
# Note: https://spark.apache.org/docs/3.1.1/mllib-linear-methods.html#loss-functions
# In the mathematical formulation there, a binary label y is denoted as either +1 (positive) or −1 (negative),
# which is convenient for the formulation.
# However, the negative label is represented by 0 in spark.mllib instead of −1, to be consistent with multiclass labeling.
def to_example(raw_data_point: Row) -> LabeledPoint:
    """Convert one row of the training data into a labeled example.

    Args:
        raw_data_point: A row exposing ARR_DELAY, DEP_DELAY, TAXI_OUT and DISTANCE.

    Returns:
        A LabeledPoint whose label is 1.0 when ARR_DELAY < 15 (on time) and 0.0
        otherwise, and whose features are [DEP_DELAY, TAXI_OUT, DISTANCE].
    """
    # NOTE(review): the training query filters NULLs out of DEP_DELAY and ARR_DELAY
    # but not out of TAXI_OUT; confirm TAXI_OUT cannot be NULL here.
    on_time = float(raw_data_point['ARR_DELAY'] < 15)
    features = [
        raw_data_point['DEP_DELAY'],
        raw_data_point['TAXI_OUT'],
        raw_data_point['DISTANCE'],
    ]
    return LabeledPoint(on_time, features)


# Mapping over the DataFrame's underlying RDD yields an RDD of LabeledPoint
# (not a DataFrame); this is what is passed to the trainer.
examples: RDD = traindata.rdd.map(to_example)

## Training

In [None]:
# Creating a model means finding the weights (w0, w1, w2) and the intercept b of
#   w0*x0 + w1*x1 + w2*x2 + b
# where x0, x1, x2 are DEP_DELAY, TAXI_OUT and DISTANCE.
# intercept=True asks the trainer to fit b as well.
lrmodel: LogisticRegressionModel = LogisticRegressionWithLBFGS.train(examples, intercept=True)

In [None]:
# Learned weights, in feature order [DEP_DELAY, TAXI_OUT, DISTANCE], followed by the intercept.
print(lrmodel.weights,lrmodel.intercept)

In [None]:
# Predict for DEP_DELAY=6, TAXI_OUT=12, DISTANCE=594 (same feature order as in to_example).
lrmodel.predict([6.0, 12.0, 594.0])

In [None]:
# Same flight with a larger departure delay: DEP_DELAY=36, TAXI_OUT=12, DISTANCE=594.
lrmodel.predict([36.0, 12.0, 594.0])

In [None]:
# Remove the decision threshold so that predict() returns the raw probability
# instead of a 0/1 class; the plots that follow rely on this.
lrmodel.clearThreshold()

In [None]:
# Predicted on-time probability as a function of distance,
# with departure delay (20) and taxi-out (10) held fixed.
dist: np.ndarray = np.arange(10, 2000, 10)
prob: list = [lrmodel.predict([20, 10, d]) for d in dist]
plt.plot(dist, prob)
# Label the figure so it can be read on its own.
plt.xlabel("DISTANCE")
plt.ylabel("P(ARR_DELAY < 15)")
plt.title("On-time probability vs. distance (DEP_DELAY=20, TAXI_OUT=10)");

In [None]:
# Predicted on-time probability as a function of departure delay,
# with taxi-out (10) and distance (500) held fixed.
delay: np.ndarray = np.arange(-20, 60, 1)
# Annotated assignment: only `prob` is bound, the built-in `list` is left untouched.
prob: list = [lrmodel.predict([d, 10, 500]) for d in delay]
ax = plt.plot(delay, prob)  # plt.plot returns the list of plotted line artists
# Label the figure so it can be read on its own.
plt.xlabel("DEP_DELAY")
plt.ylabel("P(ARR_DELAY < 15)")
plt.title("On-time probability vs. departure delay (TAXI_OUT=10, DISTANCE=500)");

In [None]:
# Set the decision threshold to 0.7: predict() returns a class label again,
# positive (on time) only when the predicted probability reaches 0.7.
lrmodel.setThreshold(0.7)

## Predicting by Using a Model

In [None]:
# Save the model to Cloud Storage for future use.
# The SparkContext is taken from the `spark` session, because this notebook
# never defines a bare `sc` variable.
# NOTE(review): save presumably fails if MODEL_FILE already exists; confirm before re-running.
MODEL_FILE: str = f"gs://{BUCKET}/flights/sparkmloutput/model"
lrmodel.save(spark.sparkContext, MODEL_FILE)

In [None]:
# Predict from the saved model in Cloud Storage.
# The SparkContext is taken from the `spark` session, because this notebook
# never defines a bare `sc` variable.
from pyspark.mllib.classification import LogisticRegressionModel
lrmodel: LogisticRegressionModel = LogisticRegressionModel.load(spark.sparkContext, MODEL_FILE)
# Apply the 0.7 decision threshold so that predict() returns a 0/1 class label.
lrmodel.setThreshold(0.7)

In [None]:
# Score DEP_DELAY=36, TAXI_OUT=12, DISTANCE=594 with the reloaded model.
print(lrmodel.predict([36.0, 12.0, 594.0]))