In [0]:
# Run this script to load in the master xgb_predict() function. 
# This function should be run on the all_time_full_join_6 without any data alterations. 
# Note: the model saved at this path has the following parameters: 'files/shared_uploads/trevorj@berkeley.edu/xgb_0408_v1'
## max_depth=7, 
## n_estimators=30, 
## learning_rate=.05, 
## colsample_bytree=.8, 
## gamma = .1, 
## reg_alpha = 0, 
## reg_lambda = 0

## but the model saved at this path has the following hyperparams: 'files/shared_uploads/trevorj@berkeley.edu/xgb_0408_v2'
# xgb_final = XgboostClassifier(labelCol=y_var, missing=0.0, max_depth=10, n_estimators=60, learning_rate=.1, colsample_bytree=.8, 
#                               gamma = .05, reg_alpha = 0, reg_lambda = .1).fit(df_train)
## And the model is trained on a downsampled version of <=2018 data for an even 50-50 response variable split

In [0]:
from pyspark.sql import types, Window, functions as F
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler, StringIndexer
from sparkdl.xgboost import XgboostClassifier, XgboostClassifierModel
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

In [0]:
def xgb_predict(df, model_path='files/shared_uploads/trevorj@berkeley.edu/xgb_0408_v2'):
    '''
    Score the dataset you pass in with the saved XGBoost model.
    Use function on all_time_full_join_6 without any data cleaning.
    All data cleaning (imputation, casting, indexing, vectorizing) is done
    within this function.

    Args:
    df: spark.DataFrame holding every column listed in X_vars below plus
        the label column DEP_DEL15_AIRLNS.
    model_path: path of the saved XgboostClassifierModel, relative to
        'dbfs:/'. Defaults to the xgb_0408_v2 model.

    Output:
    spark.DataFrame object with all original columns and two new columns:
    xgb_prob (entry at index 1 of the model's probability vector, as float)
    and xgb_prediction. Rows that the VectorAssembler skips
    (handleInvalid='skip') are kept and get null in both new columns.
    '''

    # load the final fitted xgb model
    # alternative: 'files/shared_uploads/trevorj@berkeley.edu/xgb_0408_v1' # 0.5692620
    xgb_fit = XgboostClassifierModel.load('dbfs:/' + model_path)

    # vars to use in the model
    X_vars = [
        # time vars
        'YEAR_AIRLNS', 'QUARTER_AIRLNS', 'MONTH_AIRLNS', 'DAY_OF_WEEK_AIRLNS',

        # airport location stuff
        'CRS_ELAPSED_TIME_AIRLNS', 'DISTANCE_AIRLNS', 'ELEVATION_WTHR_origin', 'ELEVATION_WTHR_dest',
        'LATITUDE_WTHR_origin', 'LONGITUDE_WTHR_origin', 'LATITUDE_WTHR_dest', 'LONGITUDE_WTHR_dest',

        # airport cat vars to encode/index
        'ORIGIN_AIRLNS', 'DEST_AIRLNS', 'OP_UNIQUE_CARRIER_AIRLNS',

        # weather vars origin
        'WND_WTHR_direction_angle_origin', 'WND_WTHR_speed_rate_origin', 'TMP_WTHR_air_temperature_origin', 'DEW_WTHR_dew_point_temperature_origin',
        'VIS_WTHR_distance_dimension_origin', 'GA1_WTHR_base_height_dimension_origin', 'GF1_WTHR_lowest_cloud_base_height_dimension_origin',
        'AA1_WTHR_period_quantity_in_hours_origin', 'AA1_WTHR_depth_dimension_origin',

        # same weather vars, but for dest
        'WND_WTHR_direction_angle_dest', 'WND_WTHR_speed_rate_dest', 'TMP_WTHR_air_temperature_dest', 'DEW_WTHR_dew_point_temperature_dest',
        'VIS_WTHR_distance_dimension_dest', 'GA1_WTHR_base_height_dimension_dest', 'GF1_WTHR_lowest_cloud_base_height_dimension_dest',
        'AA1_WTHR_period_quantity_in_hours_dest', 'AA1_WTHR_depth_dimension_dest',

        # esther feature eng
        'LOCAL_DEP_HOUR', 'HOLIDAY', 'Prev_Flight_Delay_15', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep', 'Poor_Schedule'
    ]

    y_var = 'DEP_DEL15_AIRLNS'
    id_col = 'xgb_id'

    # Tag every input row once. This SAME id is carried through the whole
    # feature pipeline so predictions are joined back to the row they were
    # computed from. (Generating a second id on the prediction frame is not
    # safe: skipped rows and different partitioning shift the ids.)
    # NOTE(review): both sides of the final join re-evaluate this id from the
    # same lineage; consider caching `df` if the input source is not stable.
    df = df.withColumn(id_col, F.monotonically_increasing_id())

    # impute some missing values
    df2 = df.na.fill({
        'LATITUDE_WTHR_origin': 0
        ,'LONGITUDE_WTHR_origin': 0
        ,'ELEVATION_WTHR_origin': 0
        ,'LATITUDE_WTHR_dest': 0
        ,'LONGITUDE_WTHR_dest': 0
        ,'ELEVATION_WTHR_dest': 0
        ,'Prev_Flight_Delay_15': 0
        #,'DEP_DEL15_AIRLNS': 0
    })

    # cast some vars to int
    str_cols = ['Prev_Flight_Delay_15', 'Poor_Schedule', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep']
    for column in str_cols:
        df2 = df2.withColumn(column, F.col(column).cast(types.IntegerType()))

    # vars to index
    # Specify which columns to index (ie cast to int)
    vars_to_index = [
        'ORIGIN_AIRLNS',
        'DEST_AIRLNS',
        'OP_UNIQUE_CARRIER_AIRLNS' # a more granular form of origin/dest airlines
    ]
    old_names = [var + '_old' for var in vars_to_index]

    # rename cols to drop them later
    for var, old_name in zip(vars_to_index, old_names):
        df2 = df2.withColumnRenamed(var, old_name)

    # finally, index them
    # NOTE(review): the indexer is fit on whatever data is passed in, so the
    # string -> index mapping depends on this call's data. Confirm it matches
    # the mapping the saved model was trained with.
    indexer = StringIndexer(inputCols=old_names, outputCols=vars_to_index)
    df2 = indexer.fit(df2).transform(df2)
    df2 = df2.drop(*old_names)

    # vectorize (keep the row id alongside the features)
    df2 = df2.select(X_vars + [y_var, id_col])
    vectorAssembler = VectorAssembler(inputCols = X_vars, outputCol = 'features', handleInvalid='skip')
    df2 = vectorAssembler.transform(df2).select(['features', y_var, id_col])

    # make predictions
    df2 = xgb_fit.transform(df2)

    # Extract probabilities: take entry 1 of the probability vector
    get_item = F.udf(lambda v: float(v[1]), types.FloatType())
    df2 = df2.withColumn("xgb_prob", get_item('probability'))
    df2 = df2.withColumnRenamed('prediction', 'xgb_prediction')
    df2 = df2.select(id_col, 'xgb_prob', 'xgb_prediction')

    # join preds to original dataset on the shared id and return it
    df = df.join(df2, on=id_col, how='left').drop(id_col)
    return df
    
    

In [0]:
# Demonstration of the algorithm below:

In [0]:

# --- Azure Blob Storage coordinates -----------------------------------------
blob_container = "main-storage"   # container created in https://portal.azure.com
storage_account = "team05w261"    # Storage account created in https://portal.azure.com
secret_scope = "team05"           # Databricks secret scope (created via the Databricks CLI)
secret_key = "team05-key"         # Databricks secret key (created via the Databricks CLI)
mount_path = "/mnt/mids-w261"
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# Hand the storage account key (read from the secret scope) to Spark.
spark.conf.set(
    f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
    dbutils.secrets.get(scope=secret_scope, key=secret_key),
)

# Full joined dataset, straight from blob storage.
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")

# Score the years up to and including 2018, and the year 2019, separately.
df_train = xgb_predict(df.filter(F.col('YEAR_AIRLNS') <= 2018))
df_test = xgb_predict(df.filter(F.col('YEAR_AIRLNS') == 2019))

In [0]:
# df_test.groupby('xgb_prediction', 'DEP_DEL15_AIRLNS').count().show()

In [0]:
# +--------------+----------------+-------+
# |xgb_prediction|DEP_DEL15_AIRLNS|  count|
# +--------------+----------------+-------+
# |           1.0|             1.0| 851342|
# |           0.0|             1.0| 502117|
# |           1.0|             0.0| 933015|
# |           0.0|             0.0|4981758|
# +--------------+----------------+-------+

In [0]:
# #f2 score
# tp = 813227
# fp = 915754
# tn = 4999019
# fn = 540232
# ((1+2**2) * tp) / ((1+2**2)*tp + 2**2 * fn + fp)

In [0]:
# MulticlassMetrics takes an RDD of (prediction, label) pairs.
# Rows without a prediction or without a label are dropped first, since
# xgb_predict left-joins its predictions and can therefore leave nulls.
df_test_rdd = (
    df_test
    .select('xgb_prediction', 'dep_del15_airlns')
    .withColumnRenamed('xgb_prediction', 'prediction')
    .withColumnRenamed('dep_del15_airlns', 'label')
    .na.drop()
    .rdd
)
metrics = MulticlassMetrics(df_test_rdd)

# classification metrics
# Confusion matrix layout (per Spark docs): rows = actual class,
# columns = predicted class, both ordered by ascending label.
cm = metrics.confusionMatrix().toArray()
# F2 of the positive class (label 1.0), so it is comparable with the manual
# precision/recall based calculation below.
print(f'f2 score: {metrics.fMeasure(1.0, 2.0)}')

# confirm I'm getting the same f score here
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = (cm[1][1]) / (cm[1][1] + cm[0][1])   # TP / (TP + FP)
recall = (cm[1][1]) / (cm[1][1] + cm[1][0])      # TP / (TP + FN)

def f_score(beta, precision, recall):
    """
    Compute the F-beta score from precision and recall.

    Args:
        beta: weight of recall relative to precision (2 gives the F2 score).
        precision: precision of the positive class.
        recall: recall of the positive class.

    Returns:
        (1 + beta^2) * precision * recall / (beta^2 * precision + recall),
        or 0.0 when the denominator is zero (precision and recall both 0).
    """
    denominator = beta**2 * precision + recall
    if denominator == 0:
        # No true positives at all: define the score as 0 instead of dividing by zero.
        return 0.0
    return (1 + beta**2) * precision * recall / denominator

# Cross-check: F2 (beta=2) recomputed by hand from the precision and recall values.
print(f'f2 score calc2: {f_score(2, precision, recall)}')

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Build the frame the evaluator reads.
# The evaluator scores the column named by rawPredictionCol (default
# 'rawPrediction'). To get real ROC / PR curves it must be the continuous
# score, not the thresholded 0/1 prediction, so xgb_prob is used. It is
# cast to double because xgb_prob is a float column and the evaluator
# accepts only double or vector raw predictions.
# NOTE(review): this evaluates df_train (years <= 2018), while the F2 cell
# uses df_test — confirm that is intended.
predictions = (
    df_train
    .withColumn('rawPrediction', F.col('xgb_prob').cast('double'))
    .withColumnRenamed('xgb_prob', 'probability')
    .withColumnRenamed('DEP_DEL15_AIRLNS', 'label')
    .select('probability', 'label', 'rawPrediction')
    # rows without a label or without a score cannot be evaluated
    .filter(F.col('label').isNotNull() & F.col('rawPrediction').isNotNull())
)

evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='rawPrediction')

# We have only two choices: area under ROC and PR curves :-(
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))