In [0]:
# Run this notebook to load the master rf_predict() function.
# The function should be run on the all_time_full_join_6 dataset without any prior data alterations.

# RandomForestClassifier(featuresCol="features", labelCol=y_var, maxBins=370
#                                         ,maxDepth=12
#                                         ,numTrees=50
#                                         ,featureSubsetStrategy='sqrt'
#                                         ,subsamplingRate=.8).fit(df_train)


In [0]:
from pyspark.sql import types, Window, functions as F
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, RandomForestClassificationModel
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
def rf_predict(df):
    '''
    Score a DataFrame with the saved random-forest model.

    Use this function on all_time_full_join_6 without any data cleaning;
    all data cleaning is done within this function.

    Input:
    df -- spark.DataFrame containing every column listed in X_vars plus
          DEP_DEL15_AIRLNS.

    Output:
    spark.DataFrame object with two new columns: rf_prob, and rf_prediction.
    Every input row is returned (left join). Rows that are not scored --
    DEP_DEL15_AIRLNS is null, or the row is skipped by the VectorAssembler --
    have null in both new columns.
    '''

    # load the final fitted model
    model_path = 'files/shared_uploads/trevorj@berkeley.edu/rf_0409_v2'
    rf_fit = RandomForestClassificationModel.load('dbfs:/' + model_path)

    # vars to use in the model
    X_vars = [
    # time vars
    'YEAR_AIRLNS', 'QUARTER_AIRLNS', 'MONTH_AIRLNS', 'DAY_OF_WEEK_AIRLNS', 'CRS_DEP_TIME_AIRLNS', 'CRS_ARR_TIME_AIRLNS',

    # airport location stuff
    'CRS_ELAPSED_TIME_AIRLNS', 'DISTANCE_AIRLNS', 'ELEVATION_WTHR_origin', 'ELEVATION_WTHR_dest',
    'LATITUDE_WTHR_origin', 'LONGITUDE_WTHR_origin', 'LATITUDE_WTHR_dest', 'LONGITUDE_WTHR_dest',

    # airport cat vars to encode/index
    'ORIGIN_AIRLNS', 'DEST_AIRLNS', 'OP_UNIQUE_CARRIER_AIRLNS',

    # weather vars origin
    'WND_WTHR_direction_angle_origin', 'WND_WTHR_speed_rate_origin', 'TMP_WTHR_air_temperature_origin', 'DEW_WTHR_dew_point_temperature_origin',
    'VIS_WTHR_distance_dimension_origin', 'GA1_WTHR_base_height_dimension_origin', 'GF1_WTHR_lowest_cloud_base_height_dimension_origin',
    'AA1_WTHR_period_quantity_in_hours_origin', 'AA1_WTHR_depth_dimension_origin', 'AA2_WTHR_depth_dimension_origin',
    'AJ1_WTHR_equivalent_water_depth_dimension_origin', 'AN1_WTHR_depth_dimension_origin',
    'AL1_WTHR_period_quantity_origin', 'AL1_WTHR_depth_dimension_origin', 'SLP_WTHR_sea_level_pressure_origin',
    'GA1_WTHR_coverage_code_origin-00', 'GF1_WTHR_total_coverage_code_origin-00', 'AA1_WTHR_condition_code_origin-3', 'AU1_WTHR_descriptor_code_origin-0',
    'AU1_WTHR_descriptor_code_origin-7', 'AU1_WTHR_obscuration_code_origin-0', 'AU1_WTHR_other_weather_phenomena_code_origin-0',

    # same weather vars, but for dest
    'WND_WTHR_direction_angle_dest', 'WND_WTHR_speed_rate_dest', 'TMP_WTHR_air_temperature_dest', 'DEW_WTHR_dew_point_temperature_dest',
    'VIS_WTHR_distance_dimension_dest', 'GA1_WTHR_base_height_dimension_dest', 'GF1_WTHR_lowest_cloud_base_height_dimension_dest',
    'AA1_WTHR_period_quantity_in_hours_dest', 'AA1_WTHR_depth_dimension_dest', 'AA2_WTHR_depth_dimension_dest',
    'AJ1_WTHR_equivalent_water_depth_dimension_dest', 'AN1_WTHR_depth_dimension_dest',
    'AL1_WTHR_period_quantity_dest', 'AL1_WTHR_depth_dimension_dest', 'SLP_WTHR_sea_level_pressure_dest',
    'GA1_WTHR_coverage_code_dest-00', 'GF1_WTHR_total_coverage_code_dest-00', 'AA1_WTHR_condition_code_dest-3',
    'AU1_WTHR_descriptor_code_dest-7', 'AU1_WTHR_obscuration_code_dest-0', 'AU1_WTHR_other_weather_phenomena_code_dest-0',

    # esther feature eng
    'LOCAL_DEP_HOUR', 'HOLIDAY', 'Prev_Flight_Delay_15', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep', 'Poor_Schedule'
    ]

    y_var = 'DEP_DEL15_AIRLNS'
    id_col = 'rf_id'

    # Tag every input row with an id for the final join. The id is carried
    # through the whole scoring pipeline rather than regenerated at the end:
    # the scoring frame loses rows along the way (null labels, rows skipped by
    # the VectorAssembler), so a second monotonically_increasing_id() call
    # would number a different set of rows and the join would attach
    # predictions to the wrong flights.
    df = df.withColumn(id_col, F.monotonically_increasing_id())

    df2 = df.alias('df2')

    # cast to int
    int_cast_cols = ['Prev_Flight_Delay_15', 'Poor_Schedule', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep']
    for column in int_cast_cols:
        df2 = df2.withColumn(column, F.col(column).cast(types.IntegerType()))

    # drop missing values from Y var
    df2 = df2.filter(~F.col(y_var).isNull())

    # impute some missing values (na.fill(0) only touches numeric columns)
    df2 = df2.na.fill(0)

    # Index the string-typed model inputs. Only columns that feed the model
    # are indexed; every other column is dropped by the select below, so
    # indexing it would be wasted work.
    # NOTE(review): the indexer is re-fit on the data being scored, so the
    # category -> index mapping only matches training if the category
    # frequencies rank the same way here -- confirm, or load the fitted
    # indexer that was used at training time.
    model_inputs = set(X_vars)
    vars_to_index = [name for name, dtype in df2.dtypes if dtype == 'string' and name in model_inputs]

    if vars_to_index:
        old_names = [name + '_old' for name in vars_to_index]

        # rename cols so the indexed version can take the original name
        for name, old_name in zip(vars_to_index, old_names):
            df2 = df2.withColumnRenamed(name, old_name)

        # finally, index them
        indexer = StringIndexer(inputCols=old_names, outputCols=vars_to_index)
        df2 = indexer.fit(df2).transform(df2)
        df2 = df2.drop(*old_names)

    # vectorize (keeping the row id alongside the features)
    df2 = df2.select(X_vars + [y_var, id_col])
    vectorAssembler = VectorAssembler(inputCols=X_vars, outputCol='features', handleInvalid='skip')
    df2 = vectorAssembler.transform(df2).select(id_col, 'features', y_var)

    # make predictions
    df2 = rf_fit.transform(df2)

    # Extract the probability stored at index 1 of the probability vector
    get_item = F.udf(lambda v: float(v[1]), types.FloatType())
    df2 = (
        df2
        .withColumn("rf_prob", get_item('probability'))
        .withColumnRenamed('prediction', 'rf_prediction')
        .select(id_col, 'rf_prob', 'rf_prediction')
    )

    # join preds to original dataset and return it
    return df.join(df2, on=id_col, how='left').drop(id_col)
    
    

In [0]:
# Demonstration of the algorithm below:

In [0]:
# Connection settings for the Azure Blob Storage container that holds the data.
blob_container = "main-storage" # The name of your container created in https://portal.azure.com
storage_account = "team05w261" # The name of your Storage account created in https://portal.azure.com
secret_scope = "team05" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "team05-key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"
# The storage key is read from the Databricks secret scope, never hard-coded.
spark.conf.set(
  f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")

# Split by year: 2019 is the test set, 2018 and earlier is the training set.
df_test = df.filter(F.col('YEAR_AIRLNS')==2019)
df_train = df.filter(F.col('YEAR_AIRLNS')<=2018)

# Score the test set. The metric cells further down select `rf_prediction`
# from df_test, so this call must run for them to work on a fresh kernel.
df_test = rf_predict(df_test)
# display(df_test)

In [0]:
# df_test.groupby('rf_prediction', 'DEP_DEL15_AIRLNS').count().show()

In [0]:
# +-------------+----------------+-------+
# |rf_prediction|DEP_DEL15_AIRLNS|  count|
# +-------------+----------------+-------+
# |          1.0|             1.0| 819949|
# |          0.0|             1.0| 533510|
# |          1.0|             0.0| 937419|
# |          0.0|             0.0|4977354|
# +-------------+----------------+-------+

In [0]:
#f2 score
# Counts taken from the rf_prediction x DEP_DEL15_AIRLNS table in the cell above.
# A false negative is (prediction 0, actual 1); a false positive is
# (prediction 1, actual 0).
tp = 819949   # rf_prediction 1.0, DEP_DEL15_AIRLNS 1.0
fn = 533510   # rf_prediction 0.0, DEP_DEL15_AIRLNS 1.0
fp = 937419   # rf_prediction 1.0, DEP_DEL15_AIRLNS 0.0
tn = 4977354  # rf_prediction 0.0, DEP_DEL15_AIRLNS 0.0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
# F-beta with beta = 2, written in terms of the raw counts
((1+2**2) * tp) / ((1+2**2)*tp + 2**2 * fn + fp)

In [0]:
# Show precision and recall, one value per line.
print(precision, recall, sep='\n')

In [0]:
# Build the (prediction, label) pairs that the RDD-based MulticlassMetrics expects.
# Rows with a null prediction or label cannot be scored, so they are dropped
# before the frame is converted to an RDD.
df_test_rdd = (
    df_test
    .select('rf_prediction', 'dep_del15_airlns')
    .withColumnRenamed('rf_prediction', 'prediction')
    .withColumnRenamed('dep_del15_airlns', 'label')
    .na.drop()
    .rdd
)
metrics = MulticlassMetrics(df_test_rdd)

# classification metrics
cm = metrics.confusionMatrix().toArray()
# F2 is reported for label 1.0 so that it is the same quantity as the
# first-principles value, which is built from cm[1][1] below.
print(f'f2 score with package: {metrics.fMeasure(1.0, 2.0)}')

# confirm I'm getting the same f score here
# Per the Spark docs, confusionMatrix() puts predicted classes in columns,
# ordered by ascending label, so cm[actual][predicted].
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = (cm[1][1]) / (cm[1][1] + cm[0][1])
recall = (cm[1][1]) / (cm[1][1] + cm[1][0])

def f_score(beta, precision, recall):
    '''
    F-beta score computed from precision and recall.

    Input:
    beta      -- weight given to recall relative to precision
    precision -- precision value
    recall    -- recall value

    Output:
    (1 + beta^2) * precision * recall / (beta^2 * precision + recall),
    or 0.0 when the denominator is zero, instead of raising
    ZeroDivisionError.
    '''
    denominator = beta**2 * precision + recall
    if denominator == 0:
        return 0.0
    return (1+beta**2) * precision * recall / denominator

# F2 (beta = 2) from the precision and recall computed from the confusion matrix.
print(f'f2 score from first principles: {f_score(2, precision, recall)}')

In [0]:
# Weighted F-measure with beta = 2.
metrics.weightedFMeasure(2.0)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Rename the stored score columns to the names BinaryClassificationEvaluator
# reads, and keep only rows that have a label.
# NOTE(review): `xgb_prob` / `xgb_prediction` are not created anywhere in the
# visible notebook (rf_predict adds `rf_prob` / `rf_prediction`) -- confirm
# these columns exist in df_train before running this cell.
# NOTE(review): `rawPrediction` is fed from `xgb_prediction`; if that column
# holds hard 0/1 labels rather than scores, the ROC/PR curves are built from
# a single operating point -- confirm this is intended.
predictions = (
    df_train
    .withColumnRenamed('xgb_prob', 'probability')
    .withColumnRenamed('xgb_prediction', 'rawPrediction')
    .withColumnRenamed('DEP_DEL15_AIRLNS', 'label')
    .select('probability', 'label', 'rawPrediction')
    .where(F.col('label').isNotNull())
)

evaluator = BinaryClassificationEvaluator(labelCol='label')

# We have only two choices: area under ROC and PR curves :-(
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))