In [0]:
from pyspark.sql import types, Window, functions as F
import pandas as pd
import numpy as np
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, StringIndexerModel, StandardScaler, StandardScalerModel, PCA, PCAModel, MinMaxScaler, MinMaxScalerModel, VectorAssembler
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, RandomForestClassificationModel, LogisticRegressionModel, LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from sparkdl.xgboost import XgboostClassifier, XgboostClassifierModel

# The `EnsemblePredict` class

In [0]:
class EnsemblePredict:
    
    def __init__(self):
        pass
        
    def xgb_predict(self, df):
        '''
        Score a DataFrame with the persisted XGBoost model.

        Use on all_time_full_join_6 without any prior data cleaning: null
        imputation, casting, string indexing and vector assembly are all
        done within this function.

        Every input row is tagged with a temporary id *before* any
        transformation, and that same id is carried through the whole
        pipeline. Predictions are therefore joined back to the exact row
        they were computed from, even when the assembler drops rows.

        Args:
            df: spark.DataFrame holding every column listed in X_vars plus
                DEP_DEL15_AIRLNS.

        Output:
        spark.DataFrame object with two new columns: xgb_prob, and xgb_prediction.
        Rows skipped by the VectorAssembler (handleInvalid='skip') are kept
        (left join) and carry null in both new columns.
        '''

        # vars to use in the model
        X_vars = [
            # time vars
            'YEAR_AIRLNS', 'QUARTER_AIRLNS', 'MONTH_AIRLNS', 'DAY_OF_WEEK_AIRLNS', 

            # airport location stuff
            'CRS_ELAPSED_TIME_AIRLNS', 'DISTANCE_AIRLNS', 'ELEVATION_WTHR_origin', 'ELEVATION_WTHR_dest', 
            'LATITUDE_WTHR_origin', 'LONGITUDE_WTHR_origin', 'LATITUDE_WTHR_dest', 'LONGITUDE_WTHR_dest',

            # airport cat vars to encode/index
            'ORIGIN_AIRLNS', 'DEST_AIRLNS', 'OP_UNIQUE_CARRIER_AIRLNS', 

            # weather vars origin
            'WND_WTHR_direction_angle_origin', 'WND_WTHR_speed_rate_origin', 'TMP_WTHR_air_temperature_origin', 'DEW_WTHR_dew_point_temperature_origin',
            'VIS_WTHR_distance_dimension_origin', 'GA1_WTHR_base_height_dimension_origin', 'GF1_WTHR_lowest_cloud_base_height_dimension_origin', 
            'AA1_WTHR_period_quantity_in_hours_origin', 'AA1_WTHR_depth_dimension_origin', 

            # same weather vars, but for dest
            'WND_WTHR_direction_angle_dest', 'WND_WTHR_speed_rate_dest', 'TMP_WTHR_air_temperature_dest', 'DEW_WTHR_dew_point_temperature_dest',
            'VIS_WTHR_distance_dimension_dest', 'GA1_WTHR_base_height_dimension_dest', 'GF1_WTHR_lowest_cloud_base_height_dimension_dest', 
            'AA1_WTHR_period_quantity_in_hours_dest', 'AA1_WTHR_depth_dimension_dest', 

            # esther feature eng
            'LOCAL_DEP_HOUR', 'HOLIDAY', 'Prev_Flight_Delay_15', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep', 'Poor_Schedule'
        ]

        y_var = 'DEP_DEL15_AIRLNS'
        id_col = 'xgb_id'

        # Tag each row once. Both sides of the final join derive from this
        # tagged frame, so the ids line up. Generating a second id after the
        # pipeline would not: monotonically_increasing_id depends on
        # partitioning and on which rows survived.
        df = df.withColumn(id_col, F.monotonically_increasing_id())

        # impute some missing values
        # NOTE(review): this fill runs before the cast below, whereas rf_predict
        # casts first. If these three columns arrive as strings their nulls are
        # not imputed here and the assembler skips those rows -- confirm which
        # order the saved model was trained with.
        df2 = df.na.fill(0)

        # cast some vars to int
        str_cols = ['Prev_Flight_Delay_15', 'Poor_Schedule', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep']
        for column in str_cols:
            df2 = df2.withColumn(column, F.col(column).cast(types.IntegerType()))

        # Specify which columns to index (ie cast to int)
        vars_to_index = [
            'ORIGIN_AIRLNS', 
            'DEST_AIRLNS', 
            'OP_UNIQUE_CARRIER_AIRLNS' # a more granular form of origin/dest airlines
        ]

        # The raw string columns are renamed to <name>_old before indexing so
        # the indexed output can take over the original names; the _old
        # columns are dropped afterwards.
        for var in vars_to_index:
            df2 = df2.withColumnRenamed(var, var+'_old')

        # index with the persisted indexer; 'keep' maps unseen labels to an
        # extra index instead of raising
        xgb_indexer = StringIndexerModel.load('dbfs:/' + 'files/shared_uploads/trevorj@berkeley.edu/xgb_indexer_1')
        xgb_indexer = xgb_indexer.setHandleInvalid('keep')
        df2 = xgb_indexer.transform(df2)
        df2 = df2.drop(*[i+'_old' for i in vars_to_index])

        # vectorize, keeping the row id alongside the features
        df2 = df2.select(X_vars + [y_var, id_col])
        vectorAssembler = VectorAssembler(inputCols = X_vars, outputCol = 'features', handleInvalid='skip')
        df2 = vectorAssembler.transform(df2).select('features', y_var, id_col)

        # load the final fitted xgb model and make predictions
        model_path = 'files/shared_uploads/trevorj@berkeley.edu/xgb_0408_v2'
        xgb_fit = XgboostClassifierModel.load('dbfs:/' + model_path)
        df2 = xgb_fit.transform(df2)

        # Extract the probability of the positive class (index 1)
        get_item = F.udf(lambda v: float(v[1]), types.FloatType())
        df2 = (
            df2.withColumn("xgb_prob", get_item('probability'))
               .withColumnRenamed('prediction', 'xgb_prediction')
               .select(id_col, 'xgb_prob', 'xgb_prediction')
        )

        # join preds to original dataset on the shared id and return it
        return df.join(df2, on=id_col, how='left').drop(id_col)
    
    
    def rf_predict(self, df):
        '''
        Score a DataFrame with the persisted random forest model.

        Use on all_time_full_join_6 without any prior data cleaning: casting,
        null imputation, string indexing and vector assembly are all done
        within this function.

        Every input row is tagged with a temporary id *before* any
        transformation, and that same id is carried through the whole
        pipeline. Predictions are therefore joined back to the exact row
        they were computed from, even when the assembler drops rows.

        Args:
            df: spark.DataFrame holding every column listed in X_vars plus
                DEP_DEL15_AIRLNS.

        Output:
        spark.DataFrame object with two new columns: rf_prob, and rf_prediction.
        Rows skipped by the VectorAssembler (handleInvalid='skip') are kept
        (left join) and carry null in both new columns.
        '''

        # load the final fitted model
        model_path = 'files/shared_uploads/trevorj@berkeley.edu/rf_0409_v2'
        rf_fit = RandomForestClassificationModel.load('dbfs:/' + model_path)

        # vars to use in the model
        X_vars = [
        # time vars
        'YEAR_AIRLNS', 'QUARTER_AIRLNS', 'MONTH_AIRLNS', 'DAY_OF_WEEK_AIRLNS', 'CRS_DEP_TIME_AIRLNS', 'CRS_ARR_TIME_AIRLNS', 

        # airport location stuff
        'CRS_ELAPSED_TIME_AIRLNS', 'DISTANCE_AIRLNS', 'ELEVATION_WTHR_origin', 'ELEVATION_WTHR_dest', 
        'LATITUDE_WTHR_origin', 'LONGITUDE_WTHR_origin', 'LATITUDE_WTHR_dest', 'LONGITUDE_WTHR_dest',

        # airport cat vars to encode/index
        'ORIGIN_AIRLNS', 'DEST_AIRLNS', 'OP_UNIQUE_CARRIER_AIRLNS', 

        # weather vars origin
        'WND_WTHR_direction_angle_origin', 'WND_WTHR_speed_rate_origin', 'TMP_WTHR_air_temperature_origin', 'DEW_WTHR_dew_point_temperature_origin',
        'VIS_WTHR_distance_dimension_origin', 'GA1_WTHR_base_height_dimension_origin', 'GF1_WTHR_lowest_cloud_base_height_dimension_origin', 
        'AA1_WTHR_period_quantity_in_hours_origin', 'AA1_WTHR_depth_dimension_origin', 'AA2_WTHR_depth_dimension_origin', 
        'AJ1_WTHR_equivalent_water_depth_dimension_origin', 'AN1_WTHR_depth_dimension_origin', 
        'AL1_WTHR_period_quantity_origin', 'AL1_WTHR_depth_dimension_origin', 'SLP_WTHR_sea_level_pressure_origin',
        'GA1_WTHR_coverage_code_origin-00', 'GF1_WTHR_total_coverage_code_origin-00', 'AA1_WTHR_condition_code_origin-3', 'AU1_WTHR_descriptor_code_origin-0',
        'AU1_WTHR_descriptor_code_origin-7', 'AU1_WTHR_obscuration_code_origin-0', 'AU1_WTHR_other_weather_phenomena_code_origin-0', 

        # same weather vars, but for dest
        'WND_WTHR_direction_angle_dest', 'WND_WTHR_speed_rate_dest', 'TMP_WTHR_air_temperature_dest', 'DEW_WTHR_dew_point_temperature_dest',
        'VIS_WTHR_distance_dimension_dest', 'GA1_WTHR_base_height_dimension_dest', 'GF1_WTHR_lowest_cloud_base_height_dimension_dest', 
        'AA1_WTHR_period_quantity_in_hours_dest', 'AA1_WTHR_depth_dimension_dest', 'AA2_WTHR_depth_dimension_dest', 
        'AJ1_WTHR_equivalent_water_depth_dimension_dest', 'AN1_WTHR_depth_dimension_dest', 
        'AL1_WTHR_period_quantity_dest', 'AL1_WTHR_depth_dimension_dest', 'SLP_WTHR_sea_level_pressure_dest', 
        'GA1_WTHR_coverage_code_dest-00', 'GF1_WTHR_total_coverage_code_dest-00', 'AA1_WTHR_condition_code_dest-3',
        'AU1_WTHR_descriptor_code_dest-7', 'AU1_WTHR_obscuration_code_dest-0', 'AU1_WTHR_other_weather_phenomena_code_dest-0', 

        # esther feature eng
        'LOCAL_DEP_HOUR', 'HOLIDAY', 'Prev_Flight_Delay_15', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep', 'Poor_Schedule'
        ]

        y_var = 'DEP_DEL15_AIRLNS'
        id_col = 'rf_id'

        # Tag each row once. Both sides of the final join derive from this
        # tagged frame, so the ids line up. Generating a second id after the
        # pipeline would not: monotonically_increasing_id depends on
        # partitioning and on which rows survived.
        df = df.withColumn(id_col, F.monotonically_increasing_id())

        # cast to int (done before the fill so resulting nulls are imputed)
        df2 = df
        int_cols = ['Prev_Flight_Delay_15', 'Poor_Schedule', 'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep']
        for column in int_cols:
            df2 = df2.withColumn(column, F.col(column).cast(types.IntegerType()))

        # impute some missing values
        df2 = df2.na.fill(0)

        # index every remaining string column except the label and the flight date
        vars_to_index = [
            name for name, dtype in df2.dtypes
            if dtype == 'string' and name not in (y_var, 'FL_DATE_AIRLNS')
        ]

        # The raw string columns are renamed to <name>_old before indexing so
        # the indexed output can take over the original names; the _old
        # columns are dropped afterwards.
        for var in vars_to_index:
            df2 = df2.withColumnRenamed(var, var+'_old')

        # index with the persisted indexer; 'keep' maps unseen labels to an
        # extra index instead of raising
        rf_indexer = StringIndexerModel.load('dbfs:/' + 'files/shared_uploads/trevorj@berkeley.edu/rf_indexer_2')
        rf_indexer = rf_indexer.setHandleInvalid('keep')
        df2 = rf_indexer.transform(df2)
        df2 = df2.drop(*[i+'_old' for i in vars_to_index])

        # vectorize, keeping the row id alongside the features
        df2 = df2.select(X_vars + [y_var, id_col])
        vectorAssembler = VectorAssembler(inputCols = X_vars, outputCol = 'features', handleInvalid='skip')
        df2 = vectorAssembler.transform(df2).select('features', y_var, id_col)

        # make predictions
        df2 = rf_fit.transform(df2)

        # Extract the probability of the positive class (index 1)
        get_item = F.udf(lambda v: float(v[1]), types.FloatType())
        df2 = (
            df2.withColumn("rf_prob", get_item('probability'))
               .withColumnRenamed('prediction', 'rf_prediction')
               .select(id_col, 'rf_prob', 'rf_prediction')
        )

        # join preds to original dataset on the shared id and return it
        return df.join(df2, on=id_col, how='left').drop(id_col)
    
    
    def lr_predict(self, df):
        '''
        Fit the logistic-regression pipeline and score ``df`` with it.

        Unlike xgb_predict / rf_predict, nothing is loaded from disk: on every
        call this method reads all_time_full_join_6 from blob storage, fits
        StringIndexer -> OneHotEncoder -> VectorAssembler -> MinMaxScaler ->
        PCA(k=125) -> LogisticRegression, and then applies the fitted stages
        to ``df``. It relies on the notebook globals ``spark`` and ``dbutils``.

        NOTE(review): the indexer, encoder and scaler are fit on the full,
        unfiltered dataset, while PCA and the classifier are fit on the
        undersampled YEAR_AIRLNS <= 2018 rows. Confirm that fitting the
        preprocessing on all years is intended.

        Every input row is tagged with a temporary id *before* any
        transformation and that id is carried through the pipeline, so
        predictions are joined back to the exact row they were computed from
        even though the indexer and assembler drop rows
        (handleInvalid='skip').

        Args:
            df: spark.DataFrame with the same feature columns as
                all_time_full_join_6.

        Output:
        spark.DataFrame object with two new columns: lr_prob, and lr_prediction.
        Rows dropped inside the pipeline are kept (left join) and carry null
        in both new columns.
        '''

        categorical_string_features = [
          'ORIGIN_AIRLNS',
          'DEST_AIRLNS',
          'OP_UNIQUE_CARRIER_AIRLNS'
        ]

        categorical_features = [
          'ORIGIN_AIRLNS_indexed',
          'DEST_AIRLNS_indexed',
          'OP_UNIQUE_CARRIER_AIRLNS_indexed',
          'HOLIDAY',
          'Prev_Flight_Delay_15',
          'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep'
        ]

        id_col = 'lr_id'

        # Tag each row once; both sides of the final join derive from this frame.
        df_with_id = df.withColumn(id_col, F.monotonically_increasing_id())

        # ------------------------------------------------------------------
        # Fit every stage on the stored dataset
        # ------------------------------------------------------------------
        blob_container = "main-storage" # The name of your container created in https://portal.azure.com
        storage_account = "team05w261" # The name of your Storage account created in https://portal.azure.com
        secret_scope = "team05" # The name of the scope created in your local computer using the Databricks CLI
        secret_key = "team05-key" # The name of the secret key created in your local computer using the Databricks CLI 
        blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

        # Configure blob storage account access key globally
        spark.conf.set(
          f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
          dbutils.secrets.get(scope = secret_scope, key = secret_key)
        )

        # Missing values are imputed with 0 rather than dropped.
        df_train = (
            spark.read.parquet(f"{blob_url}/all_time_full_join_6")
            .filter(F.col('YEAR_AIRLNS') <= 2018)
            .filter(F.col('DEP_DEL15_AIRLNS').isNotNull())
            .na.fill(0)
        )
        df_full = spark.read.parquet(f"{blob_url}/all_time_full_join_6").na.fill(0)

        # index: 'skip' drops rows whose label the indexer cannot map
        indexer = StringIndexer(
            inputCols=categorical_string_features,
            outputCols=[name + '_indexed' for name in categorical_string_features],
            handleInvalid='skip')
        index_model = indexer.fit(df_full)
        df_indexed = index_model.transform(df_train).drop(*categorical_string_features)
        df_full = index_model.transform(df_full).drop(*categorical_string_features)

        # encode
        encoder = OneHotEncoder(
            inputCols=categorical_features,
            outputCols=[name + '_vec' for name in categorical_features])
        encoder_model = encoder.fit(df_full)
        df_encoded = encoder_model.transform(df_indexed).drop(*categorical_features)
        df_full = encoder_model.transform(df_full).drop(*categorical_features)

        # Everything except the label and the flight date is a feature.
        actual_feature_columns = [i for i in df_encoded.columns if i not in ['DEP_DEL15_AIRLNS', 'FL_DATE_AIRLNS']]
        vectorAssembler = VectorAssembler(inputCols = actual_feature_columns, outputCol = 'features', handleInvalid='skip')
        df_ready = vectorAssembler.transform(df_encoded).select(['features', 'DEP_DEL15_AIRLNS']).withColumnRenamed("DEP_DEL15_AIRLNS", "label")
        df_full = vectorAssembler.transform(df_full).select(['features', 'DEP_DEL15_AIRLNS']).withColumnRenamed("DEP_DEL15_AIRLNS", "label")

        def undersample(data, label_col='label'):
            '''Downsample the negative class to roughly the size of the positive class.'''
            delayed = data.filter(F.col(label_col) > 0)
            not_delayed = data.filter(F.col(label_col) == 0)
            delayed_count = delayed.count()
            not_delayed_count = not_delayed.count()
            if not_delayed_count == 0:
                # nothing to downsample (and avoids dividing by zero)
                return delayed
            # sampling without replacement needs a fraction in [0, 1]
            sample_fraction = min(1.0, delayed_count * 1.0 / not_delayed_count)
            sample_not_delayed = not_delayed.sample(fraction=sample_fraction, seed=1)
            return sample_not_delayed.union(delayed)

        # The label column was renamed to 'label' above, so name it
        # explicitly instead of filtering on the pre-rename column.
        df_train_ready = undersample(df_ready, label_col='label')

        # min max scale
        minMaxScaler = MinMaxScaler(inputCol='features', outputCol='features_scaled')
        scaler = minMaxScaler.fit(df_full)
        df_train_ready = scaler.transform(df_train_ready).select('features_scaled', 'label')

        # PCA
        pca = PCA(k=125, inputCol='features_scaled', outputCol='features_transformed')
        pca_model = pca.fit(df_train_ready)
        df_train_ready = pca_model.transform(df_train_ready)

        # Train LR on the PCA output
        df_train_ready = df_train_ready.select(['features_transformed', 'label']).withColumnRenamed("features_transformed", "features")
        lr = LogisticRegression(maxIter=25, regParam=.1, elasticNetParam=0)
        lrModel = lr.fit(df_train_ready)

        # ------------------------------------------------------------------
        # Run the fitted stages on the input data, carrying the row id along
        # ------------------------------------------------------------------
        scored = index_model.transform(df_with_id).drop(*categorical_string_features)
        scored = scored.na.fill(value=0) # lots of missing values, impute them
        scored = encoder_model.transform(scored).drop(*categorical_features)
        scored = vectorAssembler.transform(scored).select('features', id_col)
        scored = scaler.transform(scored).select('features_scaled', id_col)
        scored = pca_model.transform(scored)
        scored = lrModel.transform(scored.withColumnRenamed("features_transformed", "features"))

        # Extract the probability of the positive class (index 1)
        get_item = F.udf(lambda v: float(v[1]), types.FloatType())
        scored = (
            scored.withColumn("lr_prob", get_item('probability'))
                  .withColumnRenamed('prediction', 'lr_prediction')
                  .select(id_col, 'lr_prob', 'lr_prediction')
        )

        # join preds to original dataset on the shared id and return it
        return df_with_id.join(scored, on=id_col, how='left').drop(id_col)
    
    
    def ensemble_predict(self, df, verbose=False):
        '''
        Combine the random forest, XGBoost and logistic-regression predictions.

        The input is tagged with an ``id`` column once, before any model runs.
        Each model method returns its input columns plus its own prediction
        columns, so the three outputs are joined on that shared id. Generating
        a fresh id on each model's output would not line up, because
        monotonically_increasing_id depends on partitioning.

        Args:
            df: spark.DataFrame accepted by rf_predict, xgb_predict and lr_predict.
            verbose: when True, print a progress line before each model runs.

        Output:
        spark.DataFrame with the input columns, ``id``, the per-model
        ``*_prob`` / ``*_prediction`` columns, and:
          final_prediction -- 1 when at least two of the three models predict 1, else 0
          final_prob       -- mean of the three model probabilities
        '''

        # one id for all three models
        df_with_id = df.withColumn('id', F.monotonically_increasing_id())

        if verbose:
            print('Making random forest predictions')
        df_rf = self.rf_predict(df_with_id)

        if verbose:
            print('Making XGBoost predictions')
        df_xgb = self.xgb_predict(df_with_id).select('id', 'xgb_prob', 'xgb_prediction')

        if verbose:
            print('Making Logistic Regression predictions')
        df_lr = self.lr_predict(df_with_id).select('id', 'lr_prob', 'lr_prediction')

        # join them
        df_all = df_rf.join(df_xgb, on='id', how='left').join(df_lr, on='id', how='left')

        # sometimes the indexer needs to skip a few unforeseen airlines. In this case let's just predict 0 for now. Can possibly revisit this later.
        # (.49 keeps the imputed probability just on the "not delayed" side of 0.5)
        df_all = df_all.na.fill({
            'lr_prediction': 0, 'rf_prediction': 0, 'xgb_prediction': 0,
            'lr_prob': .49, 'rf_prob': .49, 'xgb_prob': .49})

        # final predicted class: majority vote of the three models
        votes = F.col('rf_prediction') + F.col('xgb_prediction') + F.col('lr_prediction')
        df_all = df_all.withColumn('final_prediction', F.when(votes >= 2, 1).otherwise(0))
        # final probability: unweighted mean of the three model probabilities
        df_all = df_all.withColumn('final_prob', (F.col('rf_prob') + F.col('xgb_prob') + F.col('lr_prob')) / 3)

        return df_all
        

# Functions for classification metrics

In [0]:
# return f1 and f2 scores
def eval_class_metrics(df_results_train, df_results_test):
    '''
    Print weighted F1 and F2 scores for a train and a test result set.

    The scores come from MulticlassMetrics.weightedFMeasure, i.e. they are
    averaged over the classes weighted by class frequency, not the F-score
    of the positive class alone.

    Args:
        df_results_train: spark.DataFrame with 'prediction' and 'label' columns.
        df_results_test: spark.DataFrame with 'prediction' and 'label' columns.

    Both columns are cast to double here, so callers may pass integer
    predictions or labels without casting first.
    '''

    def _metrics(df_results):
        # MulticlassMetrics expects an RDD of (prediction, label) doubles
        pairs = df_results.select(
            F.col('prediction').cast(types.DoubleType()).alias('prediction'),
            F.col('label').cast(types.DoubleType()).alias('label'),
        )
        return MulticlassMetrics(pairs.rdd)

    metrics_train = _metrics(df_results_train)
    metrics_test = _metrics(df_results_test)

    print('\nTrain F1 score from package')
    print(metrics_train.weightedFMeasure(1.0))
    print('Test F1 score from package')
    print(metrics_test.weightedFMeasure(1.0))

    print('\nTrain F2 score from package')
    print(metrics_train.weightedFMeasure(2.0))
    print('Test F2 score from package')
    print(metrics_test.weightedFMeasure(2.0))


In [0]:
# --- Azure Blob Storage coordinates -----------------------------------------
# Container and storage account are created in https://portal.azure.com;
# the secret scope and key are created locally with the Databricks CLI.
blob_container = "main-storage"
storage_account = "team05w261"
secret_scope = "team05"
secret_key = "team05-key"
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

# Register the storage account access key with the Spark session. The key is
# fetched from the secret scope and passed straight through, never stored in
# a notebook variable.
account_key_option = f"fs.azure.account.key.{storage_account}.blob.core.windows.net"
spark.conf.set(account_key_option, dbutils.secrets.get(scope=secret_scope, key=secret_key))

# Full joined dataset used by the evaluation cells
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")

# Model Evaluation

## XGBoost

In [0]:
model = EnsemblePredict()
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")

# Only rows with a known label are evaluated.
# Train split: YEAR_AIRLNS <= 2018; test split: YEAR_AIRLNS == 2019.
has_label = F.col('DEP_DEL15_AIRLNS').isNotNull()
df_train = model.xgb_predict(df.filter(F.col('YEAR_AIRLNS') <= 2018).filter(has_label))
df_test = model.xgb_predict(df.filter(F.col('YEAR_AIRLNS') == 2019).filter(has_label))

# Rename to the generic column names eval_class_metrics expects.
xgb_result_columns = [
    F.col('xgb_prediction').alias('prediction'),
    F.col('xgb_prob').alias('probability'),
    F.col('DEP_DEL15_AIRLNS').alias('label'),
]
df_results_train = df_train.select(*xgb_result_columns)
df_results_test = df_test.select(*xgb_result_columns)

eval_class_metrics(df_results_train, df_results_test)

## Random Forest

In [0]:
model = EnsemblePredict()
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")

# Only rows with a known label are evaluated.
# Train split: YEAR_AIRLNS <= 2018; test split: YEAR_AIRLNS == 2019.
has_label = F.col('DEP_DEL15_AIRLNS').isNotNull()
df_train = model.rf_predict(df.filter(F.col('YEAR_AIRLNS') <= 2018).filter(has_label))
df_test = model.rf_predict(df.filter(F.col('YEAR_AIRLNS') == 2019).filter(has_label))

# Rename to the generic column names eval_class_metrics expects.
rf_result_columns = [
    F.col('rf_prediction').alias('prediction'),
    F.col('rf_prob').alias('probability'),
    F.col('DEP_DEL15_AIRLNS').alias('label'),
]
df_results_train = df_train.select(*rf_result_columns)
df_results_test = df_test.select(*rf_result_columns)

eval_class_metrics(df_results_train, df_results_test)

## Logistic Regression

In [0]:
# The logistic-regression predictions were generated once and saved to blob
# storage; this cell only reads them back. They were produced with:
#
#   model = EnsemblePredict()
#   df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")
#   df_train = model.lr_predict(df.filter(F.col('YEAR_AIRLNS') <= 2018).filter(F.col('DEP_DEL15_AIRLNS').isNotNull()))
#   df_test = model.lr_predict(df.filter(F.col('YEAR_AIRLNS') == 2019).filter(F.col('DEP_DEL15_AIRLNS').isNotNull()))
#   df_train.write.parquet(f"{blob_url}/lr_predictions_train_tj")
#   df_test.write.parquet(f"{blob_url}/lr_predictions_test_tj")
#
# NOTE(review): the snippet previously recorded here called model.rf_predict;
# lr_predict is presumably what was actually run, since the lr_* columns are
# read below -- confirm.
#
# An alternative set of predictions (from yi) is stored under
# f"{blob_url}/final_lr_train_predictions_1" and
# f"{blob_url}/final_lr_test_predictions_1".

df_train = spark.read.parquet(f"{blob_url}/lr_predictions_train_tj")
df_test = spark.read.parquet(f"{blob_url}/lr_predictions_test_tj")

# Rename to the generic column names eval_class_metrics expects.
lr_result_columns = [
    F.col('lr_prediction').alias('prediction'),
    F.col('lr_prob').alias('probability'),
    F.col('DEP_DEL15_AIRLNS').alias('label'),
]
df_results_train = df_train.select(*lr_result_columns)
df_results_test = df_test.select(*lr_result_columns)

eval_class_metrics(df_results_train, df_results_test)

## Ensemble

In [0]:
# The ensemble predictions were generated once and written to blob storage;
# this cell only reads them back. They were produced with:
#
#   model = EnsemblePredict()
#   df_train = model.ensemble_predict(df.filter(F.col('YEAR_AIRLNS') <= 2018))
#   df_test = model.ensemble_predict(df.filter(F.col('YEAR_AIRLNS') == 2019))
#   df_train.write.parquet(f"{blob_url}/ensemble_predictions_train")
#   df_test.write.parquet(f"{blob_url}/ensemble_predictions_test")

# start from here: keep only rows with a known label
has_label = F.col('DEP_DEL15_AIRLNS').isNotNull()
df_train = spark.read.parquet(f"{blob_url}/ensemble_predictions_train").filter(has_label)
df_test = spark.read.parquet(f"{blob_url}/ensemble_predictions_test").filter(has_label)

# Rename to the generic column names eval_class_metrics expects; prediction
# and label are cast to double for the metrics computation.
ensemble_result_columns = [
    F.col('final_prediction').cast(types.DoubleType()).alias('prediction'),
    F.col('final_prob').alias('probability'),
    F.col('DEP_DEL15_AIRLNS').cast(types.DoubleType()).alias('label'),
]
df_results_train = df_train.select(*ensemble_result_columns)
df_results_test = df_test.select(*ensemble_result_columns)

eval_class_metrics(df_results_train, df_results_test)