In [0]:
from pyspark.sql import types, Window, functions as F
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegressionModel, LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler, PCA, PCAModel, MinMaxScaler, MinMaxScalerModel
from pyspark.mllib.evaluation import MulticlassMetrics


In [0]:
# --- Azure Blob Storage connection settings -------------------------------
blob_container = "main-storage"   # container created in https://portal.azure.com
storage_account = "team05w261"    # storage account created in https://portal.azure.com
secret_scope = "team05"           # Databricks secret scope (created with the Databricks CLI)
secret_key = "team05-key"         # name of the secret stored inside that scope
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

# Hand the storage-account key to Spark. The key is read from the secret
# scope at run time, so it never appears in the notebook source.
spark.conf.set(
    f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
    dbutils.secrets.get(scope=secret_scope, key=secret_key),
)

# Load the joined dataset and keep flights up to and including 2018 as the
# training split. A 2019 split (`df_test`) is NOT built in this cell.
df = spark.read.parquet(f"{blob_url}/all_time_full_join_6")
df_train = df.where(F.col('YEAR_AIRLNS') <= 2018)

In [0]:
def lr_predict(df):
    """Append logistic-regression predictions to ``df``.

    Pipeline: string-index and one-hot encode the categorical columns,
    assemble every remaining column into a feature vector, min-max scale it,
    project it with the PCA model stored on DBFS, and score it with the
    LogisticRegressionModel stored on DBFS.

    Args:
        df: Spark DataFrame containing the categorical columns listed below
            plus ``DEP_DEL15_AIRLNS`` (label) and ``FL_DATE_AIRLNS``. Every
            other column is fed to ``VectorAssembler`` as a feature.

    Returns:
        ``df`` with two extra columns: ``lr_prob`` (float, element 1 of the
        model's ``probability`` vector) and ``lr_prediction``. Rows that
        ``VectorAssembler(handleInvalid='skip')`` drops keep their place in
        the output (left join) with null predictions.

    NOTE(review): the StringIndexer, OneHotEncoder and MinMaxScaler are re-fit
    on whatever frame is passed in, while the PCA and LR models are loaded
    from disk. Confirm the re-fit encodings match what those persisted models
    were trained on.
    """
    row_id = 'lr_id'
    label_col = 'DEP_DEL15_AIRLNS'
    date_col = 'FL_DATE_AIRLNS'

    # Tag each input row ONCE, before any transformation, and carry the tag
    # through the whole pipeline. Generating ids separately on the input and
    # on the scored frame does not work: handleInvalid='skip' can drop rows
    # in between, after which the two id sequences no longer refer to the
    # same rows and predictions get attached to the wrong flights.
    df = df.withColumn(row_id, F.monotonically_increasing_id())
    df2 = df.alias('df2')

    categorical_string_features = [
      'ORIGIN_AIRLNS',
      'DEST_AIRLNS',
      'OP_UNIQUE_CARRIER_AIRLNS'
    ]

    categorical_features = [
      'ORIGIN_AIRLNS_indexed',
      'DEST_AIRLNS_indexed',
      'OP_UNIQUE_CARRIER_AIRLNS_indexed',
      'HOLIDAY',
      'Prev_Flight_Delay_15',
      'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep'
    ]

    # String -> numeric index for the raw categorical columns.
    indexer = StringIndexer(
        inputCols=categorical_string_features,
        outputCols=[col + '_indexed' for col in categorical_string_features],
    )
    index_model = indexer.fit(df2)
    df_indexed = index_model.transform(df2).drop(*categorical_string_features)

    # A missing previous-flight flag is treated as "not delayed" (0).
    df_indexed = df_indexed.na.fill(value=0, subset=["Prev_Flight_Delay_15"])

    # One-hot encode every categorical column and drop the index columns.
    encoder = OneHotEncoder(
        inputCols=categorical_features,
        outputCols=[col + '_vec' for col in categorical_features],
    )
    encoder_model = encoder.fit(df_indexed)
    df_encoded = encoder_model.transform(df_indexed).drop(*categorical_features)

    # Everything except the row id, the label and the flight date is a feature.
    non_feature_columns = {row_id, label_col, date_col}
    actual_feature_columns = [c for c in df_encoded.columns if c not in non_feature_columns]
    vectorAssembler = VectorAssembler(
        inputCols=actual_feature_columns, outputCol='features', handleInvalid='skip'
    )
    df_ready = (
        vectorAssembler.transform(df_encoded)
        .select(row_id, 'features', label_col)
        .withColumnRenamed(label_col, 'label')
    )

    # Scale, then project with the persisted PCA model. The selects below rely
    # on that model writing its output to `features_transformed`.
    scaler = MinMaxScaler(inputCol='features', outputCol='features_scaled').fit(df_ready)
    df_scaled = scaler.transform(df_ready).select(row_id, 'features_scaled', 'label')
    pca = PCAModel.load('dbfs:/' + 'files/shared_uploads/yizhang7210@berkeley.edu/pca_for_lr')
    df_pca = (
        pca.transform(df_scaled)
        .select(row_id, 'features_transformed', 'label')
        .withColumnRenamed("features_transformed", "features")
    )

    # Score with the persisted logistic-regression model.
    model_path = 'files/shared_uploads/yizhang7210@berkeley.edu/final_logistic_regression'
    final_model_loaded = LogisticRegressionModel.load('dbfs:/' + model_path)
    df_scored = final_model_loaded.transform(df_pca)

    # Element 1 of the probability vector is P(label == 1).
    get_item = F.udf(lambda v: float(v[1]), types.FloatType())
    df_preds = (
        df_scored
        .withColumn("lr_prob", get_item('probability'))
        .withColumnRenamed('prediction', 'lr_prediction')
        .select(row_id, 'lr_prob', 'lr_prediction')
    )

    # Attach predictions to the original rows via the shared id, then drop it.
    return df.join(df_preds, on=row_id, how='left').drop(row_id)

In [0]:
# count() is a Spark action, so this cell runs a job over the training split.
train_row_count = df_train.count()
print(train_row_count)

In [0]:
# Score the training split and preview the result.
# NOTE(review): `df_train` is rebound to its own scored version, so running
# this cell a second time feeds a frame that already has `lr_prob` /
# `lr_prediction` back into `lr_predict`.
scored_train = lr_predict(df_train)
df_train = scored_train
display(df_train)

In [0]:
# Step-by-step version of the indexing / encoding stage, run on the training frame.
df2 = df_train.alias('df2')

# Raw string-typed categoricals; these are indexed before encoding.
categorical_string_features = ['ORIGIN_AIRLNS', 'DEST_AIRLNS', 'OP_UNIQUE_CARRIER_AIRLNS']

# Everything handed to the one-hot encoder: the three indexed columns above
# plus three columns that are one-hot encoded as they are.
categorical_features = [
    'ORIGIN_AIRLNS_indexed',
    'DEST_AIRLNS_indexed',
    'OP_UNIQUE_CARRIER_AIRLNS_indexed',
    'HOLIDAY',
    'Prev_Flight_Delay_15',
    'Enough_Time_Btwn_Estimate_Arrival_and_Planned_Dep',
]

# String indexing: fit on the frame, transform it, discard the raw string
# columns, and fill missing previous-flight flags with 0.
indexed_names = [name + '_indexed' for name in categorical_string_features]
indexer = StringIndexer(inputCols=categorical_string_features, outputCols=indexed_names)
index_model = indexer.fit(df2)
df_indexed = (
    index_model.transform(df2)
    .drop(*categorical_string_features)
    .na.fill(value=0, subset=["Prev_Flight_Delay_15"])
)

# One-hot encoding: fit, transform, and discard the un-encoded inputs.
vec_names = [name + '_vec' for name in categorical_features]
encoder = OneHotEncoder(inputCols=categorical_features, outputCols=vec_names)
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed).drop(*categorical_features)

display(df_encoded)

In [0]:
# Preview the indexed and one-hot-encoded frame (same call as at the end of the previous cell).
display(df_encoded)

In [0]:
# Start putting data into trainable form.
# Columns that must never enter the feature vector:
#   - the label and the flight date,
#   - the helper row id,
#   - `lr_prob` / `lr_prediction`, which `df_train` carries once it has been
#     through `lr_predict`. Leaving them in would feed the model's own output
#     back in as input and widen the vector beyond what the persisted PCA and
#     LR models were built on.
# Names that are absent from the frame are simply not matched.
non_feature_columns = {
    'DEP_DEL15_AIRLNS',
    'FL_DATE_AIRLNS',
    'lr_id',
    'lr_prob',
    'lr_prediction',
}
actual_feature_columns = [i for i in df_encoded.columns if i not in non_feature_columns]

# handleInvalid='skip' drops rows with null / invalid feature values.
vectorAssembler = VectorAssembler(inputCols=actual_feature_columns, outputCol='features', handleInvalid='skip')
df_ready = (
    vectorAssembler.transform(df_encoded)
    .select(['features', 'DEP_DEL15_AIRLNS', 'FL_DATE_AIRLNS'])
    .withColumnRenamed("DEP_DEL15_AIRLNS", "label")
)

In [0]:
# Preview the assembled frame (`features`, `label`, `FL_DATE_AIRLNS`).
display(df_ready)

In [0]:
# Fit the min-max scaler on the assembled features and persist it.
# - The path carries the explicit 'dbfs:/' prefix so it is the exact location
#   the loading cell reads from ('dbfs:/' + 'files/.../minmax_for_lr').
# - write().overwrite() lets the cell be re-run; a plain save() raises once
#   the target path already exists.
minmax_model_path = 'dbfs:/' + 'files/shared_uploads/yizhang7210@berkeley.edu/minmax_for_lr'
minMaxScaler = MinMaxScaler(inputCol='features', outputCol='features_scaled').fit(df_ready)
minMaxScaler.write().overwrite().save(minmax_model_path)

In [0]:
# Reload the persisted min-max scaler model from DBFS; the scoring cell below applies it.
minmax_model_uri = 'dbfs:/' + 'files/shared_uploads/yizhang7210@berkeley.edu/minmax_for_lr'
scaler = MinMaxScalerModel.load(minmax_model_uri)

In [0]:




# PCA: scale with the reloaded min-max model, then project with the persisted PCA model.
# NOTE(review): `df_ready` is rebound in place, so this cell cannot be re-run
# without first re-running the assembly cell that creates it.
df_ready = scaler.transform(df_ready).select('features_scaled', 'label')
pca = PCAModel.load('dbfs:/' + 'files/shared_uploads/yizhang7210@berkeley.edu/pca_for_lr')
df_ready = pca.transform(df_ready)
df_ready = df_ready.select(['features_transformed', 'label']).withColumnRenamed("features_transformed", "features")

# make preds
model_path = 'files/shared_uploads/yizhang7210@berkeley.edu/final_logistic_regression'
final_model_loaded = LogisticRegressionModel.load('dbfs:/' + model_path)
df_ready = final_model_loaded.transform(df_ready)

# Extract probabilities: element 1 of the probability vector is P(label == 1).
get_item = F.udf(lambda v: float(v[1]), types.FloatType())
df_ready = df_ready.withColumn("lr_prob", get_item('probability'))
df_ready = df_ready.withColumnRenamed('prediction', 'lr_prediction')
df_ready = df_ready.select('lr_prob', 'lr_prediction')
df_ready = df_ready.withColumn('lr_id', F.monotonically_increasing_id())

# Join the predictions back onto the frame they were computed from.
# - `df_ready` was assembled from `df_train`, so `df_train` is the matching
#   left side. The full `df` never receives an `lr_id` column at notebook
#   scope, so joining on it could not resolve the key.
# - Any `lr_prob` / `lr_prediction` already on `df_train` are dropped first so
#   the join does not produce duplicate column names.
# NOTE(review): the ids on the two sides are generated independently. If the
# assembler's handleInvalid='skip' dropped rows upstream, they will not line
# up. The robust fix is to attach the id before feature assembly and carry it
# through, which means changing the earlier cells as well.
df = (
    df_train
    .drop('lr_prob', 'lr_prediction')
    .withColumn('lr_id', F.monotonically_increasing_id())
    .join(df_ready, on='lr_id', how='left')
    .drop('lr_id')
)

In [0]:
# Build the (prediction, label) rows that MulticlassMetrics consumes.
# NOTE(review): neither `df_test` nor an `rf_prediction` column is created
# anywhere in the visible cells — confirm where they are expected to come from.
df_test_rdd = (
    df_test
    .select('rf_prediction', 'dep_del15_airlns')
    .withColumnRenamed('rf_prediction', 'prediction')
    .withColumnRenamed('dep_del15_airlns', 'label')
    .rdd
)
metrics = MulticlassMetrics(df_test_rdd)

# classification metrics
cm = metrics.confusionMatrix().toArray()

# F2 for the POSITIVE class (label 1.0). The manual check below is built from
# class-1 precision and recall, so the library call has to score the same
# class for the two printed numbers to be comparable.
print(f'f2 score with package: {metrics.fMeasure(1.0, 2.0)}')

# confirm I'm getting the same f score here
# Confusion-matrix layout: rows are actual classes, columns are predicted
# classes, both ordered by ascending label. So cm[1][1] = true positives,
# cm[0][1] = false positives, cm[1][0] = false negatives.
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = (cm[1][1]) / (cm[1][1] + cm[0][1])
recall = (cm[1][1]) / (cm[1][1] + cm[1][0])

def f_score(beta, precision, recall):
    """Return the F-beta score for the given precision and recall.

    F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall)

    Args:
        beta: Weight of recall relative to precision (2 gives F2).
        precision: Precision of the class being scored.
        recall: Recall of the class being scored.

    Returns:
        The F-beta score, or 0.0 when the denominator is zero (precision and
        recall are both 0), rather than dividing by zero.
    """
    beta_sq = beta ** 2
    denominator = beta_sq * precision + recall
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * precision * recall / denominator

# Hand-computed F2, for comparison with the library value printed earlier in this cell.
manual_f2 = f_score(2, precision, recall)
print(f'f2 score from first principles: {manual_f2}')

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Shape the scored frame into the columns the evaluator reads.
# NOTE(review): the visible cells only ever add `lr_prob` / `lr_prediction`
# to `df_train`; confirm that `xgb_prob` / `xgb_prediction` exist on it.
predictions = (
    df_train
    # The evaluator accepts a double or a vector score column, so the
    # probability is cast to double explicitly.
    .withColumn('probability', F.col('xgb_prob').cast('double'))
    .withColumnRenamed('xgb_prediction', 'rawPrediction')
    .withColumnRenamed('DEP_DEL15_AIRLNS', 'label')
    .select('probability', 'label', 'rawPrediction')
    .filter(F.col('label').isNotNull())
)

# Rank by the predicted PROBABILITY, not by the hard 0/1 prediction. ROC and
# PR curves built from a 0/1 column have a single operating point, which makes
# both areas degenerate.
evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='probability')

# The evaluator offers only two metrics: area under ROC and area under PR.
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))