In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, Imputer, MinMaxScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier

from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType, IntegerType, DateType, StringType

from hyperopt import fmin, tpe, Trials, SparkTrials, hp
from hyperopt.early_stop import no_progress_loss
import mlflow
import mlflow.spark

import tensorflow as tf
from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras.models import Sequential
tf.random.set_seed(42)

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

2024-04-19 19:59:04.782878: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [0]:
def mount_to_storage(storage_account='vbui',
                     blob_container='team62container',
                     secret_key='cso',
                     secret_scope='vbui'):
    '''
    Function Used to Mount to blob Storage
    '''
    current_mounts = dbutils.fs.mounts()
    if len([1 for x in current_mounts if x.mountPoint == "/mnt/blob_storage"])>0:
        return dbutils.fs.ls("/mnt/blob_storage")
    else:
        dbutils.fs.mount(
            source = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net",
            mount_point = "/mnt/blob_storage",
            extra_configs = {f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net": dbutils.secrets.get(scope=secret_scope, key=secret_key)})
    return dbutils.fs.ls("/mnt/blob_storage")


def import_file(file_name,
                file_type,
                mount_point='/mnt/blob_storage/'):
    '''
    Function used to Import files from Blob Storage, can read both parquet and csv files.
    '''
    if file_type=='parquet':
        return spark.read.parquet(f"dbfs:{mount_point}{file_name}")
    elif file_type=='csv':
        return spark.read.csv(f"dbfs:{mount_point}{file_name}",header=True)


def cal_fbeta_score(predictions, beta = 0.5):
  ''' Calcuate fbeta-score '''
  # Calculate fbeta-score
  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
  precision = evaluator.evaluate(predictions)

  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
  recall = evaluator.evaluate(predictions)

  results = {
    'precision' : precision,
    'recall': recall,
    'fbeta-score': (1 + beta ** 2) * (precision * recall) / (beta**2 * precision + recall)
  }
  return results


def time_series_cv(preprocessing: [], model, data, total_rows, metric_function = cal_fbeta_score, num_folds: int = 3):
    """
    Perform time series cross-validation.

    :param model_class: The ML model class to be trained.
    :param hyperparams: Dictionary of hyperparameters to tune.
    :param data: The dataset to be used for training and validation.
    :param evaluator: The evaluator to be used for model evaluation.
    :param num_folds: Number of folds for time series cross-validation.
    """
    # Assume the data is sorted by time dring preprocessing.

    # Split the data into folds respecting the temporal order.
    fold_size = total_rows // num_folds
    folds = [data.limit(fold_size * (i + 1)).subtract(data.limit(fold_size * i)).cache() for i in range(num_folds)]

    # Iterate over each combination of parameters
    metrics = []
    for i in range(1, num_folds):
        train = folds[i-1]
        test = folds[i]
        
        # Train and evaluate the model
        pipeline = Pipeline(stages=preprocessing + [model])
        fitted_pipeline = pipeline.fit(train)
        predictions = fitted_pipeline.transform(test)
        metric = metric_function(predictions)['fbeta-score']
        metrics.append(metric)
      
    # Calculate the average metric across all folds for the current parameter combination
    scalars = np.array([i for i in range(1, len(metrics) + 1)])
    return np.sum(metrics * scalars) / np.sum(scalars)

In [0]:
df = import_file('Draft_Final_DF_1Y_3.00','parquet')

display(df)

MONTH,FL_DATE,OP_CARRIER_AIRLINE_ID,ORIGIN,DEST,DEP_DEL15,PREVIOUS_FLIGHT_ARRIVED_LATE,PREVIOUS_DIVERTED,PLANE_FORECAST_TURNAROUND_TIME,FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP,FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION,FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION,YEAR,QUARTER,DAY_OF_MONTH,DAY_OF_WEEK
1,2019-01-01,20304,ABE,DTW,0,,,,0,0,0,2019,1,1,3
1,2019-01-01,20363,ABE,ATL,0,,,,1,0,0,2019,1,1,3
1,2019-01-01,20397,ABE,CLT,0,,,,2,0,0,2019,1,1,3
1,2019-01-01,20368,ABE,SFB,0,0.0,0.0,63.0,0,0,0,2019,1,1,3
1,2019-01-01,20397,ABE,CLT,0,0.0,0.0,46.0,1,0,0,2019,1,1,3
1,2019-01-01,20368,ABE,PIE,0,0.0,0.0,76.0,0,2,0,2019,1,1,3
1,2019-01-01,20397,ABE,CLT,0,0.0,0.0,51.0,0,1,0,2019,1,1,3
1,2019-01-01,20304,ABE,DTW,0,0.0,0.0,30.0,1,1,0,2019,1,1,3
1,2019-01-02,20304,ABE,DTW,0,1.0,0.0,326.0,0,0,0,2019,1,2,4
1,2019-01-02,20363,ABE,ATL,0,0.0,0.0,629.0,1,0,0,2019,1,2,4


In [0]:
mlflow.autolog(disable=True)

df = df.withColumn('FL_DATE', df['FL_DATE'].cast(DateType()))
df = df.sort(df.FL_DATE)
features = ['MONTH', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST', 'PREVIOUS_FLIGHT_ARRIVED_LATE', 
            'PREVIOUS_DIVERTED', 'PLANE_FORECAST_TURNAROUND_TIME',
            'FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION', 'FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION', 'FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP']

df = df.withColumnRenamed(existing='DEP_DEL15', new='label')

train_set, test_set = df.filter(df.QUARTER < 4), df.filter(df.QUARTER == 4)
train_set = train_set.select(features + ['label']).fillna(0)
test_set = test_set.select(features + ['label']).fillna(0)

preprocessing = []

string_cols = ['ORIGIN', 'DEST', 'OP_CARRIER_AIRLINE_ID']
numerical_cols = ['PLANE_FORECAST_TURNAROUND_TIME', 'FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION', 'FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION', 'FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP']
for string_col in string_cols:
    string_index = StringIndexer(inputCol=string_col, outputCol="indexed_" + string_col, handleInvalid='keep')
    preprocessing.append(string_index)
    list_onehot = OneHotEncoder(inputCol="indexed_" + string_col, outputCol="encoded_" + string_col, handleInvalid='keep')
    preprocessing.append(list_onehot)

encoded_MONTH = 'encoded_MONTH'
preprocessing.append(OneHotEncoder(inputCol='MONTH', outputCol=encoded_MONTH, handleInvalid='keep'))

used_features = ['encoded_' + col for col in string_cols] + [encoded_MONTH]

numerical_ass = VectorAssembler(inputCols = numerical_cols, outputCol = 'numerical_features')
preprocessing.append(numerical_ass)

standard_sc = StandardScaler(inputCol = 'numerical_features', outputCol = 'scaled_numerical_features')
preprocessing.append(standard_sc)

vector_ass = VectorAssembler(inputCols=['scaled_numerical_features'] + used_features, outputCol='features')
preprocessing.append(vector_ass)

# preprocess_pipeline = Pipeline(stages = preprocessing)
# fitted_preprocess_pipeline = preprocess_pipeline.fit(train_set)
# processed_train_set = fitted_preprocess_pipeline.transform(train_set)
# processed_test_set = fitted_preprocess_pipeline.transform(test_set)

total_rows = train_set.count()

In [0]:
num_eval = 20
mlflow.autolog(disable=True)

## Logistic Regression

In [0]:
def lr_objective(params):
    lr = LogisticRegression(**params)
    return -time_series_cv(preprocessing, lr, data=train_set, total_rows=total_rows)

lr_spaces = {
    'elasticNetParam': hp.uniform('elasticNetParam', 0, 1)
}

trials = Trials()
best_lr = fmin(lr_objective, space = lr_spaces,
               algo=tpe.suggest, max_evals=num_eval, trials=trials, early_stop_fn=no_progress_loss(5))

print(best_lr)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]  5%|▌         | 1/20 [02:02<38:38, 122.05s/trial, best loss: -0.7789976560062071] 10%|█         | 2/20 [03:47<33:39, 112.21s/trial, best loss: -0.7789976560062071] 15%|█▌        | 3/20 [05:28<30:23, 107.28s/trial, best loss: -0.7789976560062071] 20%|██        | 4/20 [07:19<29:01, 108.82s/trial, best loss: -0.7789976560062071] 25%|██▌       | 5/20 [09:02<26:38, 106.54s/trial, best loss: -0.7789976560062071] 30%|███       | 6/20 [10:56<25:28, 109.18s/trial, best loss: -0.7789976560062071] 35%|███▌      | 7/20 [12:38<23:08, 106.83s/trial, best loss: -0.7789976560062071] 40%|████      | 8/20 [14:35<21:58, 109.90s/trial, best loss: -0.7789976560062071] 45%|████▌     | 9/20 [16:16<19:38, 107.12s/trial, best loss: -0.7789976560062071] 50%|█████     | 10/20 [18:19<18:40, 112.08s/trial, best loss: -0.7789976560062071] 55%|█████▌    | 11/20 [20:02<16:23, 109.24s/trial, best loss: -0.7789976560062071] 60%|██████    | 12/20 [21:55

In [0]:
def rf_objective(params):
    rf = RandomForestClassifier(**params, bootstrap=False)
    return -time_series_cv(preprocessing, rf, data=train_set, total_rows=total_rows)

rf_spaces = {
  "numTrees": hp.randint('numTrees', 100, 500),
  "maxDepth": hp.randint('maxDepth', 5, 29),
  "maxBins":  hp.randint('maxBins', 10, 64),
  "minInstancesPerNode": hp.randint('minInstancesPerNode', 2, 12),
  "minInfoGain": hp.uniform('minInfoGain', 0, 1),
}

trials = Trials()
best_rf = fmin(rf_objective, space=rf_spaces, algo = tpe.suggest, 
               max_evals=num_eval, trials=trials, early_stop_fn=no_progress_loss(5))
print(best_rf)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]  5%|▌         | 1/20 [05:26<1:43:14, 326.02s/trial, best loss: -0.6721180227831884] 10%|█         | 2/20 [12:01<1:50:08, 367.13s/trial, best loss: -0.6721180227831884] 15%|█▌        | 3/20 [18:22<1:45:48, 373.43s/trial, best loss: -0.6721180227831884] 20%|██        | 4/20 [21:13<1:18:17, 293.57s/trial, best loss: -0.6721180227831884] 25%|██▌       | 5/20 [27:19<1:19:54, 319.66s/trial, best loss: -0.6721180227831884] 25%|██▌       | 5/20 [27:19<1:21:59, 327.98s/trial, best loss: -0.6721180227831884]
{'maxBins': 46, 'maxDepth': 20, 'minInfoGain': 0.9670683775019292, 'minInstancesPerNode': 2, 'numTrees': 239}


In [0]:
def gbt_objective(params):
    gbt = GBTClassifier(**params)
    return -time_series_cv(preprocessing, gbt, data=train_set, total_rows=total_rows)

gbt_spaces = {
    "maxDepth": hp.randint('maxDepth', 1, 29), "maxBins": hp.randint('maxBins', 16, 128),
    "minInstancesPerNode": hp.randint('minInstancesPerNode', 3, 12),
    "stepSize": hp.uniform('stepSize', 0.1, 0.5), 
    "subsamplingRate":  hp.uniform('subsamplingRate', 0.1, 0.7)
    }

trials = Trials()
best_gbt = fmin(gbt_objective, space=gbt_spaces, algo = tpe.suggest, 
               max_evals=num_eval, trials=trials, early_stop_fn=no_progress_loss(5))
print(best_gbt)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]  5%|▌         | 1/20 [1:59:46<37:55:45, 7186.59s/trial, best loss: -0.8941692993553477] 10%|█         | 2/20 [3:02:15<25:49:15, 5164.22s/trial, best loss: -0.8941692993553477] 15%|█▌        | 3/20 [6:57:43<43:42:59, 9257.60s/trial, best loss: -0.8941692993553477] 20%|██        | 4/20 [7:25:41<27:50:40, 6265.03s/trial, best loss: -0.8941692993553477] 25%|██▌       | 5/20 [12:50:51<46:00:22, 11041.48s/trial, best loss: -0.8941692993553477] 25%|██▌       | 5/20 [12:50:51<38:32:35, 9250.38s/trial, best loss: -0.8941692993553477] 
{'maxBins': 126, 'maxDepth': 11, 'minInstancesPerNode': 3, 'stepSize': 0.27286063089767754, 'subsamplingRate': 0.6445718010290064}


In [0]:
mlflow.autolog(disable=False)

2024/04/20 08:50:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2024/04/20 08:50:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/04/20 08:50:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2024/04/20 08:50:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2024/04/20 08:50:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
