### Supporting functions

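Supporting functions used throughout the notebook: mounting blob storage, loading files into Spark DataFrames, scoring predictions, time-series cross-validation, and PageRank features.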

In [0]:
# Import Functions to be utilized throughout Workbook
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, from_unixtime, expr,lpad,lag, row_number,concat, lit,count,substr,substring,coalesce,when,hour
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import mlflow
import json
import os

from pyspark.sql.types import FloatType, IntegerType, DateType, StringType
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, cos, sin, radians, col, explode, array, lit

from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline, Estimator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder
from graphframes import *

def mount_to_storage(storage_account='vbui',
                     blob_container='team62container',
                     secret_key='cso',
                     secret_scope='vbui'):
    '''
    Make sure the blob container is mounted at /mnt/blob_storage, then list it.

    The mount is only created when no existing mount already uses that mount
    point. The SAS credential is fetched through ``dbutils.secrets``.

    :param storage_account: Storage account name used in the wasbs URL.
    :param blob_container: Blob container to mount.
    :param secret_key: Key of the secret that holds the SAS credential.
    :param secret_scope: Secret scope in which ``secret_key`` is looked up.
    :return: The result of ``dbutils.fs.ls`` on the mount point.
    '''
    already_mounted = any(
        mount.mountPoint == "/mnt/blob_storage" for mount in dbutils.fs.mounts()
    )

    if not already_mounted:
        sas_config_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"
        dbutils.fs.mount(
            source = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net",
            mount_point = "/mnt/blob_storage",
            extra_configs = {sas_config_key: dbutils.secrets.get(scope=secret_scope, key=secret_key)})

    # Whether freshly mounted or already present, report the mount contents.
    return dbutils.fs.ls("/mnt/blob_storage")


def import_file(file_name,
                file_type,
                mount_point='/mnt/blob_storage/'):
    '''
    Read a file from the mounted blob storage into a Spark DataFrame.

    :param file_name: Name (or relative path) of the file under ``mount_point``.
    :param file_type: Either ``'parquet'`` or ``'csv'``. CSV files are read
        with ``header=True``.
    :param mount_point: DBFS mount point that contains the file. A trailing
        slash is optional.
    :return: The DataFrame returned by ``spark.read``.
    :raises ValueError: If ``file_type`` is not ``'parquet'`` or ``'csv'``.
    '''
    # Fail loudly on an unknown type instead of silently returning None,
    # which would only surface later as an AttributeError on the "DataFrame".
    if file_type not in ('parquet', 'csv'):
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'parquet' or 'csv'."
        )

    # Normalise the separator so mount_point works with or without a trailing '/'.
    path = f"dbfs:{mount_point.rstrip('/')}/{file_name}"

    if file_type == 'parquet':
        return spark.read.parquet(path)
    return spark.read.csv(path, header=True)


def cal_fbeta_score(predictions, beta = 0.5):
  '''
  Compute weighted precision, weighted recall and F-beta scores for predictions.

  Two F-beta values are returned:
    * ``'fbeta-score'`` is derived here from the weighted precision and the
      weighted recall.
    * ``'fMeasure'`` is the evaluator's own ``weightedFMeasure`` for the same
      ``beta``.

  :param predictions: DataFrame with ``label`` and ``prediction`` columns.
  :param beta: Weight of recall relative to precision in the F-beta formula.
  :return: Dict with keys ``'precision'``, ``'recall'``, ``'fbeta-score'``
      and ``'fMeasure'``.
  '''
  def _evaluate(metric_name):
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName=metric_name)
    return evaluator.evaluate(predictions)

  precision = _evaluate("weightedPrecision")
  recall = _evaluate("weightedRecall")

  fmeasure = MulticlassClassificationEvaluator(metricName="weightedFMeasure", beta = beta).evaluate(predictions)

  # When precision and recall are both zero the F-beta formula is 0/0;
  # report 0.0 rather than raising ZeroDivisionError.
  denominator = beta ** 2 * precision + recall
  if denominator:
    fbeta = (1 + beta ** 2) * (precision * recall) / denominator
  else:
    fbeta = 0.0

  results = {
    'precision' : precision,
    'recall': recall,
    'fbeta-score': fbeta,
    'fMeasure': fmeasure
  }
  return results


def time_series_cv(preprocessing: list, model, data, total_rows, metric_function = cal_fbeta_score, num_folds: int = 3):
  """
  Perform time series cross-validation and return a weighted average score.

  ``data`` is cut into ``num_folds`` consecutive folds of
  ``total_rows // num_folds`` rows each. For every fold after the first, a
  pipeline of ``preprocessing + [model]`` is fitted on the immediately
  preceding fold and scored on the current fold. The per-fold scores are
  combined with weights 1, 2, ..., so later folds count more.

  MLflow autologging is disabled while the folds are processed and re-enabled
  afterwards, even if fitting or scoring raises.

  :param preprocessing: List of pipeline stages placed before ``model``.
  :param model: Final pipeline stage (the estimator to evaluate).
  :param data: Dataset to split; assumed to be sorted by time already.
  :param total_rows: Number of rows in ``data``.
  :param metric_function: Callable taking the predictions and returning a
      mapping that contains a ``'fbeta-score'`` entry.
  :param num_folds: Number of folds; at least 2 so that there is one
      train/test pair.
  :return: Weighted average of the per-fold ``'fbeta-score'`` values.
  :raises ValueError: If ``num_folds`` is smaller than 2.
  """
  if num_folds < 2:
    raise ValueError(f"num_folds must be at least 2, got {num_folds}")

  mlflow.autolog(disable=True)
  folds = []
  try:
    print('============= Starting cross validation =============')
    # Split the data into folds respecting the temporal order.
    # NOTE(review): this relies on limit() returning the same leading rows on
    # every call, and subtract() is a set difference, so duplicate rows inside
    # a fold may be collapsed -- confirm this matches the intended split.
    fold_size = total_rows // num_folds
    folds = [data.limit(fold_size * (i + 1)).subtract(data.limit(fold_size * i)).cache() for i in range(num_folds)]
    print('============= Finished spliting into folds =============')

    # Walk forward through time: train on the previous fold, test on this one.
    metrics = []
    for i in range(1, num_folds):
      train = folds[i - 1]
      test = folds[i]

      # Train and evaluate the model
      pipeline = Pipeline(stages=preprocessing + [model])
      fitted_pipeline = pipeline.fit(train)
      predictions = fitted_pipeline.transform(test)
      metrics.append(metric_function(predictions)['fbeta-score'])
  finally:
    # Release the cached folds and restore autologging on success and failure.
    for fold in folds:
      fold.unpersist()
    mlflow.autolog(disable=False)

  # Weighted average across folds, with weight k for the k-th evaluated fold.
  weights = np.arange(1, len(metrics) + 1)
  return np.sum(np.asarray(metrics) * weights) / np.sum(weights)

def find_pagerank(df):
  """
  Compute a PageRank score per ORIGIN from the ORIGIN -> DEST edges in ``df``.

  :param df: DataFrame with ``ORIGIN`` and ``DEST`` columns.
  :return: DataFrame with columns ``ORIGIN`` and ``pagerank``.
  """
  # One edge per row of df, pointing from ORIGIN to DEST.
  edge_df = df.select("ORIGIN", "DEST").selectExpr("ORIGIN as src", "DEST as dst")

  # NOTE(review): vertices are taken from ORIGIN only, so a value that appears
  # solely in DEST is not a vertex -- confirm that this is intended.
  vertex_df = df.selectExpr("ORIGIN as id").distinct()

  ranked = GraphFrame(vertex_df, edge_df).pageRank(resetProbability=0.15, maxIter=20)

  # Expose the vertex id under the ORIGIN name so it can be joined back.
  scores = ranked.vertices.select("id", "pagerank")
  return scores.withColumnRenamed("id", "ORIGIN")



def join_pagerank(df,pagerank_df):
  """
  Attach PageRank scores to ``df`` with a left join on ``ORIGIN``.

  Rows of ``df`` without a match in ``pagerank_df`` are kept.
  """
  return df.join(pagerank_df, on="ORIGIN", how="left")

In [0]:
# Load the full ("big") flights table from blob storage.
# Alternative file previously used here:
# file_name = 'Draft_Final_DF_1Y_2.00'
df = import_file('Draft_Final_DF_ALL', 'parquet')

# Cast FL_DATE to a date and order the rows chronologically.
df = df.withColumn('FL_DATE', F.col('FL_DATE').cast(DateType())).sort('FL_DATE')

df.display()

MONTH,FL_DATE,OP_CARRIER_AIRLINE_ID,ORIGIN,DEST,DEP_DEL15,PREVIOUS_FLIGHT_ARRIVED_LATE,PREVIOUS_DIVERTED,PLANE_FORECAST_TURNAROUND_TIME,FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP,FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION,FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION,YEAR,QUARTER,DAY_OF_MONTH,DAY_OF_WEEK
1,2015-01-01,20304,DVL,DEN,1,1.0,0.0,-103.0,0,0,0,2015,1,1,4
1,2015-01-01,20366,SAF,DEN,0,,,,0,0,0,2015,1,1,4
1,2015-01-01,20304,MSP,CVG,0,0.0,0.0,105.0,18,37,5,2015,1,1,4
1,2015-01-01,20304,RHI,MSP,0,0.0,0.0,62.0,0,0,0,2015,1,1,4
1,2015-01-01,19790,ATW,ATL,0,,,,0,0,0,2015,1,1,4
1,2015-01-01,20304,RHI,IMT,0,0.0,0.0,14.0,0,0,0,2015,1,1,4
1,2015-01-01,20355,CLT,IND,0,0.0,0.0,78.0,44,36,2,2015,1,1,4
1,2015-01-01,20366,RIC,IAH,0,,,,0,0,0,2015,1,1,4
1,2015-01-01,20366,ABE,ATL,0,0.0,0.0,34.0,0,0,0,2015,1,1,4
1,2015-01-01,19790,RIC,ATL,0,,,,1,0,0,2015,1,1,4


## Modelling

### Preprocessing

In [0]:
# Columns carried from the source table into the train/test sets.
features = [
    'MONTH', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN', 'DEST',
    'PREVIOUS_FLIGHT_ARRIVED_LATE', 'PREVIOUS_DIVERTED',
    'PLANE_FORECAST_TURNAROUND_TIME',
    'FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION',
    'FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION',
    'FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP',
]

# The Spark ML estimators below read the target from a column named 'label'.
df = df.withColumnRenamed(existing='DEP_DEL15', new='label')

# Temporal hold-out: years before 2019 for training, 2019 for testing.
train_set = df.filter(df.YEAR < 2019)
test_set = df.filter(df.YEAR == 2019)
# train_set = upsample(train_set)

# Keep only the model columns and replace missing values with 0.
train_set = train_set.select(features + ['label']).fillna(0)
test_set = test_set.select(features + ['label']).fillna(0)

# Optional PageRank feature (currently disabled):
# page_rank_df = find_pagerank(train_set)

# train_set = join_pagerank(train_set, page_rank_df).fillna(0)
# test_set = join_pagerank(test_set, page_rank_df).fillna(0)

# NOTE(review): PREVIOUS_FLIGHT_ARRIVED_LATE and PREVIOUS_DIVERTED are selected
# above but appear in neither string_cols nor numerical_cols, so they never
# reach the assembled 'features' vector -- confirm whether that is intended.
string_cols = ['ORIGIN', 'DEST', 'OP_CARRIER_AIRLINE_ID']
numerical_cols = [
    'PLANE_FORECAST_TURNAROUND_TIME',
    'FLIGHTS_DEPARTED_2HRS_BEFORE_PREDICTION',
    'FLIGHTS_DELAYED_2HRS_BEFORE_PREDICITION',
    'FLIGHTS_SCHEDULED_2HRS_OR_LESS_BEFORE_CRS_DEP',
]

# Pipeline stages, in execution order.
preprocessing = []

# Categorical columns: index the values, then one-hot encode the index.
for string_col in string_cols:
    string_index = StringIndexer(inputCol=string_col, outputCol="indexed_" + string_col, handleInvalid='keep')
    list_onehot = OneHotEncoder(inputCol="indexed_" + string_col, outputCol="encoded_" + string_col, handleInvalid='keep')
    preprocessing += [string_index, list_onehot]

# MONTH is one-hot encoded directly, without an indexing step.
encoded_MONTH = 'encoded_MONTH'
preprocessing.append(OneHotEncoder(inputCol='MONTH', outputCol=encoded_MONTH, handleInvalid='keep'))

used_features = ['encoded_' + name for name in string_cols] + [encoded_MONTH]

# Numerical columns: assemble into one vector, then scale it.
numerical_ass = VectorAssembler(inputCols = numerical_cols, outputCol = 'numerical_features')
standard_sc = StandardScaler(inputCol = 'numerical_features', outputCol = 'scaled_numerical_features')
preprocessing += [numerical_ass, standard_sc]

# Final model input: scaled numerical vector followed by the encoded columns.
vector_ass = VectorAssembler(inputCols=['scaled_numerical_features'] + used_features, outputCol='features')
preprocessing.append(vector_ass)

# total_rows = train_set.count()

## Models

### GBT

In [0]:
# Hyperparameters for the gradient-boosted tree classifier.
gbt_params = {
    'featuresCol': 'features',
    'labelCol': 'label',
    'maxBins': 126,
    'maxDepth': 11,
    'minInstancesPerNode': 3,
    'stepSize': 0.27286063089767754,
    'subsamplingRate': 0.6445718010290064,
}
gbt = GBTClassifier(**gbt_params)
gbt_pipeline = Pipeline(stages=preprocessing + [gbt])

# Train on train_set, score on test_set and record the outcome in one MLflow run.
with mlflow.start_run():
    print('Begin Training')
    model = gbt_pipeline.fit(train_set)
    print('Finished Fitting')

    test_predictions = model.transform(test_set)
    print('Finished Predicting')

    test_eval = cal_fbeta_score(test_predictions)
    print('Finished Calculating')

    # Prefix every metric with 'test_' to mark it as a hold-out score.
    result = {'test_' + name: value for name, value in test_eval.items()}

    mlflow.log_metrics(result)
    mlflow.log_param('features', features)
    mlflow.log_param('ML Algo', 'GBTClassifier')

Begin Training
Finished Fitting
Finished Predicting
Finished Calculating
