# Phase 3:

#### Team 14: Carlos Moreno, Elizabeth Khan, Jagan Lakshmipathy, and Ziling Huang

__PIPELINE TESTING:__  

**For more detail on EDA work please see the following notebook:**  
``````

**For more detail on JOIN WORK please see the following notebook:**
``````

### PIPELINE

#### 1. Setup and import libraries

In [0]:
from pyspark.sql.functions import *
import pyspark.sql.functions as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from graphframes import *
import geopandas as gpd
import plotly as plotly

# Import the custom cross-validation module (time-series aware cross validation)
spark.sparkContext.addPyFile("dbfs:/custom_cv.py")
from custom_cv import CustomCrossValidator

pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 200)

#Import for implementing upsampling or downsampling
from pyspark.sql.functions import col, explode, array, lit

#from heatmap import heatmap, corrplot
from pyspark.ml import *
from pyspark.ml.linalg import *
from pyspark.ml.stat import *
from pyspark.ml.feature import *
from pyspark.sql.window import *

# Append weights to the dataset
from pyspark.sql.functions import col
from pyspark.sql.functions import when

# ML related libraries
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler, Imputer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.linalg import Vectors
from itertools import combinations

# Blob credentials
# These names are combined into the wasbs:// URL (blob_url) that the read/write cells below use.
blob_container = "cemgr14c" # The name of your container created in https://portal.azure.com
storage_account = "cemgr14" #The name of your Storage account created in https://portal.azure.com
secret_scope = "w261gr14" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "keygr14" # The name of the secret key created in your local computer using the Databricks CLI
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
# NOTE(review): mount_path is not referenced anywhere else in this section - confirm it is still needed.
mount_path = "/mnt/mids-w261"

# SAS Token
# The token value is fetched from the Databricks secret scope at run time, so it is never written in the notebook.

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

#### 2. Functions to Evaluate Models 
##### Accuracy, Recall, Precision, F1_score, F2_score, F05_Score

In [0]:
# Helping Functions for Evaluating Models
def extract(row):
  '''
  Flatten one scored row into a plain tuple.

  Input: row exposing DEP_DEL15, probability (a vector with toArray()), label and prediction.
  Output: tuple (DEP_DEL15, <one entry per probability component>, label, prediction).
  '''
  class_probs = row.probability.toArray().tolist()
  return (row.DEP_DEL15, *class_probs, row.label, row.prediction)

def score(model,data):
  '''
  Input: model = fitted model exposing transform(); data = dataframe to score.
  Output: dataframe with columns DEP_DEL15, p0, p1, label, prediction.
  The five column names assume a two-component probability vector (binary classifier).
  '''
  out_cols = ["DEP_DEL15", "p0", "p1", "label", "prediction"]
  scored = model.transform(data)
  scored = scored.select("DEP_DEL15", "probability", "label", "prediction")
  # extract() splits the probability vector into one value per component
  flat_rows = scored.rdd.map(extract)
  return flat_rows.toDF(out_cols)

In [0]:
def model_metric(prediction):
  '''
  Input: prediction dataframe whose columns at index 3 and 4 are the label and the prediction
         (the column layout produced by score()).
  Output: tuple (counts, metrics)
    counts: cached RDD of (outcome, count) pairs, outcome being one of 'TP', 'TN', 'FP', 'FN'.
    metrics: dictionary with keys Accuracy, Precision, Recall, Specificity, F1_Score, F05_Score, F2_Score.
             A metric whose denominator is zero is reported as the string "NA".
  This implementation uses map-reduce approach to calculating the metrics - created to facilitate
  calculation with very large datasets. The positive class is the value 1.0.
  '''

  # Emit one (outcome, 1) pair per row so the outcomes can be summed with reduceByKey.
  # This helper is deliberately NOT called `counts`: the RDD built below has that name, and a lambda
  # that closes over a name later rebound to an RDD is fragile.
  def emit_outcome(label, pred):
    if label == pred:
      return ('TP', 1) if label == 1.0 else ('TN', 1)
    return ('FN', 1) if label == 1.0 else ('FP', 1)

  # Based on the outcome counts, calculate the performance metrics
  def cmetrics(outcome_counts):

    # At most four (outcome, count) pairs come back to the driver; outcomes never seen count as zero.
    tallies = dict(outcome_counts.collect())
    TP, TN, FP, FN = (float(tallies.get(key, 0)) for key in ('TP', 'TN', 'FP', 'FN'))

    def ratio(num, den):
      # "NA" marks an undefined metric (zero denominator), e.g. no positive predictions at all.
      return num / den if den != 0 else "NA"

    precision = ratio(TP, TP + FP)
    recall = ratio(TP, TP + FN)

    def fbeta(beta):
      # F-beta is undefined when precision or recall is undefined, or when both are zero.
      if precision == "NA" or recall == "NA" or (recall + precision) == 0:
        return "NA"
      return (1+beta**2)*(recall * precision) / ((beta**2 * precision) + recall)

    # Key order matters: metricsdf() uses it as the row order of the summary dataframe.
    metric_dic = {}
    metric_dic["Accuracy"] = ratio(TP + TN, TP + TN + FP + FN)   # "NA" for an empty input
    metric_dic["Precision"] = precision
    metric_dic["Recall"] = recall
    metric_dic["Specificity"] = ratio(TN, TN + FP)
    metric_dic["F1_Score"] = fbeta(1)
    metric_dic["F05_Score"] = fbeta(0.5)
    metric_dic["F2_Score"] = fbeta(2)

    return metric_dic

  counts = prediction.rdd.map(lambda x: emit_outcome(x[3], x[4])) \
                         .reduceByKey(lambda x, y: x+y).cache()

  metrics = cmetrics(counts)

  return counts, metrics

In [0]:
def metricsdf(dfdata, names):
  '''
  Input: 
     dfdata: list of dataframes with predictions and labels (layout expected by model_metric)
     names: list of model names, one per dataframe, used as column names of the output.
     
   Output: pandas dataframe with one column per input dataframe and one row per metric.
           An empty dfdata returns an empty dataframe (the original raised NameError).
  '''
  
  df_data = {}
  metric_names = []   # stays empty when there is nothing to evaluate
  
  for df, name in zip(dfdata, names):
    _, MetMod = model_metric(df)
    df_data[name] = list(MetMod.values())
    # every call returns the same keys in the same order, so the last one labels the rows
    metric_names = list(MetMod.keys())

  dfM = pd.DataFrame(df_data, index = metric_names)
  
  return dfM

In [0]:
def metricsdf2(metrics):
  '''
  Input: metrics object exposing recall(label), precision(label) and accuracy
         (the callers pass a pyspark.mllib MulticlassMetrics instance).
  Output: dictionary with keys Accuracy, Precision, Recall, F1_Score, F05_Score, F2_Score,
          computed for the positive class 1.0.
  An F-score whose denominator is zero is reported as "NA" (same convention as model_metric),
  so the dictionary always has the same six keys; the original silently omitted such keys.
  '''
     
  recall = metrics.recall(1.0)
  precision = metrics.precision(1.0)

  def fbeta(beta):
    denom = (beta**2 * precision) + recall
    if denom == 0:
      return "NA"
    return (1+beta**2)*(recall * precision) / denom

  dfmetric = {}
  dfmetric["Accuracy"] = metrics.accuracy
  dfmetric["Precision"] = precision
  dfmetric["Recall"] = recall
  dfmetric["F1_Score"] = fbeta(1)
  dfmetric["F05_Score"] = fbeta(0.5)
  dfmetric["F2_Score"] = fbeta(2)
  
  return dfmetric

In [0]:
# Function to get distinct values of a dataframe for the categorical variables
def distinctvals(df, categoricals):
  
  '''
  Input: 
    df: dataframe with all variables.
    categoricals: names of the categorical variables in the dataframe
  Output: dictionary mapping each categorical variable to its number of distinct values (levels).
  The count is done by Spark; the original collected every distinct value to the driver just to
  take its length.
  '''
  
  return {colN: df.select(colN).distinct().count() for colN in categoricals}

In [0]:
def numattributes(df, categoricals, numerics):
  '''
  Input:
    df: dataframe with attributes as columns.
    categoricals: list of all categorical variables in dataframe
    numerics: list of all numerical variables in dataframe
  Output:
    Number of variables in dataframe where each level for a categorical variable count as one, while each numeric
    variable count as one.
  The original ignored `numerics`, contradicting this contract; they are now included.
  '''
  # distinct levels are counted by Spark instead of being collected to the driver
  n_levels = 0
  for f in categoricals:
    n_levels += df.select(f).distinct().count()
  return n_levels + len(numerics)

# nfeatures = numattributes(train_val, categoricals, numerics)
# print(nfeatures)

#### 3. Read Joined Data and Select Data for Models

In [0]:
# READ JOINED DATA
# Only one of the reads below is active; the commented-out ones are earlier versions of the joined dataset.

# NEW LINK FOR DATA INCLUDING TIME OF DAY
# df_airlines = spark.read.parquet(f"{blob_url}/mlpipeline_set_time_of_day_zl_0403").cache()

# READ DATA INCLUDING AIRPORT RANK.
# df_airlines = spark.read.parquet(f"{blob_url}/df_airlines_rank").cache()
# Active read: the result is cached because the cells below reuse it several times.
airlines = spark.read.parquet(f"{blob_url}/df_airlines_rank_graphs").cache()

# CONTINUE WITH A PORTION OF DATA FOR TESTING
# NOTE(review): the sampling line below reads `airlines_total`, which is not defined in this section -
# to use it, the read above would have to be assigned to `airlines_total` first.
# proportion = 0.05   # 0.025
# (airlines, airline_rest) = airlines_total.randomSplit([proportion, 1- proportion], seed=123)

#CONTINUE WITH ALL DATA
# airlines.display()

In [0]:
# Select only the columns needed (calendar fields, carrier/airport ids, delay outcomes, weather at origin
# and destination, and the engineered ranking / rolling-average features), then cache the narrowed frame.
airlines = airlines.select('YEAR','QUARTER','MONTH','DAY_OF_MONTH','DAY_OF_WEEK',
                                         'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID','DEST_AIRPORT_ID',
                                         'CRS_DEP_TIME','DEP_DELAY','DEP_DELAY_NEW','DEP_DEL15','DEP_DELAY_GROUP','DISTANCE','DISTANCE_GROUP',
                                         'wind_speed_mps_orig','ceiling_ht_dim_orig','visibility_meters_orig','temp_cels_orig','dew_pt_orig','atmos_press_orig','precip_milimeters_orig',
                                         'wind_speed_mps_dest','ceiling_ht_dim_dest','visibility_meters_dest','temp_cels_dest','dew_pt_dest','atmos_press_dest','precip_milimeters_dest',
                                         'rolling_ninety_day_average','Air_Page_Rank_traffic','Delay_block','OD_delay_pair',
                                         'SEASON','WKDAY','DEPARTURE_Hour_CRS', 'time_of_day_int', 'Cnn_Ranking_val','Delay_Ranking_val',
                                         'Coalesced_PgRank_orig', 'Coalesced_PgRank_dest', 'Conn_Ranking_orig', 'Conn_Ranking_dest').cache()

# OUTCOME VARIABLES: DEP_TIME  DEP_DEL15
# Drop every row that has a null in any of the selected columns.
airlines = airlines.dropna()

# CHECK FOR BALANCE - LARGE DATASET
# Row count per DEP_DEL15 class (DISTANCE is only the column being counted).
airlines.groupBy("DEP_DEL15").agg((count(col("DISTANCE"))).alias("COUNT_DISTANCE")).display()

#### 4. Using Delta Lakes - Skip if Using Airlines

In [0]:
# FULL DATA = airline_joins_sel
# This cell writes the selected data out as a Delta table and re-reads it, so later cells work from the Delta copy.

# Configure Path
#FIRST ITERATION
# DELTALAKE_GOLD_PATH = f"{blob_url}/airline_rank_CEM_CAR20.delta"
# WHEN COPYING BY OTHERS
DELTALAKE_GOLD_PATH = f"{blob_url}/airline_rank_CEM_CAR21.delta"

# Remove table if it exists
# WARNING: this recursively deletes whatever is stored at DELTALAKE_GOLD_PATH before the write below.
dbutils.fs.rm(DELTALAKE_GOLD_PATH, recurse=True)

# Save table as Delta Lake
airlines.write.format("delta").mode("overwrite").save(DELTALAKE_GOLD_PATH)

# Re-read as Delta Lake
# Note that the re-read frame is not cached here, unlike the parquet-based frame it replaces.
airlines = spark.read.format("delta").load(DELTALAKE_GOLD_PATH)

# CHECK FOR BALANCE - LARGE DATASET
# Row count per DEP_DEL15 class after the round trip.
airlines.groupBy("DEP_DEL15").agg((count(col("DISTANCE"))).alias("COUNT_DISTANCE")).display()

In [0]:
# Row count per (DEP_DEL15, Cnn_Ranking_val) combination, to see how the classes split across ranking values.
airlines.groupBy("DEP_DEL15", "Cnn_Ranking_val").agg((count(col("DISTANCE"))).alias("COUNT_DISTANCE")).display()

#### 5. Functions for Managing Imbalanced Data based on DEP_DEL15

In [0]:
def oversampling_Adj(data, column, val1, val2):

  '''
  Input: data = (dataframe with data), column = (column to check ratio for), val1 = majority value for column, val2 = minority value for column
  Output: tuple (combined_df, ratio)
    combined_df: majority rows plus the minority rows duplicated (and then down-sampled if the
                 duplication overshoots) to approximately the majority count.
    ratio: int(majority count / minority count).
  Raises: ValueError when there are no minority rows (the original failed with a bare ZeroDivisionError).
  '''
  
  major_df = data.filter(col(column) == val1)
  minor_df = data.filter(col(column) == val2)
  n_minor = minor_df.count()
  n_major = major_df.count()
  if n_minor == 0:
    raise ValueError(f"oversampling_Adj: no rows with {column} == {val2!r}; nothing to oversample")
  ratio = int(n_major/n_minor)
  
  # duplicate the minority rows using explode and lit functions: each row is emitted ratio+1 times.
  copies = array([lit(x) for x in range(ratio+1)])
  oversampled_df = minor_df.withColumn("dummy", explode(copies)).drop('dummy')

  # explode() yields exactly ratio+1 rows per minority row, so the size is known without another count job
  n_over = n_minor * (ratio + 1)
  # ratio+1 copies normally overshoot the majority count; trim back with a seeded sample without replacement
  if n_over>n_major: oversampled_df = oversampled_df.sample(False, n_major/n_over, 123)
  
  # combine both oversampled minority data and majority data 
  combined_df = major_df.unionAll(oversampled_df)
  return combined_df, ratio

In [0]:
def undersampling_Adj(data, column, val1, val2):

  '''
  Input: data = (dataframe with data), column = (column to check ratio for), val1 = majority value for column, val2 = minority value for column
  Output: tuple (combined_df, ratio)
    combined_df: a seeded sample of the majority rows, sized to approximately the minority count,
                 followed by all minority rows.
    ratio: minority count / majority count.
  Raises: ValueError when there are no majority rows (the original failed with a bare ZeroDivisionError).
  '''
  
  major_df = data.filter(col(column) == val1)
  minor_df = data.filter(col(column) == val2)
  n_major = major_df.count()
  n_minor = minor_df.count()
  if n_major == 0:
    raise ValueError(f"undersampling_Adj: no rows with {column} == {val1!r}; nothing to undersample")
  ratio = n_minor/n_major
    
  # A sample without replacement needs a fraction in [0, 1]; if val1 turns out NOT to be the larger
  # class, keep all of its rows instead of failing inside Spark.
  sampled_majority_df = major_df.sample(False, min(1.0, ratio), 123)
  combined_df = sampled_majority_df.unionAll(minor_df)

  return combined_df, ratio

In [0]:
def balancesampling_Adj(data, column, val1, val2, adj):

  '''
  Input: data = (dataframe with data), column = (column to check ratio for), val1 = majority value for column, val2 = minority value for column, adj = factor to adjust balance
  Output: tuple (combined_df, [ratio, majority_fraction])
    combined_df: majority rows down-sampled, and minority rows duplicated then sampled, both aiming
                 at t2 = (majority count + minority count) / 2 rows.
    ratio: number of copies made of each minority row, int((n1/n2)/adj), at least 1.
    majority_fraction: sampling fraction applied to the majority rows.
  Raises: ValueError when either class is empty or adj is not positive (the original failed with a
          bare ZeroDivisionError in these cases).
  '''
  
  major_df = data.filter(col(column) == val1)
  minor_df = data.filter(col(column) == val2)
  n1 = major_df.count()
  n2 = minor_df.count()
  if n1 == 0 or n2 == 0:
    raise ValueError(f"balancesampling_Adj: both classes of {column} must be non-empty (got {n1} and {n2} rows)")
  if adj <= 0:
    raise ValueError(f"balancesampling_Adj: adj must be positive (got {adj!r})")
  t2 = (n1 + n2)/2 
  # At least one copy of every minority row: ratio == 0 would produce an empty frame and a division by zero below.
  ratio = max(1, int((n1/n2)/adj))
  
  # duplicate the minority rows using explode and lit functions.
  copies = array([lit(x) for x in range(ratio)])
  oversampled_df = minor_df.withColumn("dummy", explode(copies)).drop('dummy')
  # explode() yields exactly `ratio` rows per minority row, so no extra count job is needed
  n3 = n2 * ratio
  # fractions are capped at 1.0 because both samples are taken without replacement
  minor_fraction = min(1.0, t2/n3)
  oversampled_df = oversampled_df.sample(False, minor_fraction, 123)
    
  # downsampling oversampled minority and majority data to achieve balanced data
  major_fraction = min(1.0, t2/n1)
  major_df = major_df.sample(False, major_fraction, 123)

  # combine both adjusted oversampled minority data and majority data 
  combined_df = major_df.unionAll(oversampled_df)
  return combined_df, [ratio, major_fraction]

In [0]:
def createSplits(data, year, last_year, delta, flag):
  '''
  Build the train/validation folds used by the time-series grid search.

  Input:
    data: dataframe to split (must have a YEAR column)
    year: first year to be used for validation
    last_year: last year to be considered for validation
    delta: number of years before the validation year included as training data
    flag: True to grow delta by one for each successive fold (expanding window)
  Output: dictionary {"df1": fold, "df2": fold, ...}; every fold carries a 'cv' column that is
          'train' for rows before the validation year and 'test' for the validation year itself.
  '''
  splits = {}
  fold, val_year, window = 1, year, delta
  while val_year <= last_year:
    dfname = f"df{fold}"
    print("Creating split ", fold," for ", dfname, "- Val Year: ", val_year, " Train year: ", val_year-window, "-", val_year-1)
    # rows from the validation year and the `window` years before it
    in_window = (data["YEAR"] <= val_year) & (data["YEAR"] >= val_year-window)
    role = when(data["YEAR"] <= val_year-1, 'train').otherwise('test')
    splits[dfname] = data.filter(in_window).withColumn('cv', role)
    fold += 1
    val_year += 1
    if flag:
      window += 1

  return splits

#### 6. Set Data for ML Models and Create Partitions for Grid Search

##### - Select Variables and Manage Imbalanced Data

In [0]:
# Outcome column used by every model in this notebook
myY = "DEP_DEL15"

# Earlier, wider variable selection - kept for reference:
# categoricals = [ 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
#                 'SEASON', 'WKDAY', 'DEPARTURE_Hour_CRS', 'time_of_day_int']

# numerics = [ 'DISTANCE', 'wind_speed_mps_orig', 'ceiling_ht_dim_orig', 'visibility_meters_orig', 'temp_cels_orig', 'dew_pt_orig', 'atmos_press_orig', 'precip_milimeters_orig', 'wind_speed_mps_dest', 'ceiling_ht_dim_dest', 'visibility_meters_dest', 'temp_cels_dest', 'dew_pt_dest', 'atmos_press_dest', 'precip_milimeters_dest', 'rolling_ninety_day_average', 'Air_Page_Rank_traffic','OD_delay_pair', 'Coalesced_PgRank_orig', 'Coalesced_PgRank_dest', 'Conn_Ranking_orig', 'Conn_Ranking_dest']

# NI = ['YEAR','Cnn_Ranking_val','Delay_Ranking_val' ]

# Active selection. Categoricals are string-indexed and one-hot encoded by the pipeline stages defined later.
categoricals = [ 'MONTH',  'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                'SEASON',  'DEPARTURE_Hour_CRS', 'time_of_day_int']

# Numeric features; these are imputed and assembled into the feature vector as-is.
numerics = [ 'wind_speed_mps_orig', 'ceiling_ht_dim_orig', 'visibility_meters_orig', 'temp_cels_orig', 'dew_pt_orig', 'precip_milimeters_orig', 'wind_speed_mps_dest', 'temp_cels_dest',  'precip_milimeters_dest', 'rolling_ninety_day_average', 'OD_delay_pair', 'Coalesced_PgRank_orig']

# Columns carried along but not used as model features (YEAR drives the train/validation/test splits).
NI = ['YEAR', 'Cnn_Ranking_val']


# 'CRS_DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP'

# Feature columns fed to the models
myX = categoricals + numerics

# SELECT THE COLUMNS WITH THE VARIABLES
airlines2 = airlines.select(myX + [myY, "DEP_DELAY_NEW", "DEP_DELAY"] + NI)

# UNCOMMENT LINE FOR DESIRED APPROACH TO BALANCE DATA
# (each helper returns the resampled dataframe plus the ratio it used)
# airlines2, rs = oversampling_Adj(airlines2, "DEP_DEL15", 0, 1)
# airlines2, rs = undersampling_Adj(airlines2, "DEP_DEL15", 0, 1)
# airlines2, rs = balancesampling_Adj(airlines2, "DEP_DEL15", 0, 1, 1)

#CREATE A COPY OF myY with name label to be used in the Grid Search 
airlines2 = airlines2.withColumn("label", airlines2[myY])

# Row count per label value (the alias says DISTANCE, but the column being counted is myY).
airlines2.groupBy("label").agg((count(col(myY))).alias("COUNT_DISTANCE")).display()

In [0]:
# Hold-out test set (YEAR > 2018) taken from `airlines`, i.e. BEFORE any of the optional resampling
# applied to airlines2, so test metrics reflect the original class balance.
# The filter uses col("YEAR") rather than airlines2.YEAR: a Column taken from a different dataframe
# only resolves while airlines2 is a plain projection of airlines, which stops being true as soon as
# one of the resampling lines (which union dataframes) is enabled.
test_unbalanced = (airlines.select(myX + [myY, "DEP_DELAY_NEW", "DEP_DELAY"] + NI)
                           .filter(col("YEAR") > 2018)
                           .withColumn("label", col(myY))   # same label column the models are trained on
                           .cache())


##### - Include a Weight Column for Weighted Models

In [0]:
# INCLUDE WEIGHTS FOR IMPLEMENTATION OF WEIGHTED MODELS
majority = airlines2.filter(col(myY) == 0).count()
minority = airlines2.filter(col(myY) == 1).count()
count_total = majority + minority

def _balanced_weights(total, n_minority, factor, n_classes=2):
  '''
  Input: total row count, minority row count, factor dividing the minority weight (1 = fully balanced),
         n_classes (2 for this binary problem).
  Output: (minority weight, majority weight). The majority weight does not depend on factor.
  '''
  w_minority = total / (n_classes * n_minority * factor)
  w_majority = total / (n_classes * (total - n_minority))
  return w_minority, w_majority

# The original cell unpacked the constants into `c, F`, which rebound `F` and clobbered the
# `import pyspark.sql.functions as F` alias for every later cell; no module-level `F` is assigned here.

# Weights (factor 1)
weight_minority, weight_majority = _balanced_weights(count_total, minority, 1)

# Weights (factor 0.98: minority weight slightly above the balanced value)
weight1_minority, weight1_majority = _balanced_weights(count_total, minority, 0.98)

# Weights (factor 0.95)
weight2_minority, weight2_majority = _balanced_weights(count_total, minority, 0.95)

# One weight column per variant; the models in create_model use the column named "weight".
airlines2 = airlines2.withColumn("weight", when(col(myY) ==1, weight_minority).otherwise(weight_majority))
airlines2 = airlines2.withColumn("weight1", when(col(myY) ==1, weight1_minority).otherwise(weight1_majority))
airlines2 = airlines2.withColumn("weight2", when(col(myY) ==1, weight2_minority).otherwise(weight2_majority))

##### - Split Data for Training, Validation and Testing.  Create Folds for Grid Search

In [0]:
# SPLIT DATA FOR TRAINING AND VALIDATION
# Years up to and including year_train_val go to training/validation; later years are the test set.
year_train_val = 2018
train_val = airlines2.filter(airlines2.YEAR <= year_train_val).cache()
test = airlines2.filter(airlines2.YEAR > year_train_val).cache()

# NOTE(review): trainCase is just another name for train_val here; it is reassigned in a later cell.
trainCase = train_val

# CREATE SPLITS FOR CROSS VALIDATION - FIRST VALIDATION ON 2016, AND LAST VALIDATION ON 2018 - EXPANDING WINDOW STARTING WITH STEP ONE.
# d maps "df1".."df3" to the folds; each fold has a 'cv' column marking rows as 'train' or 'test'.
d = createSplits(train_val, 2016, 2018, 1, True)

##### - Define Indexers, Ohes, Imputers, Scalers

In [0]:
## Current possible ways to handle categoricals in string indexer is 'error', 'keep', and 'skip'
# 'keep' puts unseen/invalid categories in an extra index instead of failing at transform time.
# The stages are built as real lists: the original used map(), whose one-shot iterators were left
# exhausted (silently empty) after the first list() call below.
indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid = 'keep') for c in categoricals]
ohes = [OneHotEncoder(inputCol=c + "_idx", outputCol=c+"_class", dropLast=True) for c in categoricals]
# Numeric columns are imputed in place (output column names equal the input names)
imputers = Imputer(inputCols = numerics, outputCols = numerics)

# Establish features columns: one-hot encoded categoricals followed by the numeric columns
featureCols = [c+"_class" for c in categoricals] + numerics

model_matrix_stages = indexers + ohes + [imputers] + \
                     [VectorAssembler(inputCols=featureCols, outputCol="features")]

# Apply StandardScaler to create scaledFeatures
# NOTE(review): withMean=True centers the data, which per the Spark docs produces dense vectors -
# confirm this is acceptable for the one-hot encoded airport columns.
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)

#### 7. Grid Search Pipeline
###### Models Available: Random Forest (RF), Logistic Regression (LogR), Gradient Boosted Tree (GBT), and Linear Support Vector Classifier (LSVC)

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def create_model(d_seg, model_matrix_stages, scaler, ModelType):
  '''
  Input: 
    - d_seg: data folds for Grid Search (as produced by createSplits: a 'cv' column marks 'train'/'test' rows)
    - model_matrix_stages: list of pipeline stages (indexers, ohes, imputers, vector assembler)
    - scaler: StandardScaler (no PCA included in pipeline)
    - ModelType: "RF" (Random Forest), "LogR" (Logistic Regression), "GBT" (Gradient Boosted Tree),
                 "LSVC" (Linear Support Vector Classifier) or "MPC" (Multilayer Perceptron Classifier)
  Output:
    - GridSearch Pipeline model fitted to data folds (d_seg)
  Raises:
    - ValueError if ModelType is not one of the codes above (the original fell through to an
      unbound-variable error when building the pipeline)
  '''

  if ModelType == "RF":
    # Define a Random Forest model - not using PCA
    md = RandomForestClassifier(featuresCol = 'scaledFeatures', labelCol = 'label',
                                featureSubsetStrategy='auto', 
                                impurity='gini',
                                seed=123)

    # Set the Grid Search set of parameters
    grid = ParamGridBuilder()\
                .addGrid(md.maxDepth, [5, 8, 10, 12, 14])\
                .addGrid(md.numTrees, [80, 100, 110, 150])\
                .build()
    
  elif ModelType == "LogR":
    # Define a weighted Logistic Regression model
    md = LogisticRegression(maxIter=20, featuresCol = "scaledFeatures", weightCol="weight")
    
    # Set the Grid Search set of parameters
    grid = ParamGridBuilder()\
                .addGrid(md.regParam, [0.1, 0.05, 0.01])\
                .addGrid(md.elasticNetParam, [0, 0.5, 1])\
                .build()
    
  elif ModelType == "GBT":
    
    # Define a GBT model.
    md = GBTClassifier(featuresCol="scaledFeatures",
                        labelCol="label",
                        lossType = "logistic",
                        maxBins = 350,   #Replaced 52
                        weightCol= 'weight')

    # Set the Grid Search set of parameters
    grid = ParamGridBuilder()\
                .addGrid(md.maxDepth, [5, 8, 10])\
                .addGrid(md.maxIter, [10, 15, 20])\
                .build()
    
  elif ModelType == "LSVC":
    # Define a Linear Support Vector Classifier model.
    md = LinearSVC(featuresCol="scaledFeatures",
                   labelCol="label",
                   maxIter=10,
                   weightCol= 'weight'
                  )

    # Set the Grid Search set of parameters
    grid = ParamGridBuilder()\
                .addGrid(md.regParam, [0.1, 0.05, 0.01])\
                .build()
 
  elif ModelType == "MPC":
    # Define a MPC model.
    # specify layers for the neural network:
    # input layer of size N (features),
    # and output of size 2 (classes)
    # NOTE(review): the input size 735 is hard-coded - it must equal the length of the assembled
    # feature vector, so it has to be updated whenever the variable selection changes.
    
    layers = [[735, 32, 64, 32, 2], [735, 8, 16, 8, 2], [735, 16, 32, 16, 2]]

#     layers = [[735, 32, 64, 32, 2], [735, 16, 32, 16, 2]]
    
    # NOTE(review): unlike the other models this one does not set featuresCol, so it reads the
    # default features column rather than "scaledFeatures" - confirm that is intended.
    md = MultilayerPerceptronClassifier(maxIter=100, blockSize=128, seed=1234)

    # Set the Grid Search set of parameters
    grid = ParamGridBuilder()\
                .addGrid(md.layers, layers)\
                .build()

  else:
    raise ValueError(f"Unknown ModelType {ModelType!r}; expected one of 'RF', 'LogR', 'GBT', 'LSVC', 'MPC'")

  # Build our ML pipeline: feature preparation, scaling, then the selected classifier
  
  pipeline = Pipeline(stages=model_matrix_stages+[scaler]+[md])


  # Evaluator constructed with default settings
  evaluator = BinaryClassificationEvaluator()

  # Execute CrossValidator for model tuning
  # splitWord / cvCol tell the custom validator which values of the 'cv' column mark train and test rows
  crossval = CustomCrossValidator(estimator=pipeline,
                                  estimatorParamMaps=grid,
                                  evaluator=evaluator,
                                  splitWord = ('train', 'test'),
                                  cvCol = 'cv',
                                  parallelism=4)

  # Train the tuned model and establish our best model
  Model_seg = crossval.fit(d_seg)
  
  return Model_seg

##### a. Grid Search for Random Forest (Batch Process)
###### Run Grid Search Pipeline, Print Metrics, Save Model

In [0]:
# Grid search for the Random Forest model
# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
modelName = "RF"

# Create and fit pipeline to data (d)
rf_pipeline = create_model(d, model_matrix_stages, scaler, modelName)

# Get best model and print performance metrics
rf_model = rf_pipeline.bestModel
pred = rf_model.transform(test).select("DEP_DEL15", "prediction")
# MulticlassMetrics takes (prediction, label) pairs, hence the swap of the two selected columns
metricsT = MulticlassMetrics(pred.rdd.map(lambda x: (x[1], x[0])))
m2 = metricsdf2(metricsT)
# One-column pandas dataframe: rows are the metric names, the column is the model name
dfM_rf = pd.DataFrame({modelName: list(m2.values())}, index = list(m2.keys()))
dfM_rf

RF: Best Model:
{Param(parent='RandomForestClassifier_6fe29fa7db1b', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 14, Param(parent='RandomForestClassifier_6fe29fa7db1b', name='numTrees', doc='Number of trees to train (>= 1).'): 250} Detailed Score [0.7124449378591117, 0.711729788988346, 0.7133298648993877] Avg Score 0.7125015305822817

Using 19 variables:
Best Model:  {Param(parent='RandomForestClassifier_030856b0d53a', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 14, Param(parent='RandomForestClassifier_030856b0d53a', name='numTrees', doc='Number of trees to train (>= 1).'): 110} Detailed Score [0.7117320780331156, 0.7112025132790081, 0.7107323084360173] Avg Score 0.711222299916047

In [0]:
# Erase MODEL
# Recursively removes any previously saved model at this path so the save below starts clean.
model_path = f"{blob_url}/models/rf_model"
dbutils.fs.rm(model_path, True)

# Save the MODEL
rf_model.save(model_path)

##### b. Grid Search for Logistic Regression (Batch Process)
###### Run Grid Search Pipeline, Print Metrics, Save Model

In [0]:
# Grid search for the Logistic Regression model
# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
modelName = "LogR"
logr_pipeline = create_model(d, model_matrix_stages, scaler, modelName)

# Score the best model on the test set and collect its metrics
logr_model = logr_pipeline.bestModel
pred = logr_model.transform(test).select("DEP_DEL15", "prediction")
# MulticlassMetrics takes (prediction, label) pairs, hence the swap of the two selected columns
metricsT = MulticlassMetrics(pred.rdd.map(lambda x: (x[1], x[0])))
m2 = metricsdf2(metricsT)
dfM_lr = pd.DataFrame({modelName: list(m2.values())}, index = list(m2.keys()))
dfM_lr

LogR: Best Model:  {Param(parent='LogisticRegression_b84ba01371d9', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_b84ba01371d9', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5} Detailed Score [0.6897965704488692, 0.6832426835147819, 0.6859631304773205] Avg Score 0.6863341281469906
Out[67]:

In [0]:
# Erase MODEL
# Recursively removes any previously saved model at this path so the save below starts clean.
model_path = f"{blob_url}/models/logr_model"
dbutils.fs.rm(model_path, True)

# Save the MODEL
logr_model.save(model_path)

##### c. Grid Search for GBT - Gradient Boosted Tree
###### Run Grid Search Pipeline, Print Metrics, Save Model

In [0]:
# Grid search for the Gradient Boosted Trees model
# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
modelName = "GBT"
gbt_pipeline = create_model(d, model_matrix_stages, scaler, modelName)

# Score the best model on the test set and collect its metrics
gbt_model = gbt_pipeline.bestModel
predgbt = gbt_model.transform(test).select("DEP_DEL15", "prediction")
# MulticlassMetrics takes (prediction, label) pairs, hence the swap of the two selected columns
metricsgbt = MulticlassMetrics(predgbt.rdd.map(lambda x: (x[1], x[0])))
mgbt = metricsdf2(metricsgbt)
dfgbt = pd.DataFrame({modelName: list(mgbt.values())}, index = list(mgbt.keys()))
dfgbt

GBT: Best Model:  {Param(parent='GBTClassifier_f85db42a2d71', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 5, Param(parent='GBTClassifier_f85db42a2d71', name='maxIter', doc='max number of iterations (>= 0).'): 20} Detailed Score [0.7095841683238695, 0.7070802054050617, 0.7107503772097722] Avg Score 0.7091382503129011

In [0]:
# Erase MODEL
# Recursively removes any previously saved model at this path so the save below starts clean.
model_path = f"{blob_url}/models/gbt_model"
dbutils.fs.rm(model_path, True)

# Save the MODEL
gbt_model.save(model_path)

##### d. Grid Search for LSVC Batch Process
###### Run Grid Search Pipeline, Print Metrics, Save Model

In [0]:
# Grid search for the Linear Support Vector Classifier model
# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
modelName = "LSVC"
lsvc_pipeline = create_model(d, model_matrix_stages, scaler, modelName)

# Score the best model on the test set and collect its metrics
lsvc_model = lsvc_pipeline.bestModel
predlsvc = lsvc_model.transform(test).select("DEP_DEL15", "prediction")
# MulticlassMetrics takes (prediction, label) pairs, hence the swap of the two selected columns
metricslsvc = MulticlassMetrics(predlsvc.rdd.map(lambda x: (x[1], x[0])))
mlsvc = metricsdf2(metricslsvc)
dflsvc = pd.DataFrame({modelName: list(mlsvc.values())}, index = list(mlsvc.keys()))
dflsvc

LSVC: Best Model:  {Param(parent='LinearSVC_63470e83e35e', name='regParam', doc='regularization parameter (>= 0).'): 0.01} Detailed Score [0.685944064672475, 0.6807187279762132, 0.6823800942462118] Avg Score 0.6830142956316334

In [0]:
# Erase MODEL
# Recursively removes any previously saved model at this path so the save below starts clean.
model_path = f"{blob_url}/models/lsvc_model"
dbutils.fs.rm(model_path, True)

# Save the MODEL
lsvc_model.save(model_path)

##### e. Summary Commands to Load Models

In [0]:
# The saved models are loaded through the PipelineModel class rather than through the in-memory
# rf_model / logr_model / ... objects, so this cell also works in a fresh session where the grid
# search cells have not been run.
# NOTE(review): assumes each saved bestModel is a fitted Pipeline (PipelineModel), which is what
# fitting the Pipeline estimator inside create_model should produce - confirm against custom_cv.
from pyspark.ml import PipelineModel

# Loading a Random Forest Saved Model
model_path = f"{blob_url}/models/rf_model"
rf_saved_model = PipelineModel.load(model_path)

# Loading a Logistic Regression Saved Model
model_path = f"{blob_url}/models/logr_model"
logr_saved_model = PipelineModel.load(model_path)

# Loading a Gradient Boosted Trees Saved Model
model_path = f"{blob_url}/models/gbt_model"
gbt_saved_model = PipelineModel.load(model_path)

# Loading a Linear Support Vector Classifier Saved Model
model_path = f"{blob_url}/models/lsvc_model"
lsvc_saved_model = PipelineModel.load(model_path)

#### 8. Generate Grid Searched Models in Batch (RF, LogR, GBT, LSVC)

In [0]:
# Run the grid search for every model type in one go and gather the test metrics side by side.
# Note: modelName is a list in this cell (it is a single string in the per-model cells above).
modelName = ["RF", "LogR", "GBT", "LSVC"]
modelList = []    # fitted cross-validation results, in the order of modelName
metricList = []   # one single-column metrics dataframe per model
i = 0             # 0 until the first model's metrics have been stored
for modelN in modelName:
  print("Working on Model: ", modelN)
  model = create_model(d, model_matrix_stages, scaler, modelN)
  bestmodel = model.bestModel
  
  pred = bestmodel.transform(test).select("DEP_DEL15", "prediction")
  # MulticlassMetrics takes (prediction, label) pairs, hence the swap of the two selected columns
  metricsT = MulticlassMetrics(pred.rdd.map(lambda x: (x[1], x[0])))
  m2 = metricsdf2(metricsT)
  dfM = pd.DataFrame({modelN: list(m2.values())}, index = list(m2.keys()))
  # The first model starts the consolidated table; later models are appended as new columns
  if i == 0:
    dfM_consolidated = dfM
    i += 1
  else:
    dfM_consolidated = pd.concat([dfM_consolidated,dfM],axis=1)
  
#   print(dfM)
  modelList.append(model)
  metricList.append(dfM)

In [0]:
# Display the metrics table built by the batch loop (one column per model)
dfM_consolidated

#### 9. Fit and Test Best Random Forest Model - No PCA

##### a. Random Forest Best Configuration

In [0]:
def rf_pipeline_best(data, maxDepth, numTrees, model_matrix_stages, scaler):
  """Fit a preprocessing + scaling + Random Forest pipeline (no PCA).

  Args:
    data: Spark DataFrame used to fit the pipeline.
    maxDepth: maximum tree depth passed to the Random Forest.
    numTrees: number of trees passed to the Random Forest.
    model_matrix_stages: list of pipeline stages run before the scaler.
    scaler: stage expected to produce the 'scaledFeatures' column the forest reads.

  Returns:
    The fitted pipeline model.
  """
  forest_settings = dict(
      featuresCol='scaledFeatures',
      labelCol='label',
      featureSubsetStrategy='auto',
      impurity='gini',
      maxDepth=maxDepth,
      numTrees=numTrees,
      seed=123,  # fixed seed so repeated fits are comparable
  )
  forest = RandomForestClassifier(**forest_settings)

  # Stage order: feature preparation, then scaling, then the classifier.
  stages = model_matrix_stages + [scaler] + [forest]
  return Pipeline(stages=stages).fit(data)

In [0]:
# DO NOT RUN THIS - MOVE TO NEXT CELL
# Reference cell (marked "DO NOT RUN" above): fits the Random Forest pipeline
# on a window of years taken from airlines2.
year_train_val = 2018
lower_end = 2015
# train_val = airlines2.filter(airlines2.YEAR <= year_train_val).cache()
# test = airlines2.filter(airlines2.YEAR > year_train_val).cache()

# Keep rows with lower_end <= YEAR <= year_train_val and cache them.
trainCase = airlines2.filter( (airlines2.YEAR <= year_train_val) & (airlines2.YEAR >= lower_end)  ).cache()
maxDepth = 14
numTrees = 250

# Fit preprocessing + scaler + Random Forest on that window.
rf_pipeline_test = rf_pipeline_best(trainCase, maxDepth, numTrees, model_matrix_stages, scaler)

In [0]:
# train_val.unpersist()
# airlines.unpersist()
# airlines2.unpersist()

In [0]:
# RUN THIS FOR RANDOM FOREST
# Random Forest with fixed hyper-parameters (depth 14, 110 trees), reading the
# 'scaledFeatures' column produced by `scaler`.
rf = RandomForestClassifier(featuresCol = 'scaledFeatures', labelCol = 'label',
                            featureSubsetStrategy='auto', 
                            impurity='gini',
                            maxDepth = 14,
                            numTrees = 110,
                            seed=123)

# Full pipeline: feature preparation stages -> scaler -> Random Forest.
pipelineRF = Pipeline(stages=model_matrix_stages+[scaler]+[rf])

# Fit on the train/validation split.
model_RF = pipelineRF.fit(train_val)

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
# Score the fitted Random Forest on the training split and on the
# unbalanced test split, then tabulate both sets of metrics side by side.
tvprd = score(model_RF, train_val)
testprd = score(model_RF, test_unbalanced)

rfM = metricsdf([tvprd, testprd], ["RF_train", "RF_test"])
rfM

Unnamed: 0,RF_train,RF_test
Accuracy,0.657424,0.648145
Precision,0.296531,0.297438
Recall,0.654366,0.648226
Specificity,0.658097,0.648126
F1_Score,0.408119,0.407771
F05_Score,0.332944,0.333537
F2_Score,0.527142,0.524509


In [0]:
# Random Forest Confusion Matrix
testprd.groupBy("DEP_DEL15", "prediction").count().display()

In [0]:
# Build (x[4], x[3]) pairs from each scored row; per the column order noted in
# the scoring cell (DEP_DEL15, P0, P1, LABEL, PREDICTION) that is
# (prediction, label).
metricsRF = MulticlassMetrics(testprd.rdd.map(lambda x: (x[4], x[3])))
rfM2 = metricsdf2(metricsRF)
# One-column pandas table of the metrics, labelled "RF".
dfMrf = pd.DataFrame({"RF": list(rfM2.values())}, index = list(rfM2.keys()))
dfMrf

In [0]:
# LOOK TO TRY DIFFERENT CUTS FOR TRAINING DATA
# Compare Random Forest models trained on progressively shorter year windows
# (lower_end..2018), all scored on the same `test` split.
year_train_val = 2018
maxDepth = 10
numTrees = 110

result_list = []   # scored test DataFrames, one per window
name_list = []     # matching column labels, e.g. "RF-2015-2018"
for lower_end in [2015, 2016, 2017, 2018]:
  
  print("Lower Year: ",lower_end) 
  # NOTE(review): each window is cached and never unpersisted; after the loop
  # `trainCase` holds the last (2018-only) window, which later cells fit on.
  trainCase = airlines2.filter( (airlines2.YEAR <= year_train_val) & (airlines2.YEAR >= lower_end)  ).cache()
  rf_pipeline_test = rf_pipeline_best(trainCase, maxDepth, numTrees, model_matrix_stages, scaler)

  testprd = score(rf_pipeline_test, test)
  result_list.append(testprd)
  name = "RF-"+str(lower_end)+"-"+str(year_train_val)
  name_list.append(name)
  
# One metrics column per training window.
rfM = metricsdf(result_list, name_list) 
rfM

##### b. Best GBT

In [0]:
# Gradient Boosted Trees with fixed hyper-parameters, weighted by the 'weight' column.
gbt = GBTClassifier(featuresCol="scaledFeatures",      # alternative to include scaled features: "scaledFeatures"
                    labelCol="label",
                    lossType = "logistic",
                    maxDepth = 5,
                    maxIter = 20,
                    maxBins = 350,   #Replaced 52
                    weightCol= 'weight')

# Full pipeline: feature preparation stages -> scaler -> GBT.
pipelineGBT = Pipeline(stages=model_matrix_stages+[scaler]+[gbt])

# Fit on whatever `trainCase` currently holds (set by an earlier cell).
model_GBT = pipelineGBT.fit(trainCase)

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
# Score the GBT model on its training data and on the unbalanced test split,
# then tabulate both sets of metrics.
tvpgbt = score(model_GBT, trainCase)
testpgbt = score(model_GBT, test_unbalanced)

gbtM = metricsdf([tvpgbt, testpgbt], ["GBT_train", "GBT_test"])
gbtM

##### c. LSVC

In [0]:
# Linear SVC with regParam 0.01, weighted by the 'weight' column.
lsvc = LinearSVC(featuresCol="scaledFeatures",      # alternative to include scaled features: "scaledFeatures"
               labelCol="label",
               maxIter=10,
               weightCol= 'weight',
               regParam = 0.01
              )

# Full pipeline: feature preparation stages -> scaler -> Linear SVC.
pipelineLSVC = Pipeline(stages=model_matrix_stages+[scaler]+[lsvc])

# Fit on whatever `trainCase` currently holds (set by an earlier cell).
model_LSVC = pipelineLSVC.fit(trainCase)

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION

# LSVC metrics on the unbalanced test split, computed from
# (prediction, DEP_DEL15) pairs.
Plsvc = model_LSVC.transform(test_unbalanced).select("DEP_DEL15", "prediction")
Mlsvc = MulticlassMetrics(Plsvc.rdd.map(lambda x: (x[1], x[0])))
sumlsvc = metricsdf2(Mlsvc)
# NOTE(review): this rebinds `lsvc` (previously the LinearSVC estimator) to a pandas table.
lsvc = pd.DataFrame({"LSVC": list(sumlsvc.values())}, index = list(sumlsvc.keys()))
lsvc

In [0]:
# Same LSVC metrics, this time on the training data (`trainCase`).
Plsvc = model_LSVC.transform(trainCase).select("DEP_DEL15", "prediction")
Mlsvc = MulticlassMetrics(Plsvc.rdd.map(lambda x: (x[1], x[0])))
sumlsvc = metricsdf2(Mlsvc)
# NOTE(review): overwrites the test-set table stored in `lsvc` by the previous cell.
lsvc = pd.DataFrame({"LSVC": list(sumlsvc.values())}, index = list(sumlsvc.keys()))
lsvc

##### d. Logistic Regression

In [0]:
# Elastic-net logistic regression (regParam 0.01, elasticNetParam 0.5),
# weighted by the 'weight' column; no labelCol is passed explicitly.
lr = LogisticRegression(maxIter=20, featuresCol = "scaledFeatures", weightCol="weight", regParam=0.01, elasticNetParam=0.5)

# Full pipeline: feature preparation stages -> scaler -> logistic regression.
pipelineLR = Pipeline(stages=model_matrix_stages+[scaler]+[lr])

# Fit on whatever `trainCase` currently holds (set by an earlier cell).
model_LR = pipelineLR.fit(trainCase)

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
# Score the logistic regression on its training data and on the unbalanced
# test split, then tabulate both sets of metrics.
tvpLR = score(model_LR, trainCase)
testpLR = score(model_LR, test_unbalanced)

lrM = metricsdf([tvpLR, testpLR], ["LR_train", "LR_test"])
lrM

In [0]:
#dataframe.unpersist()

#### 10. Feature Importance - Random Forest Model

In [0]:
# year_train_val = 2018
# lower_end = 2017
# trainCase = airlines2.filter( (airlines2.YEAR <= year_train_val) & (airlines2.YEAR >= lower_end)  ).cache()
# datamodel = rf_pipeline_test.transform(train_val)


In [0]:
def featureList2(model, datamodel, data, categoricals):
  """Expand the assembler's input columns into one name per feature slot.

  Categorical inputs (assembler columns named '<name>_class' whose base name
  is listed in `categoricals`) are repeated once per distinct value of the
  base column in `data`; every other input contributes a single slot.
  NOTE(review): this assumes the encoded vector has exactly one slot per
  distinct value -- confirm against the encoder settings.

  Args:
    model: fitted pipeline whose third-from-last stage exposes getInputCols().
    datamodel: unused; kept so existing callers keep working.
    data: Spark DataFrame holding the raw categorical columns.
    categoricals: iterable of raw categorical column names.

  Returns:
    (featurelist, sizelist): the per-slot feature names and the per-input slot counts.
  """
  suffix = "_class"
  categorical_names = set(categoricals)
  featurelist = []
  sizelist = []
  for f in model.stages[-3].getInputCols():
    # Only strip the suffix when it is really there, instead of blindly
    # cutting the last six characters off every column name.
    base = f[:-len(suffix)] if f.endswith(suffix) else None
    if base in categorical_names:
      # Count on the cluster rather than collecting every distinct row to the driver.
      n = data.select(base).distinct().count()
      sizelist.append(n)
      featurelist.extend([base] * n)
    else:
      featurelist.append(f)
      sizelist.append(1)

  return featurelist, sizelist

In [0]:
def featureList(model, datamodel, categoricals):
  """Expand the assembler's input columns into one name per feature slot.

  For an input column whose name minus its last six characters is listed in
  `categoricals`, the slot count is read from the length of that column's
  vector in the first row of `datamodel`. Any other input counts as one slot.

  Args:
    model: fitted pipeline whose third-from-last stage exposes getInputCols().
    datamodel: transformed DataFrame containing the encoded vector columns.
    categoricals: iterable of raw categorical column names.

  Returns:
    (names, widths): the per-slot feature names and the per-input slot counts.
  """
  categorical_names = set(categoricals)
  names, widths = [], []
  for col_name in model.stages[-3].getInputCols():
    base = col_name[:len(col_name)-6]
    if base not in categorical_names:
      names = names + [col_name]
      widths.append(1)
      continue
    # Width of the encoded vector, taken from the first row only.
    first_vector = datamodel.select(col_name).take(1)[0][0]
    width = int(len(first_vector.toArray()))
    widths.append(width)
    names = names + [base]*width

  return names, widths

In [0]:
def get_feature_importance(model, traindata, categoricals):
  """Return a pandas table pairing each feature slot with its importance.

  Args:
    model: fitted pipeline whose last stage exposes `featureImportances`.
    traindata: Spark DataFrame the pipeline can transform.
    categoricals: raw categorical column names (used to label encoded slots).

  Returns:
    pandas DataFrame with columns "atribute" (slot name) and "importance".
  """
  datamodel = model.transform(traindata)
  # Use when using a dataset from Grid Search
  featurelist, _ = featureList(model, datamodel, categoricals)

  # Use when not using a dataset from Grid Search
#   featurelist, _ = featureList2(model, datamodel, traindata, categoricals)

  # featureImportances is a Spark ML Vector; convert it to a NumPy array so
  # pandas receives an ordinary per-slot sequence of floats.
  featureImp = model.stages[-1].featureImportances.toArray()
  Impfeatures = pd.DataFrame({"atribute":featurelist, "importance":featureImp})

  return Impfeatures

In [0]:
# datamodel = rf_pipeline_test.transform(trainCase)
# rf_model or rf_pipeline_test; trainCase, train_val
importantFeatures = get_feature_importance(model_RF, train_val, categoricals)

In [0]:
# Sum slot-level importances back to one row per original attribute, sorted
# from most to least important, and add a running cumulative total.
df1 = importantFeatures.groupby('atribute')['importance'].sum().sort_values(ascending=False).reset_index()
df1['CUMSUM_C'] = df1['importance'].cumsum()
df1

##### Test Model with Key Variables

In [0]:
# Target column for the reduced ("key variables") model.
myY2 = "DEP_DEL15"

# Variables left out of the reduced model (kept here for reference only).
take_out = ['WKDAY', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'QUARTER', 'dew_pt_dest', 'atmos_press_dest', 'Conn_Ranking_dest',
           'Delay_Ranking_val', 'Coalesced_PgRank_dest', 'Air_Page_Rank_traffic', 'visibility_meters_dest', 'Conn_Ranking_orig',
           'atmos_press_orig', 'ceiling_ht_dim_dest']

categoricals2 = [ 'MONTH',  'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                'SEASON',  'DEPARTURE_Hour_CRS', 'time_of_day_int']

numerics2 = [ 'wind_speed_mps_orig', 'ceiling_ht_dim_orig', 'visibility_meters_orig', 'temp_cels_orig', 'dew_pt_orig', 'precip_milimeters_orig', 'wind_speed_mps_dest', 'temp_cels_dest',  'precip_milimeters_dest', 'rolling_ninety_day_average', 'OD_delay_pair', 'Coalesced_PgRank_orig']

# Columns carried along but not used as model inputs.
NI2 = ['YEAR', 'Cnn_Ranking_val']
# 'CRS_DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_DELAY_GROUP'

myX2 = categoricals2 + numerics2

# SELECT THE COLUMNS WITH THE VARIABLES
airlines3 = airlines.select(myX2 + [myY2, "DEP_DELAY_NEW", "DEP_DELAY"] + NI2)

# UNCOMMENT LINE FOR DESIRED APPROACH TO BALANCE DATA
# airlines2, rs = oversampling_Adj(airlines2, "DEP_DEL15", 0, 1)
airlines3, rs = undersampling_Adj(airlines3, "DEP_DEL15", 0, 1)
# airlines2, rs = balancesampling_Adj(airlines2, "DEP_DEL15", 0, 1, 1)

# CREATE A COPY OF myY2 with name label to be used in the Grid Search.
# Use this cell's own target (myY2) rather than the `myY` global left over
# from the earlier dataset preparation, so the cell is self-consistent.
airlines3 = airlines3.withColumn("label", airlines3[myY2])

# Class balance check after sampling: row count per label.
airlines3.groupBy("label").agg((count(col(myY2))).alias("COUNT_DISTANCE")).display()

In [0]:
# SPLIT DATA FOR TRAINING AND VALIDATION
# Split by year: rows up to and including year_train_val train/validate,
# later rows are held out for testing. Both splits are cached.
year_train_val = 2018
train_val2 = airlines3.filter(airlines3.YEAR <= year_train_val).cache()
test2 = airlines3.filter(airlines3.YEAR > year_train_val).cache()

# CREATE SPLITS FOR CROSS VALIDATION - FIRST VALIDATION ON 2016, AND LAST VALIDATION ON 2018 - EXPANDING WINDOW STARTING WITH STEP ONE.
d2 = createSplits(train_val2, 2016, 2018, 1, True)

In [0]:
## Current possible ways to handle categoricals in string indexer is 'error', 'keep', and 'skip'
# One StringIndexer and one OneHotEncoder per categorical column.
# `map` objects are lazy and single-use; they are materialised with list() below.
indexers2 = map(lambda c: StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid = 'keep'), categoricals2)
ohes2 = map(lambda c: OneHotEncoder(inputCol=c + "_idx", outputCol=c+"_class", dropLast=True),categoricals2)
# Impute the numeric columns in place (output columns reuse the input names).
imputers2 = Imputer(inputCols = numerics2, outputCols = numerics2)

# Establish features columns
featureCols2 = list(map(lambda c: c+"_class", categoricals2)) + numerics2

# Stage order: indexers -> encoders -> imputer -> assembler ("features").
model_matrix_stages2 = list(indexers2) + list(ohes2) + [imputers2] + \
                     [VectorAssembler(inputCols=featureCols2, outputCol="features")]

# Apply StandardScaler to create scaledFeatures
scaler2 = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)

In [0]:
# RUN THIS FOR RANDOM FOREST
# Random Forest on the reduced feature set: entropy impurity, depth 16, 110 trees.
rf2 = RandomForestClassifier(featuresCol = 'scaledFeatures', labelCol = 'label',
                            featureSubsetStrategy='auto', 
                            impurity='entropy',    #gini
                            maxDepth = 16,
                            numTrees = 110,
                            seed=123)

# Full pipeline: reduced feature preparation stages -> scaler2 -> Random Forest.
pipelineRF2 = Pipeline(stages=model_matrix_stages2+[scaler2]+[rf2])

model_RF2 = pipelineRF2.fit(train_val2)

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
# Score the reduced-feature Random Forest on its training split and on
# `test_unbalanced`.
# NOTE(review): `test_unbalanced` comes from the earlier full-feature
# preparation rather than from airlines3 (`test2`) -- confirm this is intended.
tvprd2 = score(model_RF2, train_val2)
testprd2 = score(model_RF2, test_unbalanced)

# NOTE(review): rebinds `rfM2`, which an earlier cell used for a metrics dict.
rfM2 = metricsdf([tvprd2, testprd2], ["RF_train", "RF_test"])
rfM2

In [0]:
# Random Forest Confusion Matrix
testprd2.groupBy("DEP_DEL15", "prediction").count().display()

In [0]:
# RUN THIS FOR RANDOM FOREST
# Variant without a scaler: the forest reads the assembler's raw 'features' column.
rf3 = RandomForestClassifier(featuresCol = 'features', labelCol = 'label',
                            featureSubsetStrategy='auto', 
                            impurity='gini',    #gini
                            maxDepth = 10,
                            numTrees = 110,
                            seed=123)

# Pipeline: reduced feature preparation stages -> Random Forest (no scaling stage).
pipelineRF3 = Pipeline(stages=model_matrix_stages2+[rf3])

model_RF3 = pipelineRF3.fit(train_val2)

# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
tvprd3 = score(model_RF3, train_val2)
testprd3 = score(model_RF3, test2)

# Side-by-side train vs. test metrics for this variant.
rfM3 = metricsdf([tvprd3, testprd3], ["RF_train", "RF_test"])
rfM3

#### 11. Evaluate Where Model is Being Successful or Failing

In [0]:
#categoricals = ['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'SEASON', 'WKDAY', 'DEPARTURE_Hour_CRS']

# Helping Functions for Evaluating Models
def extract2(row):
  """Flatten one scored row into a plain tuple.

  Returns (SEASON, time_of_day_int, Cnn_Ranking_val, DEPARTURE_Hour_CRS,
  DEP_DEL15, <one entry per element of `probability`>, label, prediction).
  """
  return (row.SEASON,)+ (row.time_of_day_int,)+ (row.Cnn_Ranking_val,)+ (row.DEPARTURE_Hour_CRS,)+ (row.DEP_DEL15,) + tuple(row.probability.toArray().tolist()) +  (row.label,) + (row.prediction,)

def score2(model, data, segment_cols=None):
  """Score `data` with `model`, keeping segment columns next to the predictions.

  Args:
    model: fitted model whose transform() adds "probability" and "prediction".
    data: Spark DataFrame that also contains "DEP_DEL15", "label" and every
      column named in `segment_cols`.
    segment_cols: names of the columns to carry through for per-segment
      evaluation. Defaults to the original four: SEASON, time_of_day_int,
      Cnn_Ranking_val and DEPARTURE_Hour_CRS. Pass a longer list (for example
      adding "WKDAY" or "MONTH") to evaluate by other attributes.

  Returns:
    Spark DataFrame with columns segment_cols + DEP_DEL15, p0, p1, label,
    prediction (the probability vector is split into p0 and p1).
  """
  if segment_cols is None:
    segment_cols = ['SEASON', 'time_of_day_int', 'Cnn_Ranking_val', 'DEPARTURE_Hour_CRS']
  segment_cols = list(segment_cols)

  def _flatten(row):
    # Same layout as extract2, but driven by the requested segment columns.
    return (tuple(row[c] for c in segment_cols) + (row.DEP_DEL15,)
            + tuple(row.probability.toArray().tolist())
            + (row.label,) + (row.prediction,))

  pred = model.transform(data).select(*segment_cols, "DEP_DEL15", "probability", "label", "prediction")
  pred = pred.rdd.map(_flatten).toDF(segment_cols + ["DEP_DEL15", "p0", "p1", "label", "prediction"])
  return pred 


In [0]:
# model_RF

# Re-score the unbalanced test split with model_RF, keeping the segment
# columns that score2 carries through.
# NOTE(review): this overwrites the `testprd2` produced earlier from model_RF2.
testprd2 = score2(model_RF, test_unbalanced)
testprd2.display()

In [0]:
def eval_mod_segment(df, attribute):
  '''
  Compute model metrics separately for every value of one attribute.

  Input:
    df: dataframe including the predictions from a given model; it must contain
        `attribute` plus DEP_DEL15, P0, P1, label and prediction.
    attribute: name of attribute to be used to evaluate the performance of the predictions of the model.
               Its values are converted with int().
  Output:
    metric_df: dataframe including key metrics for all values of the attribute being evaluated.
  '''
  # Distinct attribute values in ascending order (no builtin shadowing,
  # no numpy round trip).
  distinct_rows = df.select(attribute).distinct().collect()
  segment_values = sorted(int(row[0]) for row in distinct_rows)

  df_list = []
  attr_names = []
  for val in segment_values:
    print("Evaluating ",attribute,": ", val)
    # Rows of this segment only, reduced to the columns passed on to metricsdf.
    dfTemp = df.filter(df[attribute] == val).select("DEP_DEL15", "P0", "P1","label", "prediction").cache()
    df_list.append(dfTemp)
    attr_names.append(attribute+"_"+str(val))
  metric_df = metricsdf(df_list, attr_names)
  return metric_df

##### - Evaluate Model by Season

In [0]:
seasons_metric = eval_mod_segment(testprd2,"SEASON")
seasons_metric

##### - Evaluate Model by Work Day (Weekend vs. Weekdays)

In [0]:
wkday_metric = eval_mod_segment(testprd2,"WKDAY")
wkday_metric

In [0]:
month_metric = eval_mod_segment(testprd2,"MONTH")
month_metric

##### - Evaluate Model by Day of Week

In [0]:
dow_metric = eval_mod_segment(testprd2,"DAY_OF_WEEK")
dow_metric

##### - Evaluate Model by Flight Departure Hour

In [0]:
dh_crs_metric = eval_mod_segment(testprd2,"DEPARTURE_Hour_CRS")
dh_crs_metric

##### - Evaluate Model by Quarter of the Year

In [0]:
Q_metric = eval_mod_segment(testprd2,"QUARTER")
Q_metric

##### - Evaluate Model by Time of the Day

In [0]:
tod_metric = eval_mod_segment(testprd2,"time_of_day_int")
tod_metric

##### - Evaluate Model by Airport Rank by Number of Connections:
###### Group 1: Top 5% of Airports, Group 2: Next 20%, Group 3: Next 25%, Group 4: Next 50%

In [0]:
cnn_rank_metric = eval_mod_segment(testprd2,"Cnn_Ranking_val")
cnn_rank_metric

##### - Evaluate Model by Airport Rank by Amount of Delay:
###### Group 1: Top 5% of Airports, Group 2: Next 20%, Group 3: Next 25%, Group 4: Next 50%

In [0]:
delay_rank_metric = eval_mod_segment(testprd2,"Delay_Ranking_val")
delay_rank_metric

#### 12. Generate Models by Metric Segment (Season or Month)

Reference for LSVC - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LinearSVC.html

In [0]:
# SPLIT DATA FOR TRAINING AND VALIDATION
def train_test_segment(data, year_train_val, segment, segmentVal):
  '''
  Split one segment of `data` into train/validation and test sets by year.

  INPUT:
  data = Spark dataframe with a YEAR column and the `segment` column.
  year_train_val = Year for which data would be split between train and test
                   (YEAR <= year_train_val goes to train/validation, later years to test).
  segment = attribute would be used to select the specific data (i.e. "SEASON")
  segmentVal = specific value for the segment that would be used to split data (i.e. 3 for season 3) 
  
  OUTPUT:
  train and test split for developing models (both cached).
  '''
  
  # Both filters use `data`; the test filter previously reached for the global
  # `airlines2`, which only worked when that exact frame was passed in.
  in_segment = data[segment] == segmentVal
  train_val_seg = data.filter((data.YEAR <= year_train_val) & in_segment).cache()
  test_seg = data.filter((data.YEAR > year_train_val) & in_segment).cache()

  return train_val_seg, test_seg

In [0]:
# Split data for Train, Validation and Test - 
# Split data for Train, Validation and Test - restricted to SEASON == 3.
train_val_seg, test_seg = train_test_segment(airlines2, 2018, "SEASON", 3)

# CREATE SPLITS FOR CROSS VALIDATION - FIRST VALIDATION ON 2016, AND LAST VALIDATION ON 2018 - EXPANDING WINDOW STARTING WITH STEP ONE.
d_seg = createSplits(train_val_seg, 2016, 2018, 1, True)

# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
Model_seg = create_model(d_seg, model_matrix_stages, scaler, "LogR")

# Keep the best model found by the grid search for this segment.
seg_model = Model_seg.bestModel

In [0]:
# GET PREDICTIONS -> DEP_DEL15, P0, P1, LABEL, PREDICTION
trainPredSeg = score(seg_model, train_val_seg)
testPredSeg = score(seg_model, test_seg)

In [0]:
# Train vs. test metrics for the season-3 model.
# NOTE(review): the labels start with "RF_" although the model fitted above is "LogR".
metric_seg = metricsdf([trainPredSeg, testPredSeg], ["RF_trazLR_Season3", "RF_testLR_Season3"])
metric_seg

In [0]:
def models_segment(data, model_matrix_stages, scaler, segment, ModelType):

  '''
  Train and evaluate one grid-searched model per value of a segment attribute.

  Input:
    data: data to be evaluated; must contain YEAR and the `segment` column.
    matrix_stages and scaler for pipeline.
    segment: variable to be used to generate models (for example: "SEASON").
             Its values are converted with int().
    ModelType: Name of Model to be generated - RF, GBT, LSVC, LogR.
  Output:
    metrics - performance for all models generated.
    list of models - one model for each value of the attribute (segment) being evaluated
    prediction list - list of dataframes with predictions.
  '''
  # Get values for Segment (ascending, without shadowing the builtin `list`)
  distinct_rows = data.select(segment).distinct().collect()
  segment_values = sorted(int(row[0]) for row in distinct_rows)
  
  model_list = []
  name_list = []
  pred_list = []
  
  for val in segment_values:
    # Split data for Train, Validation and Test.
    # Uses the `data` argument; this previously split the global `airlines2`
    # regardless of what the caller passed in.
    train_val_seg, test_seg = train_test_segment(data, 2018, segment, val)
    
    # CREATE SPLITS FOR CROSS VALIDATION - FIRST VALIDATION ON 2016, AND LAST VALIDATION ON 2018 - EXPANDING WINDOW STARTING WITH STEP ONE.
    d_seg = createSplits(train_val_seg, 2016, 2018, 1, True)
  
    Model_seg = create_model(d_seg, model_matrix_stages, scaler, ModelType)
    seg_model = Model_seg.bestModel
    
    # Evaluate the segment's best model on that segment's held-out rows.
    testPredSeg = score(seg_model, test_seg)
    pred_list.append(testPredSeg)
    name_list.append(segment+"-"+str(val))
    model_list.append(Model_seg)
    
  metric_seg = metricsdf(pred_list, name_list)
  return metric_seg, model_list, pred_list

In [0]:
# Type of Model: "RF" = Random Forest, "LogR" = Logistic Regression, "GBT" = Gradient Boosted Trees, "LSVC" = Linear Support Vector Classifier
metric_seg, model_list, pred_list = models_segment(airlines2, model_matrix_stages, scaler, "time_of_day_int", "RF")

In [0]:
metric_seg

#### 13. Random Forest - PCA

In [0]:
# Apply MinMaxScaler to create scaledFeatures
# Apply MinMaxScaler to create scaledFeatures
scaler_pca = MinMaxScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        min=0.0, max=1.0)

# Reduce the scaled features to 150 principal components.
pca = PCA(k=150, inputCol="scaledFeatures", outputCol = "pca_features")

# Define a Random Forest model - WHEN using PCA
rf_PCA = RandomForestClassifier(featuresCol = 'pca_features',  #Modify 'features' for scaled features
                            labelCol = 'label',
                            featureSubsetStrategy='auto', 
                            impurity='gini', 
                            maxBins=350,
                            weightCol= 'weight',
                            seed=None)

# Set the Grid Search set of parameters
gridpca = ParamGridBuilder()\
            .addGrid(rf_PCA.maxDepth, [5, 10])\
            .addGrid(rf_PCA.numTrees, [10, 15])\
            .build()

evaluator = BinaryClassificationEvaluator()


# Chain feature preparation, scaler, PCA and Random Forest in a Pipeline
pipeline_PCA = Pipeline(stages=model_matrix_stages+[scaler_pca]+[pca]+[rf_PCA])

# Execute CrossValidator for model tuning.
# The estimator must be the PCA pipeline built above; the param grid refers to
# rf_PCA, which is a stage of pipeline_PCA only.
crossvalpca = CustomCrossValidator(estimator=pipeline_PCA,
                                estimatorParamMaps=gridpca,
                                evaluator=evaluator,
                                splitWord = ('train', 'test'),
                                cvCol = 'cv',
                                parallelism=4)

# Train the tuned model and establish our best model
cv_rf_pca_Model = crossvalpca.fit(d)
rf_pca_model = cv_rf_pca_Model.bestModel

In [0]:
tvprd_pca = score(rf_pca_model, train_val)
testprd_pca = score(rf_pca_model, test)

In [0]:
rfM_pca = metricsdf([tvprd_pca, testprd_pca], ["RFpca_tr", "RFpca_test"])
rfM_pca

In [0]:
# Random Forest Confusion Matrix
testprd_pca.groupBy("label", "prediction").count().display()

In [0]:
# Build (x[4], x[3]) pairs from each scored row; per the column order noted in
# the earlier scoring cells (DEP_DEL15, P0, P1, LABEL, PREDICTION) that is
# (prediction, label).
metricsRFpca = MulticlassMetrics(testprd_pca.rdd.map(lambda x: (x[4], x[3])))
rfM2pca = metricsdf2(metricsRFpca)
# One-column pandas table of the metrics, labelled "Rand_Forest".
dfMrfpca = pd.DataFrame({"Rand_Forest": list(rfM2pca.values())}, index = list(rfM2pca.keys()))
dfMrfpca

#### 14. PCA Analysis for Random Forest Model

Reference:  
>- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.PCA.html
>- https://datascience-enthusiast.com/Python/PCA_Spark_Python_R.html

In [0]:
# FOR PCA -> For OHE make sure to use droplast=True
# MinMaxScaler

## Current possible ways to handle categoricals in string indexer is 'error', 'keep', and 'skip'
# One StringIndexer and one OneHotEncoder per categorical column.
# `map` objects are lazy and single-use; they are materialised with list() below.
indexers = map(lambda c: StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid = 'keep'), categoricals)
ohes = map(lambda c: OneHotEncoder(inputCol=c + "_idx", outputCol=c+"_class", dropLast=True),categoricals)
# Impute the numeric columns in place (output columns reuse the input names).
imputers = Imputer(inputCols = numerics, outputCols = numerics)

# Establish features columns
featureCols = list(map(lambda c: c+"_class", categoricals)) + numerics

# Build the stage for the ML pipeline
# NOTE: this rebinds the notebook-wide `model_matrix_stages`.
model_matrix_stages = list(indexers) + list(ohes) + [imputers] + \
                     [VectorAssembler(inputCols=featureCols, outputCol="features")]

# Apply MinMaxScaler (range 0..1) to create scaledFeatures
scaler_pca = MinMaxScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        min=0.0, max=1.0)

In [0]:
# Number of principal components to keep; also reused by the plotting cells.
k = 150
pca = PCA(k=k, inputCol="scaledFeatures", outputCol = "pca_features")
# pca.setOutputCol("pca_output")

# Build our ML pipeline: feature preparation -> MinMax scaling -> PCA (no classifier).
pipelinePCA = Pipeline(stages=model_matrix_stages+[scaler_pca]+[pca])

# Train model.  This also runs the indexer.
pca_model = pipelinePCA.fit(train_val)

In [0]:
# pca_out = pca_model.transform(trainPCA)
# pca_out.collect()[0].pca_output
# pca_out.collect()[0].pca_features

# Extract the PCA model
pcaM = pca_model.stages[-1]

In [0]:
# Explained Variance
# Explained Variance
# Per-component explained variance from the fitted PCA stage, as a NumPy array.
expVar1 = pcaM.explainedVariance
expVar1_array = np.array(expVar1)
# Round the fractions to 3 decimals, then express them as percentages.
ev_round = expVar1_array.round(3)*100

In [0]:
ev_round

In [0]:
#k = 100
def add_value_label(x_list,y_list):
    """Write each bar's value as a centred text label at the top of the bar.

    Args:
        x_list: x positions of the bars (the same values passed to `bar`).
        y_list: bar heights; each height is also used as the label text.
    """
    # Place the label at the bar's own x position. The previous version used
    # the 0-based loop index, which put every label one bar to the left when
    # the bars start at x = 1.
    for x, y in zip(x_list, y_list):
      plt.text(x, y, y, ha="center")

# Bar chart of the explained variance (%) per principal component.
pc_list = range(1,k+1)   # component numbers 1..k used as x positions
fig = plt.figure(figsize=(10,7))
ax = fig.add_axes([0,0,1,1])
# Value labels are drawn with plt.text before the bars are added.
add_value_label(pc_list,ev_round.round(2))
ax.set_xlabel('Principal Components')
ax.set_ylabel('% Explained Variance')
ax.set_title('Explained Variance by Principal Component')
ax.bar(pc_list,ev_round.round(2))
plt.show()

In [0]:
# Bar chart of the cumulative explained variance (%) over the first k components.
cum_expVar = np.cumsum(expVar1_array)*100 
pc_list = range(1,k+1)   # component numbers 1..k used as x positions
fig = plt.figure(figsize=(10,7))
ax = fig.add_axes([0,0,1,1])
ax.set_xlabel('Principal Components')
ax.set_ylabel('% Explained Variance')
#add_value_label(pc_list,cum_expVar.round(2))
ax.set_title('Cummulative Explained Variance')
ax.bar(pc_list,cum_expVar.round(2))
plt.show()

In [0]:
def featureListPCA(data, categoricals, numerics):
  """Build the list of feature names, one entry per feature slot.

  Each categorical name is repeated once per distinct value of that column in
  `data`; each numeric name appears once. Categoricals come first, then
  numerics, both in the order given.

  Args:
    data: Spark DataFrame holding the raw categorical columns.
    categoricals: categorical column names.
    numerics: numeric column names.

  Returns:
    list of feature names.
  """
  slot_names = []
  for cat_name in categoricals:
    distinct_rows = data.select(cat_name).distinct().collect()
    slot_names.extend([cat_name] * len(distinct_rows))

  slot_names.extend(num_name for num_name in numerics)
  return slot_names

In [0]:
flist = featureListPCA(train_val, categoricals, numerics)

In [0]:
# Table of PCA loadings: one column per principal component (1..k), one row
# per entry of `flist`.
# NOTE(review): this requires len(flist) to equal the number of rows of
# pcaM.pc -- confirm that the distinct-value counts match the encoder output sizes.
pc_list = range(1,k+1)
pcs = np.round(pcaM.pc.toArray(),4)
df_pc = pd.DataFrame(pcs, columns = pc_list, index=flist).reset_index()
df_pc

#### 15. Ensemble Learning Model

The Ensemble Learning Model combines the prediction results from several different models trained on the training data. In our case, we train the Base models Random Forest, Gradient Boosted Trees, and Support Vector machines on the `train_val` dataset which contains the 2015 through 2018 flights data. Next, we take the predictions from the model and then train a meta classifier (logistic regression) model to come up with the final classification.

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
def preprocess_data(train_df, meta_features=None):
  """
  Return `train_df` with the one-hot encoded meta-features assembled into a
  single 'meta_features' vector column.

  Args:
    train_df - (spark DataFrame) data containing the meta-feature columns
    meta_features - (list) list of metafeatures columns (base-model prediction columns)
  Raises:
    ValueError - if meta_features is None or empty.
  Note:
    The encoder is fitted on `train_df` itself on every call.
  """
  # Fail early with a clear message instead of an opaque len(None) TypeError.
  if not meta_features:
    raise ValueError("meta_features must be a non-empty list of column names")
  meta_features = list(meta_features)

  # Intermediate one-hot columns: vec0, vec1, ... (one per meta-feature).
  encoded_cols = ['vec{}'.format(i) for i in range(len(meta_features))]
  ohes = OneHotEncoder(inputCols=meta_features, outputCols=encoded_cols)
  vec = VectorAssembler(inputCols=encoded_cols, outputCol='meta_features')
  pipeline = Pipeline(stages=[ohes, vec])
  data_pipeline = pipeline.fit(train_df)
  return data_pipeline.transform(train_df)

In [0]:
def ensemble_learning_model(gbt_model, rf_model,svc_model, train_val, meta_features):
  """
    Train the stacking meta-classifier on the base models' predictions.

    Args:
        gbt_model - Gradient Boosted Tree spark fitted model
        rf_model - Random Forest spark fitted model
        svc_model - Support Vector Classifier spark fitted model
        train_val - (spark DataFrame) training data
        meta_features - (list) meta feature prediction column names
    Returns:
        (meta_classifier, meta_preds): the fitted Logistic Regression model and
        the training data scored by it (prediction column 'meta_pred').
    Note:
        Reads the notebook-level `categoricals` and `numerics` lists.
  """
  input_cols = categoricals + numerics

  # Score with each base model in turn. After every step only the raw inputs,
  # the label and the predictions gathered so far are kept, and the fresh
  # "prediction" column is renamed so the next model can add its own.
  rf_scored = rf_model.transform(train_val)
  data1 = rf_scored.select(input_cols + ["prediction", "label"])
  data1 = data1.withColumnRenamed('prediction', 'rf_pred')

  gbt_scored = gbt_model.transform(data1)
  data2 = gbt_scored.select(input_cols + ["prediction", "label", "rf_pred"])
  data2 = data2.withColumnRenamed('prediction', 'gbt_pred')

  svc_scored = svc_model.transform(data2)
  data3 = svc_scored.select(input_cols + ["label", "prediction", "rf_pred", 'gbt_pred'])
  data3 = data3.withColumnRenamed('prediction', 'svc_pred')

  # pre-process meta-features
  preds = preprocess_data(data3, meta_features).cache()

  # create meta-classifier
  meta_lr = LogisticRegression(featuresCol='meta_features', labelCol='label', predictionCol='meta_pred', maxIter=20, regParam=1., elasticNetParam=0)
  meta_classifier = meta_lr.fit(preds)

  return meta_classifier, meta_classifier.transform(preds)
    
    

In [0]:
def predict_ensemble(gbt_model, rf_model,svc_model, meta_classifier,meta_features, test_df):
  """
      Return predictions of Stacking Ensemble Learning Model in Spark DataFrame.

      Args:
          gbt_model - Gradient Boosted Tree spark fitted model
          rf_model - Random Forest spark fitted model
          svc_model - Support Vector Classifier spark fitted model
          meta_classifier - Logistic Regression fitted model
          meta_features - (list) meta feature prediction column names
          test_df - (spark DataFrame) Data Holdout
      Note:
          Reads the notebook-level `categoricals` and `numerics` lists.
  """
  raw_inputs = categoricals + numerics

  # Random Forest predictions -> 'rf_pred'
  with_rf = (rf_model.transform(test_df)
             .select(raw_inputs + ["prediction", "label"])
             .withColumnRenamed('prediction', 'rf_pred'))
  # Gradient Boosted Trees predictions -> 'gbt_pred'
  with_gbt = (gbt_model.transform(with_rf)
              .select(raw_inputs + ["prediction", "label", "rf_pred"])
              .withColumnRenamed('prediction', 'gbt_pred'))
  # Support Vector Classifier predictions -> 'svc_pred'
  with_svc = (svc_model.transform(with_gbt)
              .select(raw_inputs + ["label", "prediction", "rf_pred", 'gbt_pred'])
              .withColumnRenamed('prediction', 'svc_pred'))

  # pre-process meta-features, then apply the already-fitted meta-classifier
  encoded = preprocess_data(with_svc, meta_features).cache()
  return meta_classifier.transform(encoded)

In [0]:
def get_metrics(df, preds, labels):
  """
        Return evaluation metrics as a one-column pandas DataFrame named "Ensemble".

        Args:
            df - model transformed dataframe
            preds - (str) prediction column
            labels - label column
  """
  # Rows are selected as (label, prediction) and swapped to
  # (prediction, label) pairs for MulticlassMetrics.
  label_and_pred = df.select(labels, preds)
  pair_rdd = label_and_pred.rdd.map(lambda x: (x[1], x[0]))
  summary = metricsdf2(MulticlassMetrics(pair_rdd))
  metric_names = list(summary.keys())
  metric_values = list(summary.values())
  return pd.DataFrame({"Ensemble": metric_values}, index = metric_names)
  

In [0]:
# Loading in Based Models if they already exist
# Loading a Random Forest Saved Model
# Loading a Random Forest Saved Model
# (`load` is invoked through the in-memory rf_model / gbt_model / lsvc_model
# objects, so those must already exist in the session.)
model_path = f"{blob_url}/models/rf_model"
rf_modelSAVED = rf_model.load(model_path)

# Loading a Gradient Boosted Tree Saved Model
model_path = f"{blob_url}/models/gbt_model"
gbt_modelSAVED = gbt_model.load(model_path)

# Loading a Linear SVC Saved Model
model_path = f"{blob_url}/models/lsvc_model"
lsvc_modelSAVED = lsvc_model.load(model_path)

In [0]:
# Train meta classifier
# model_RF, model_GBT, model_LSVC, model_LR, model_RF2
meta_classifier, meta_preds = ensemble_learning_model(model_GBT, model_RF2,model_LSVC, train_val, ['rf_pred','gbt_pred','svc_pred'])


In [0]:
# Get evaluation metrics
df_meta = get_metrics(meta_preds,"meta_pred","label")
df_meta.display()

In [0]:
# Score the hold-out data with the same base models the meta-classifier was
# trained on (model_GBT, model_RF2, model_LSVC); using a different Random
# Forest here would feed the meta-classifier predictions it was not fitted to.
meta_preds2 = predict_ensemble(model_GBT, model_RF2,model_LSVC, meta_classifier,['rf_pred','gbt_pred','svc_pred'], test_unbalanced)

In [0]:
df_meta2 = get_metrics(meta_preds2,"meta_pred","label")
df_meta2.display()

In [0]:
# Random Forest Confusion Matrix
#testprd2.groupBy("DEP_DEL15", "prediction").count().display()
# Ensemble confusion matrix on the hold-out predictions. This must use the
# Spark DataFrame of predictions (meta_preds2); df_meta2 is the pandas metrics
# table and has neither these columns nor a groupBy method.
meta_preds2.groupBy("label", "meta_pred").count().display()

In [0]:
test.display()

In [0]:
# Metrics of the saved Random Forest on the test split, from
# (prediction, DEP_DEL15) pairs.
predrf = rf_modelSAVED.transform(test).select("DEP_DEL15", "prediction")
metricsrf = MulticlassMetrics(predrf.rdd.map(lambda x: (x[1], x[0])))
mrf = metricsdf2(metricsrf)
# Explicit column label: `modelName` is a list after the batch grid-search
# cell, and a list cannot be used as a dict key.
dfrf = pd.DataFrame({"RF": list(mrf.values())}, index = list(mrf.keys()))
dfrf

In [0]:
# Loading a Gradient Boosted Tree Saved Model
# Reload the saved Gradient Boosted Trees model (same path as in the cell that
# first loaded the base models).
model_path = f"{blob_url}/models/gbt_model"
gbt_modelSAVED = gbt_model.load(model_path)

In [0]:
# Metrics of the saved Gradient Boosted Trees model on the test split, from
# (prediction, DEP_DEL15) pairs. The rf-suffixed variable names are reused
# from the Random Forest cell.
predrf = gbt_modelSAVED.transform(test).select("DEP_DEL15", "prediction")
metricsrf = MulticlassMetrics(predrf.rdd.map(lambda x: (x[1], x[0])))
mrf = metricsdf2(metricsrf)
# Explicit column label: `modelName` is a list after the batch grid-search
# cell, and a list cannot be used as a dict key.
dfrf = pd.DataFrame({"GBT": list(mrf.values())}, index = list(mrf.keys()))
dfrf

In [0]:
# Loading a Log Regression Saved Model
# Reload the saved Linear SVC model (the path points at lsvc_model, not at a
# logistic regression).
model_path = f"{blob_url}/models/lsvc_model"
lsvc_modelSAVED = lsvc_model.load(model_path)

In [0]:
# Metrics of the saved Linear SVC model on the test split, from
# (prediction, DEP_DEL15) pairs. The rf-suffixed variable names are reused
# from the Random Forest cell.
predrf = lsvc_modelSAVED.transform(test).select("DEP_DEL15", "prediction")
metricsrf = MulticlassMetrics(predrf.rdd.map(lambda x: (x[1], x[0])))
mrf = metricsdf2(metricsrf)
# Explicit column label: `modelName` is a list after the batch grid-search
# cell, and a list cannot be used as a dict key.
dfrf = pd.DataFrame({"LSVC": list(mrf.values())}, index = list(mrf.keys()))
dfrf

In [0]:
#select(myX + [myY, "DEP_DELAY_NEW", "DEP_DELAY"])


data1 = rf_modelSAVED.transform(train_val).select(categoricals+numerics+["prediction","label"]).withColumnRenamed('prediction', 'rf_pred')
data1.display()

In [0]:
# Step-by-step version of the stacking input preparation, using the saved
# base models. After each model only the raw inputs, the label and the
# predictions collected so far are kept, and the fresh "prediction" column is
# renamed so the next model can add its own.
data1 = rf_modelSAVED.transform(train_val).select(categoricals+numerics+["prediction","label"]).withColumnRenamed('prediction', 'rf_pred')
data2 = gbt_modelSAVED.transform(data1).select(categoricals+numerics+["prediction","label","rf_pred"]).withColumnRenamed('prediction', 'gbt_pred')
data3 = lsvc_modelSAVED.transform(data2).select(categoricals+numerics+["label", "prediction","rf_pred", 'gbt_pred']).withColumnRenamed('prediction', 'svc_pred')
data3.display()


# One-hot encode the three prediction columns and assemble them into 'meta_features'.
preds = preprocess_data(data3, ['rf_pred','gbt_pred','svc_pred'])
preds.display()

In [0]:
preds = preprocess_data(data3, ['rf_pred','gbt_pred','svc_pred'])
preds.display()

In [0]:
# Meta-classifier: logistic regression on the assembled base-model predictions
# (regParam 1.0, elasticNetParam 0), fitted and then applied to the same `preds`.
# NOTE: this rebinds `lr`, `meta_classifier` and `meta_preds` from earlier cells.
lr = LogisticRegression(featuresCol='meta_features', labelCol='label', predictionCol='meta_pred', maxIter=20, regParam=1., elasticNetParam=0)
meta_classifier = lr.fit(preds)
meta_preds = meta_classifier.transform(preds)

In [0]:
# Metrics of the meta-classifier on the data it was fitted on, from
# (meta_pred, label) pairs. The rf-suffixed variable names are reused from the
# Random Forest cell.
predrf = meta_preds.select("label", "meta_pred")
metricsrf = MulticlassMetrics(predrf.rdd.map(lambda x: (x[1], x[0])))
mrf = metricsdf2(metricsrf)
# Explicit column label: `modelName` is a list after the batch grid-search
# cell, and a list cannot be used as a dict key.
dfrf = pd.DataFrame({"Ensemble": list(mrf.values())}, index = list(mrf.keys()))
dfrf

In [0]:
def ensemble_learning_model(cv_train_df, train_val_df, meta_features):
    """
    Load the saved base models, stack their predictions and train the meta classifier.

    The Random Forest, Gradient Boosted Tree and linear SVC models are loaded
    from blob storage and applied one after another to `train_val_df`. After
    each model, only the raw feature columns, the label and the predictions
    collected so far are kept, and the new `prediction` column is renamed
    (`rf_pred`, `gbt_pred`, `svc_pred`). This mirrors the stacking cells earlier
    in this notebook and avoids an output-column collision when the next model
    runs. A logistic regression is then fitted on the encoded predictions.

    Args:
        cv_train_df - (dict) dictionary of Spark Dataframes {'df1': sparkdf1, 'df2':sparkdf2 ...}
                      (not used by this function; kept for caller compatibility)
        train_val_df - {SparkDF} Training data (Features and Labels)
        meta_features - (list) of meta features column names, chosen from
                        'rf_pred', 'gbt_pred' and 'svc_pred'

    Returns:
        Tuple (gbt_model, rf_model, svc_model, meta_classifier, meta_preds), where
        meta_preds is the training data with the meta classifier's 'meta_pred' column.
    """

    # Load the base models through the notebook-level model objects. Distinct
    # local names are required: assigning to `rf_model` inside this function
    # would make it local and turn `rf_model.load(...)` into an UnboundLocalError.
    rf_loaded = rf_model.load(f"{blob_url}/models/rf_model")
    gbt_loaded = gbt_model.load(f"{blob_url}/models/gbt_model")
    # The notebook loads this model via `lsvc_model`, so the same object is used here.
    svc_loaded = lsvc_model.load(f"{blob_url}/models/lsvc_model")

    # NOTE(review): as in the notebook's stacking cells, the saved models are
    # applied directly to the raw columns (they are assumed to carry their own
    # preprocessing stages) - confirm against how the models were saved.
    base_cols = categoricals + numerics + ["label"]
    stacked = train_val_df
    collected = []
    for pred_col, base_model in (("rf_pred", rf_loaded),
                                 ("gbt_pred", gbt_loaded),
                                 ("svc_pred", svc_loaded)):
        # Selecting only the needed columns drops the intermediate outputs
        # (features, rawPrediction, probability, ...) before the next model.
        stacked = base_model.transform(stacked) \
                            .select(base_cols + collected + ["prediction"]) \
                            .withColumnRenamed("prediction", pred_col)
        collected.append(pred_col)

    # Preprocess meta features using One Hot Encoding and Vectorizer
    preds = preprocess_data(stacked, meta_features=meta_features).cache()

    # Train meta classifier (Logistic Regression)
    lr = LogisticRegression(featuresCol='meta_features', labelCol='label', predictionCol='meta_pred', maxIter=20, regParam=1., elasticNetParam=0)
    meta_classifier = lr.fit(preds)
    meta_preds = meta_classifier.transform(preds)
    return gbt_loaded, rf_loaded, svc_loaded, meta_classifier, meta_preds
    

#### Reference:
>- https://pages.databricks.com/rs/094-YMS-629/images/02-Delta%20Lake%20Workshop%20-%20Including%20ML.html
>- https://databricks.com/blog/2018/08/09/loan-risk-analysis-with-xgboost-and-databricks-runtime-for-machine-learning.html
>- https://spark.apache.org/docs/latest/ml-features

Other reference on Delta lakes:
>- https://towardsdatascience.com/delta-lake-with-spark-what-and-why-6d08bef7b963  
>- https://databricks.com/notebooks/gallery/GettingStartedWithSparkMLlib.html  
>- https://pages.databricks.com/rs/094-YMS-629/images/02-Delta%20Lake%20Workshop%20-%20Including%20ML.html

Reference Random Forest:  
>- https://towardsdatascience.com/a-guide-to-exploit-random-forest-classifier-in-pyspark-46d6999cb5db
>- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.tree.RandomForest.html

Using weights for unbalanced data:
>- https://www.datatrigger.org/post/spark_3_weighted_random_forest/

##### - Logistic Regression Notes  
  
>- We will be using the Apache Spark pre-installed GLM and GBTClassifier models
>- GLM is in reference to generalized linear models; the Apache Spark logistic regression model is a special case of a generalized linear model
>- Use BinaryClassificationEvaluator, CrossValidator, and ParamGridBuilder to tune our models.

###### For this initial pipeline we focused on Logistic Regression, using the following formula:

$$
P(\text{Delay}) = \frac{1}{1 + e^{-(\beta_{0} + \sum_{i=1}^{n}(\beta_{i} * \text{Flight}_{i} )+ \sum_{j=1}^{m}(\beta_{j} * \text{Weather}_{j} ) + \sum_{k=1}^{l}(\beta_{k} * \text{Others}_{k} ))}}
$$

In a future model we will use Linear Regression:

$$
\text{Delay in minutes} = \beta_{0} + \sum_{i=1}^{n}(\beta_{i} * \text{Flight}_{i} )+ \sum_{j=1}^{m}(\beta_{j} * \text{Weather}_{j} ) + \sum_{k=1}^{l}(\beta_{k} * \text{Others}_{k} )
$$

Finally, we will explore other Machine Learning Algorithms (to be defined)

##### Model Metrics & Performance
###### Performance Metrics Using Map/Reduce  

**Positive** - defined as "being delayed"

>- **False Positive:** Predicted a delay, but flight was not delayed.  The potential implications are:
>>- Passenger gets stressed, and may start looking for alternatives.  
>>- Passenger may start complaining and speak negatively about the airline (even before the delay)
>>- Passenger may try to cancel (if he/she has flexibility)
>>- May lead to some frustration, but if the flight is not delayed, he/she may be pleasantly surprised.

>- **False Negative:** Predicted No Delay, but flight was delayed.  Some consequences are: 
>>- Passenger is frustrated. He/she thought things were OK, and now the flight is delayed.  No time to change anything.
>>- Passenger feels helpless - he/she can't do anything to manage situation.
>>- High reputation cost for airline

If there is more weight for `False Negative`, there is a need to minimize `False Negatives`, thus we want to maximize **Recall**.

**How do we communicate the delay?** For example, a delay of more than 30 minutes is likely (or very likely)
>- Include a sense of time of delay - how much time delay? 
>- Include a sense of probability - likelihood/confidence on the predicted event.

Through the right communication approach we manage potential False Positives, but try to minimize False Negatives. Thus, we need to give more weight to `Recall`, thus, performance metric will focus on `F2-score`. 

##### F-Beta Coefficient: Alternatives:
>- Interested in an F-measure with more attention put on precision, such as when false positives are more important to minimize, but false negatives are still important.
>- Interested in an F-measure with more attention put on recall, such as when false negatives are more important to minimize, but false positives are still important.

$$
F_{\beta} = \frac{(1 + \beta^2) \cdot \text{Precision} \cdot \text{Recall}}{(\beta^2 \cdot \text{Precision}) + \text{Recall}}
$$

>>- F0.5-Measure (beta=0.5): More weight on precision, less weight on recall.
>>- F1-Measure (beta=1.0): Balance the weight on precision and recall.
>>- F2-Measure (beta=2.0): Less weight on precision, more weight on recall


Reference:
>- https://machinelearningmastery.com/fbeta-measure-for-machine-learning/#:~:text=The%20F2%2Dmeasure%20is%20calculated,(4%20*%20Precision%20%2B%20Recall)

##### Accuracy Demonstrated (literature)

80.44% SVR

Reference:  
>- https://journalofbigdata.springeropen.com/articles/10.1186/s40537-020-00380-z#:~:text=The%20results%20have%20shown%20SVR,impact%20on%20the%20mode%20performance.  
>- https://medium.com/analytics-vidhya/using-machine-learning-to-predict-flight-delays-e8a50b0bb64c

Recall - ~80 to 84%

###### - OLD CODE - GRID SEARCH

In [0]:
# Time-ordered validation splits, keyed by the last training year:
#   train = every row of airlines2 with YEAR <= key
#   valid = rows of airlines2 with YEAR == key + 1
# Keys 2015..2018 therefore validate on 2016..2019 respectively.
# Both sides of every split are cached.
# (A month-based variant that filtered on airlines2.MONTH instead of YEAR
#  was tried and left disabled.)
datasplits = {}
for i in range(2015, 2019):
    datasplits[i] = {"train": airlines2.filter(airlines2.YEAR <= i).cache(),
                     "valid": airlines2.filter(airlines2.YEAR == i + 1).cache()}

In [0]:
def Mmetric(model_matrix_stages, datasplits, scaler, numTrees, maxDepth, weigth, metric, verbose, pca_flag, k):
  """
  Fit a random forest on every split and return the average validation metric.

  Args:
      model_matrix_stages - (list) feature-preparation pipeline stages that produce a "features" column
      datasplits - (dict) {key: {"train": SparkDF, "valid": SparkDF}}
      scaler - pipeline stage producing "scaledFeatures"; used only when pca_flag is False
      numTrees - (int) number of trees in the forest
      maxDepth - (int) maximum tree depth
      weigth - (str) name of the instance-weight column
      metric - (str) key of the metric to read from the model_metric result
      verbose - (bool) print a one-line summary when True
      pca_flag - (bool) when True, features are min-max scaled to [0, 1], reduced with PCA,
                 and the forest is trained on the PCA output
      k - (int) number of principal components (used only when pca_flag is True)

  Returns:
      (avgmetric, metricL): mean of the metric over all splits and the per-split values.

  `score` and `model_metric` are defined elsewhere in this notebook; model_metric
  returns a pair whose second element is indexed by metric name.
  """
  if pca_flag:
    # Scale to [0, 1] first, then project onto k principal components.
    scaler_pca = MinMaxScaler(inputCol="features",
                              outputCol="scaledFeatures",
                              min=0.0, max=1.0)
    pca = PCA(k=k, inputCol="scaledFeatures", outputCol="pca_features")
    feature_stages = [scaler_pca, pca]
    # The forest must read the PCA output; previously it read "scaledFeatures",
    # so the PCA stage (and k) had no effect on the trained model.
    forest_features = "pca_features"
  else:
    feature_stages = [scaler]
    forest_features = "scaledFeatures"

  rf = RandomForestClassifier(featuresCol=forest_features,
                              labelCol='label',
                              featureSubsetStrategy='auto',
                              impurity='gini',
                              numTrees=numTrees,
                              maxDepth=maxDepth,
                              maxBins=305,
                              weightCol=weigth,
                              seed=None)
  pipeline = Pipeline(stages=model_matrix_stages + feature_stages + [rf])

  metricL = []
  for key in datasplits:
    # Fitting the pipeline also fits the feature-preparation stages on this split.
    fitted = pipeline.fit(datasplits[key]["train"])
    scored_valid = score(fitted, datasplits[key]["valid"])
    _, split_metrics = model_metric(scored_valid)
    metricL.append(split_metrics[metric])

  avgmetric = sum(metricL) / len(metricL)
  if verbose:
    print("Ntrees: ",numTrees, " Depth: ",maxDepth, "W: ", weigth, "Avg ",metric, ": ", avgmetric, " list: ", metricL)
  return avgmetric, metricL

In [0]:
# GRID SEARCH FOR RANDOM FOREST - every combination of trees x depth x weight column
def GS_rf(model_matrix_stages, datasplits, scaler, numTrees, maxDepth, weights, metric, verbose, pca, k):
  """
  Evaluate every (numTrees, maxDepth, weight column) combination with Mmetric.

  Returns:
      (metrics_rf, maxmetric, bestparam) where
      metrics_rf - dict keyed 1..N in evaluation order; each value holds the average
                   metric, the parameter triple and the per-split metric list
      maxmetric  - best average metric seen (stays 0 if no combination exceeds 0)
      bestparam  - [numTrees, maxDepth, weight] of the best combination, or [] if none
  """
  grid = [(nt, md, w) for nt in numTrees for md in maxDepth for w in weights]

  metrics_rf = {}
  maxmetric, bestparam = 0, []

  for run_id, (nt, md, w) in enumerate(grid, start=1):
    avg_score, per_split = Mmetric(model_matrix_stages, datasplits, scaler, nt, md, w, metric, verbose, pca, k)
    metrics_rf[run_id] = {"metric": avg_score, "parameters": [nt, md, w], "metric list": per_split}
    # Strict comparison: on ties the earliest combination is kept.
    if avg_score > maxmetric:
      maxmetric, bestparam = avg_score, [nt, md, w]

  return metrics_rf, maxmetric, bestparam

In [0]:
# Search space for the random-forest grid search.
numTrees = [20, 25, 30, 35, 40]
maxDepth = [4, 5, 6, 8, 10, 15]
weights = ['weight']

# Smaller grids used in earlier runs, kept for reference:
#   numTrees = [5, 10, 15, 20]; maxDepth = [3, 4, 5, 6]; weights = ['weight', 'weight1', 'weight2']
#   numTrees = [20, 25];        maxDepth = [4, 5];       weights = ['weight']
#   numTrees = [15, 20];        maxDepth = [4, 5]

# Optimise the F2 score, print per-combination progress, PCA branch with 3 components.
GS_output2, maxmetric2, bestparam2 = GS_rf(
    model_matrix_stages=model_matrix_stages,
    datasplits=datasplits,
    scaler=scaler,
    numTrees=numTrees,
    maxDepth=maxDepth,
    weights=weights,
    metric="f2_score",
    verbose=True,
    pca=True,
    k=3,
)

In [0]:
# Best average metric and the parameter combination that achieved it.
print(maxmetric2, bestparam2)

# Tabulate all grid-search runs: one column per run id, with rows
# "metric", "parameters" and "metric list".
pdf = pd.DataFrame(GS_output2)
pdf

##### - ADDITIONAL CODE -NOT USED

In [0]:
# OLD CODE
# Score the GLM and GBT models on the train and validation sets
# (`score` is defined elsewhere in this notebook).
glm_train = score(glm_model, train)
glm_valid = score(glm_model, valid)
gbt_train = score(gbt_model, train)
gbt_valid = score(gbt_model, valid)

# Register each scored DataFrame as a temp view under the same name so it can
# be queried with SQL.
glm_train.createOrReplaceTempView("glm_train")
glm_valid.createOrReplaceTempView("glm_valid")
gbt_train.createOrReplaceTempView("gbt_train")
gbt_valid.createOrReplaceTempView("gbt_valid")


# Disabled reporting of area under ROC / PR via the auc() and pr() helpers.
# print ("GLM Training AUC ROC :" + str(auc(glm_train)))
# print ("GLM Validation AUC ROC :" + str(auc(glm_valid)))
# print ("GBT Training AUC ROC :" + str(auc(gbt_train)))
# print ("GBT Validation AUC ROC :" + str(auc(gbt_valid)))
# print ("************************************************")
# print ("GLM Training AUC PR :" + str(pr(glm_train)))
# print ("GLM Validation AUC PR :" + str(pr(glm_valid)))
# print ("GBT Training AUC PR :" + str(pr(gbt_train)))
# print ("GBT Validation AUC PR :" + str(pr(gbt_valid)))

In [0]:

def auc(pred):
  """Return the area under the ROC curve for a scored DataFrame.

  `pred` must have the columns "p1" and "label"; they are handed, in that
  order, to BinaryClassificationMetrics.
  """
  score_and_label = pred.select("p1", "label").rdd
  return BinaryClassificationMetrics(score_and_label).areaUnderROC

def pr(pred):
  """Return the area under the precision-recall curve for a scored DataFrame.

  `pred` must have the columns "p1" and "label"; they are handed, in that
  order, to BinaryClassificationMetrics.
  """
  score_and_label = pred.select("p1", "label").rdd
  return BinaryClassificationMetrics(score_and_label).areaUnderPR

In [0]:
def preprocess_data(train_df, meta_features=None):
    """
    Fit a preprocessing pipeline on `train_df` and return the transformed data.

    Two modes, selected by `meta_features`:

    * meta_features is None - base-feature mode. Each column in the notebook-level
      `categoricals` list is string-indexed into "<col>_idx" (unseen values kept),
      the `numerics` columns are imputed in place, everything is assembled into
      "features", and a standardised copy is written to "scaledFeatures".
    * meta_features is a list of column names - meta-feature mode. Those columns
      are one-hot encoded into "vec0", "vec1", ... and assembled into "meta_features".

    Args:
        train_df - {SparkDF} data used both to fit the pipeline and to be transformed
        meta_features - (list) column names to encode, or None for base-feature mode

    Returns:
        {SparkDF} `train_df` with the pipeline's output columns added.
    """
    if meta_features is None:
        indexers = [StringIndexer(inputCol=c, outputCol=c + "_idx", handleInvalid='keep')
                    for c in categoricals]
        imputer = Imputer(inputCols=numerics, outputCols=numerics)
        featureCols = [c + "_idx" for c in categoricals] + numerics
        assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=True)
        # Data Preprocessing Pipeline: index -> impute -> assemble -> scale
        pipeline = Pipeline(stages=indexers + [imputer, assembler, scaler])
    else:
        encoded_cols = ['vec{}'.format(i) for i in range(len(meta_features))]
        ohes = OneHotEncoder(inputCols=meta_features, outputCols=encoded_cols)
        vec = VectorAssembler(inputCols=encoded_cols, outputCol='meta_features')
        pipeline = Pipeline(stages=[ohes, vec])

    # The pipeline is fitted and applied on the same DataFrame.
    data_pipeline = pipeline.fit(train_df)
    return data_pipeline.transform(train_df)
 