In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [47]:
# importing the full dataset
# (note that flights_train is not available in github due to size. only in local repository)
chunk = pd.read_csv('../datasets/flights_train_set.csv', 
#                     usecols = usecols, 
                    chunksize=1000000, 
                    low_memory=False)
df_full = pd.concat(chunk)

In [None]:
# importing processing files 
import sys
sys.path.insert(0, '../py_scripts/')
from dataset_processing import *
from feature_generation_for_multiclass import *
from training_and_testing_prep import *

# Model Building

In [20]:
df_full.head(2)

Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2018-07-14,NK,NK,NK,423,NK,N633NK,423,10721,BOS,"Boston, MA",13204,MCO,"Orlando, FL",1550,1540.0,-10.0,12.0,1552.0,1820.0,8.0,1910,1828.0,-42.0,0,,0,N,200.0,168.0,148.0,1,1121,,,,,,,,,
1,1,2018-07-14,NK,NK,NK,424,NK,N684NK,424,12892,LAX,"Los Angeles, CA",13487,MSP,"Minneapolis, MN",1828,1834.0,6.0,17.0,1851.0,2353.0,5.0,2359,2358.0,-1.0,0,,0,N,211.0,204.0,182.0,1,1535,,,,,,,,,


In [14]:
def feature_generation (df_full, save_features=0):
    
    """
    input: none but df_full = flights_csv full dataset after initial cleaning
    and test/train split should already be declared in the notebook.
    
    generates the aggregate features used for model training
    
    returns: either returns the dataframes or it saves them to csv
    """
    tmp = preprocessing_dataset(df_full)
    

    tmp2 = tailnum_delay_taxi_multiclass_params(tmp)
    tmp3, tmp4 = tailnum_hourly_delays_multiclass_params(tmp)
    tmp5 = carrier_branded_dayofweek_delay_multiclass_params(tmp)
    tmp6 = dest_monthly_multiclass_params(tmp)
    tmp7 = origin_monthly_multiclass_params(tmp)
    tmp8 = holiday_multiclass_params(tmp)
    tmp9 = origin_dest_route_dayofweek_multiclass_params(tmp)
    
    # save to file
    if save_features:
        tmp2.to_csv('../data/features_tailnum_delay_taxi_multiclass_params.csv')
        tmp3.to_csv('../data/tailnum_hourly_delays_multiclass_params_dep.csv')
        tmp4.to_csv('../data/tailnum_hourly_delays_multiclass_params_arr.csv')
        tmp5.to_csv('../data/carrier_branded_dayofweek_delay_multiclass_params.csv')
        tmp6.to_csv('../data/dest_monthly_multiclass_params.csv')
        tmp7.to_csv('../data/origin_monthly_multiclass_params.csv')
        tmp8.to_csv('../data/holiday_multiclass_params.csv')
        tmp9.to_csv('../data/origin_dest_route_dayofweek_multiclass_params.csv')
        return tmp
    else:       
        return tmp, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9
        
        
def preprocessing_dataset(df):
    """
    Input: full dataset or a sample dataset of flights_csv after initial cleaning (check duplicates etc)
    returns: clean dataset (no null values) and only records of delayed flights for analysis
    """
    
    # cleaning 'arr_delay' and 'dep_delay'
    # remove any null values that are left after calling the cleaning function
    df = cleaning_delays(df)
    df.dropna(subset=['arr_delay', 'dep_delay'], inplace=True) 
    unused_cols = ['wheels_off', 
                    'wheels_on',
                    'diverted',
                    'cancellation_code',
                    'dup',
                    'first_dep_time',
                    'total_add_gtime',
                    'longest_add_gtime',
                    'no_name']
    df = df.drop(columns=unused_cols) # delete unnecessary cols
    # column for the target labels
    # clean the delay variables, fill with 0, assuming nan delays were 0
    delay_cols = ['carrier_delay', 'weather_delay',
       'nas_delay', 'security_delay', 'late_aircraft_delay'] 
    for col in delay_cols:
        df[col].fillna(0, inplace=True) 
    
    
    # filter out records where there were no delays
    df['isDelay'] = df['arr_delay'].apply(lambda x: 1 if x>0 else None)
    df['isDepDelay'] = df['dep_delay'].apply(lambda x: 1 if x>0 else None)
    df['isDelay'].fillna(df['isDepDelay'], inplace=True)
    df.drop(columns=['isDepDelay'], inplace=True)
    df.dropna(subset='isDelay', inplace=True)
    
    # defining the target (y) labels
    df['target_delay'] = df[delay_cols].idxmax(axis=1) # returns maximum delay
    

    
    return df
    
    
    
def cleaning_delays (df_sample):
    """input flights csv full dataset or sample data
    checks null values for dep_delay and arr_delay 
    against crs_times and actual times to confirm they are null and not 0s
    usually CALLED by preprocessing_dataset
    """
    
    # checking for Null values 
    filter1 = df_sample['dep_delay'].isna()
    filter2 = (df_sample['crs_dep_time'] == df_sample['dep_time'])

    indices = df_sample[(filter1) & (filter2)].index

    for idx in indices:
        df_sample.loc[idx,'dep_delay'] = 0
    
    filter1 = df_sample['arr_delay'].isna()
    filter2 = (df_sample['crs_arr_time'] == df_sample['arr_time'])

    indices = df_sample[(filter1) & (filter2)].index

    for idx in indices:
        df_sample.loc[idx,'arr_delay'] = 0
        
        
    return df_sample
    
    


### All scripts should be run on flights dataset (full or sample) after preprocessing_dataset function.

def tailnum_delay_taxi_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on tail_num
    Output: 
        index / join key: 'tail_num'
        columns: aggregated isCraft and isCarrier delays 
    """  
    
    df_sample['isCraft'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'late_aircraft_delay' else 0)
    df_sample['isCarrier'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'carrier_delay' else 0)

    tailnum_delay_taxi_df = df_sample.groupby('tail_num').agg({'dep_delay': 'median',
                                  'arr_delay' : 'median',
                                  'isCraft' : 'mean',
                                  'isCarrier' : 'mean'      
                                  })
    return tailnum_delay_taxi_df
    
    
    

def tailnum_hourly_delays_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on tail_num and arr_hour and tail_num and dep_hour
    Output: 2 dataframes 
        index / join key: 'tail_num' and arr_hour / dep_hour
        columns: median delays
    """  
    df_sample['isCraft'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'late_aircraft_delay' else 0)
    df_sample['isCarrier'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'carrier_delay' else 0)
    
    # extract hour and minute from crs_time
    df_sample['dep_hour'] = (np.round(df_sample['crs_dep_time'],-2)/100).astype(int)
    df_sample['arr_hour'] = (np.round(df_sample['crs_arr_time'],-2)/100).astype(int)
    
    tailnum_dep_hourly_delays_df = df_sample.groupby(['tail_num', 'dep_hour']).agg({'dep_delay': 'median',
                                          'carrier_delay' :  'median',
                                          'late_aircraft_delay' :  'median',
                                          'isCraft' : 'mean', 
                                          'isCarrier' : 'mean' })
    tailnum_arr_hourly_delays_df = df_sample.groupby(['tail_num', 'arr_hour']).agg({'arr_delay': 'median',
                                          'carrier_delay' :  'median',
                                          'late_aircraft_delay' :  'median',
                                          'isCraft' : 'mean', 
                                          'isCarrier' : 'mean' })
    
    return tailnum_dep_hourly_delays_df, tailnum_arr_hourly_delays_df
    
    
def carrier_branded_dayofweek_delay_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on op_unique_carrier, branded_share, f1_dayofweek
    
    Output: 
        index /join key: op_unique_carrier, branded_share, f1_dayofweek
        columns: median delays and isCarrier
    """      
    

    df_sample['branded_share'] = df_sample['branded_code_share'].apply(lambda x: 1 if len(x)>2 else 0)
    df_sample = df_sample.drop(columns = ['branded_code_share'])
    
    
    df_sample['isCarrier'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'carrier_delay' else 0)


    df_sample['fl_date'] = pd.to_datetime(df_sample['fl_date'])
    df_sample['fl_dayofweek'] = df_sample['fl_date'].dt.dayofweek
    df_sample.drop(columns=['fl_date'], inplace=True)


    carrier_df = df_sample.groupby(['op_unique_carrier', 'branded_share', 'fl_dayofweek'])\
                                    .agg({'dep_delay': 'median',
                                          'arr_delay' : 'median',
                                          'carrier_delay' :  'median',
                                          'late_aircraft_delay' :  'median', 
                                          'isCarrier' : 'mean' })
    return carrier_df

def dest_monthly_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on dest_airport_id, fl_month
    
    Output: 2 dataframes 
        index / join key: dest_airport_id, fl_month
        columns: median delays and isWeather, isSecurity
    """   

    # extract hour and minute from crs_time
    df_sample['fl_date'] = pd.to_datetime(df_sample['fl_date'])
    df_sample['fl_month'] = df_sample['fl_date'].dt.month
    
    df_sample['isWeather'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'weather_delay' else 0)
    df_sample['isSecurity'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'security_delay' else 0)

    
    dest_monthly_params = df_sample.groupby(['dest_airport_id', 'fl_month']).agg({'arr_delay': 'median',
                                  'arr_delay' : 'median',
                                  'carrier_delay': 'median',  
                                  'nas_delay': 'median', 
                                  'late_aircraft_delay': 'median',                                    
                                  'weather_delay' : 'median',
                                  'security_delay' : 'median', 
                                  'isWeather' : 'mean' , 
                                  'isSecurity' : 'mean' ,
                                                                                  
                                  })
    
    dest_monthly_params.index.to_flat_index()
    return dest_monthly_params
    
def origin_monthly_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on origin_airport_id, fl_month
          
    Output: A dataframe
        index / join key: 'origin_airport_id', 'fl_month'
        columns: median delays and isWeather, isSecurity
    """
    

    # extract hour and minute from crs_time
    df_sample['fl_date'] = pd.to_datetime(df_sample['fl_date'])
    df_sample['fl_month'] = df_sample['fl_date'].dt.month

    df_sample['isWeather'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'weather_delay' else 0)
    df_sample['isSecurity'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'security_delay' else 0)
    
    origin_monthly_params = df_sample.groupby(['origin_airport_id', 'fl_month']).agg({'dep_delay': 'median',
                                  'arr_delay' : 'median',
                                  'carrier_delay': 'median',  
                                  'nas_delay': 'median', 
                                  'late_aircraft_delay': 'median',                                    
                                  'weather_delay' : 'median',
                                  'security_delay' : 'median', 
                                  'isWeather' : 'mean', 
                                  'isSecurity' : 'mean',
                                  })
    
    origin_monthly_params.index.to_flat_index()
    return origin_monthly_params
    
def holiday_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on 'holidate', 'origin_airport_id', 'dest_airport_id'
    Output: 
        Index / join key: 'holidate', 'origin_airport_id', 'dest_airport_id'
        columns: median delays,  isWeather isSecurity

    """
       
        
########################################
##  run this if holidays is not available. check the file location first
    us_holidays_df = pd.read_csv('../extra/us_holidays.csv')

    from datetime import timedelta
    holidays = []
    for hol in us_holidays_df['date'].values:
        holstart = pd.to_datetime(hol) - timedelta(days=3)
        holend = pd.to_datetime(hol) + timedelta(days=3)
        holidayweek = pd.date_range(holstart, holend)
        holidays.extend(holidayweek)
#######################
    
    df_sample['isWeather'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'weather_delay' else 0)
    df_sample['isSecurity'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'security_delay' else 0)

    
    # get holidate 
    df_sample['fl_date'] = pd.to_datetime(df_sample['fl_date'])
    df_sample['holidate'] = df_sample['fl_date'].apply(lambda x: 1 if x in holidays else 0)


    holiday_params = df_sample.groupby(['holidate', 'origin_airport_id', 'dest_airport_id']).agg({'dep_delay': 'median',
                              'arr_delay' : 'median',
                              'carrier_delay': 'median',  
                              'nas_delay': 'median', 
                              'late_aircraft_delay': 'median',                                    
                              'weather_delay' : 'median',
                              'security_delay' : 'median',
                              'isWeather' : 'mean', 
                              'isSecurity' : 'mean',
                              })
    
    return holiday_params
        
        
def origin_dest_route_dayofweek_multiclass_params(df_sample):
    """
    Input: flights csv sample or full dataset AFTER preprocessing_dataset
    Aggregates on 'origin_airport_id', 'dest_airport_id', 'fl_dayofweek'
    
    Output: A dataframe
        Index: 'origin_airport_id', 'dest_airport_id', 'fl_dayofweek'
        columns: median delays, isCarrier
    """
    
    df_sample['isCarrier'] = df_sample['target_delay'].\
                            apply (lambda x: 1 if x == 'carrier_delay' else 0)
    
    # get dayofweek
    df_sample['fl_date'] = pd.to_datetime(df_sample['fl_date'])
    df_sample['fl_dayofweek'] = df_sample['fl_date'].dt.dayofweek
    # traffic
    
    params_df = df_sample.groupby(['origin_airport_id', 'dest_airport_id', 
                                   'fl_dayofweek']).agg({'dep_delay': 'median',
                              'arr_delay' : 'median',
                              'carrier_delay': 'median',  
                              'nas_delay': 'median', 
                              'late_aircraft_delay': 'median',                                    
                              'weather_delay' : 'median',
                              'security_delay' : 'median',
                              'isCarrier' : 'mean'
                              })

    params_df['traffic'] = df_sample.groupby(['origin_airport_id', 'dest_airport_id', 'fl_dayofweek']).size()
    
    
    return params_df


In [44]:
def preparing_training_df(train_df):
    """Input: dataset after preprocessing
    check extras for  holidays csv
    Output: 2 datasets: X and y
    """

    ######################################################
    # getting the list of US national holidays
    # run this if 'holidays' is not available. check the file location first
    us_holidays_df = pd.read_csv('../extra/us_holidays.csv')

    from datetime import timedelta
    holidays = []
    for hol in us_holidays_df['date'].values:
        holstart = pd.to_datetime(hol) - timedelta(days=5)
        holend = pd.to_datetime(hol) + timedelta(days=3)
        holidayweek = pd.date_range(holstart, holend)
        holidays.extend(holidayweek)
    #######################################################          


    # EDA: column transformations to integrate with the preprocessed feature tables:
    # binarize branded share
    train_df['branded_share'] = train_df['branded_code_share'].apply(lambda x: 1 if len(x)>2 else 0)
    # extract month and day of week and holidate
    train_df['fl_date'] = pd.to_datetime(train_df['fl_date'])
    train_df['fl_month'] = train_df['fl_date'].dt.month
    train_df['fl_dayofweek'] = train_df['fl_date'].dt.dayofweek
    train_df['fl_date'] = pd.to_datetime(train_df['fl_date'])
    train_df['holidate'] = train_df['fl_date'].apply(lambda x: 1 if x in holidays else 0)

    # extract flight hour
    train_df['dep_hour'] = (np.round(train_df['crs_dep_time'],-2)/100).astype(int)
    train_df['arr_hour'] = (np.round(train_df['crs_arr_time'],-2)/100).astype(int)

    # drop irrelevant columns  
    train_df.drop(columns = ['mkt_unique_carrier', 'mkt_carrier',
                         'mkt_carrier_fl_num',
                         'op_carrier_fl_num',
                         'origin', 'origin_city_name'],
                         inplace=True)

    train_df.drop(columns = ['dest', 'dest_city_name',
                         'crs_elapsed_time',
                         'flights',
                         'fl_date',
                         'crs_dep_time', 'crs_arr_time',
                         'branded_code_share'],
                          inplace=True)

    delay_cols = ['carrier_delay', 'weather_delay',
                   'nas_delay', 'security_delay', 'late_aircraft_delay'] 

#     # defining the target (y) labels
#     df['target'] = df[delay_cols].idxmax(axis=1) # returns maximum delay

    # remove delays from dataset
    train_df.drop(columns=delay_cols, inplace=True)

    train_df.drop(columns = ['dep_time',
                       'dep_delay', 'taxi_out', 'taxi_in', 'arr_time',
                       'arr_delay', 'cancelled','actual_elapsed_time',
                       'air_time', 'isDelay'],
                              inplace=True)
 
    ################# calling features tables
    
    # merging the testing dataset with the features tables of aggregate values
    # thereby converting categorical and ordinal columns to continuous values
    tmp = train_df
    train_df = tmp.merge(features_2, 
                  left_on=['tail_num'], 
                  right_on=['tail_num'], how='left').merge(features_3,
                  left_on=['tail_num','dep_hour'],
                  right_on=['tail_num','dep_hour']).merge(features_4,
                  left_on=['tail_num','arr_hour'],
                  right_on=['tail_num','arr_hour']).merge(features_5,
                  left_on=['op_unique_carrier', 'branded_share', 'fl_dayofweek'], 
                  right_on=['op_unique_carrier', 'branded_share', 'fl_dayofweek'],
                  suffixes=('_', '_carrier')).merge(features_6,
                  left_on=['dest_airport_id', 'fl_month'], 
                  right_on=['dest_airport_id', 'fl_month'],
                  suffixes=('_', '_dest')).merge(features_7,
                  left_on=['origin_airport_id', 'fl_month'], 
                  right_on=['origin_airport_id', 'fl_month'],
                  suffixes=('_', '_origin')).merge(features_8,                                
                  left_on=['holidate', 'origin_airport_id', 'dest_airport_id'], 
                  right_on=['holidate', 'origin_airport_id', 'dest_airport_id'],
                  suffixes=('_', '_holidate')).merge(features_9,                               
                  left_on=['origin_airport_id', 'dest_airport_id', 'fl_dayofweek'], 
                  right_on=['origin_airport_id', 'dest_airport_id', 'fl_dayofweek'],
                  suffixes=('_', '_route'))
    
    # dropping irrelevant columns
    train_y = train_df['target_delay']
    
    train_X = train_df.drop(columns = ['op_unique_carrier',
                       'tail_num',
                       'origin_airport_id',
                       'dest_airport_id', 'target_delay',
                                       'fl_month',
                                      'fl_dayofweek'])
    
    return train_y, train_X
        





def preparing_test_dataset(df):
    """Input: 'raw' testing dataframe from csv file
    This function takes the raw pd.read_csv('flights_test.csv') 
        applies the engineered feature aggregations
        and restructures it to the form it needs to be 
        for Scaling and Model Predicting
    Output: X_test. 
    """
    
    ######################################################
    # getting the list of US national holidays
    # run this if 'holidays' is not available. check the file location first
    us_holidays_df = pd.read_csv('extra/us_holidays.csv')

    from datetime import timedelta
    holidays = []
    for hol in us_holidays_df['date'].values:
        holstart = pd.to_datetime(hol) - timedelta(days=5)
        holend = pd.to_datetime(hol) + timedelta(days=3)
        holidayweek = pd.date_range(holstart, holend)
        holidays.extend(holidayweek)
    #######################################################   
    
    # column transformations to integrate with the preprocessed feature tables:  

    # binarize branded share
    df['branded_share'] = df['branded_code_share'].apply(lambda x: 1 if len(x)>2 else 0)
    # extract month and day of week and holidate
    df['fl_date'] = pd.to_datetime(df['fl_date'])
    df['fl_month'] = df['fl_date'].dt.month
    df['fl_dayofweek'] = df['fl_date'].dt.dayofweek
    df['fl_date'] = pd.to_datetime(df['fl_date'])
    df['holidate'] = df['fl_date'].apply(lambda x: 1 if x in holidays else 0)
     # extract flight hour
    df['dep_hour'] = (np.round(df['crs_dep_time'],-2)/100).astype(int)
    df['arr_hour'] = (np.round(df['crs_arr_time'],-2)/100).astype(int)   
    # drop irrelevant columns   
    df.drop(columns = ['dup', 'mkt_unique_carrier', 'mkt_carrier',
                         'mkt_carrier_fl_num',
                         'op_carrier_fl_num',
                         'origin', 'origin_city_name',
                         'dest', 'dest_city_name',
                         'crs_elapsed_time',
                         'flights',
                            'fl_date',
                            'crs_dep_time', 'crs_arr_time',
                         'branded_code_share'],
              inplace=True) 
    
    ################# calling features tables
    
    # merging the testing dataset with the features tables of aggregate values
    # thereby converting categorical and ordinal columns to continuous values
    tmp = df
    df = tmp.merge(features_features_2, 
                  left_on=['tail_num'], 
                  right_on=['tail_num'], how='left').merge(features_features_3,
                  left_on=['tail_num','dep_hour'],
                  right_on=['tail_num','dep_hour']).merge(features_features_4,
                  left_on=['tail_num','arr_hour'],
                  right_on=['tail_num','arr_hour']).merge(features_features_5,
                  left_on=['op_unique_carrier', 'branded_share', 'fl_dayofweek'], 
                  right_on=['op_unique_carrier', 'branded_share', 'fl_dayofweek'],
                  suffixes=('_', '_carrier')).merge(features_features_6,
                  left_on=['dest_airport_id', 'fl_month'], 
                  right_on=['dest_airport_id', 'fl_month'],
                  suffixes=('_', '_dest')).merge(features_features_7,
                  left_on=['origin_airport_id', 'fl_month'], 
                  right_on=['origin_airport_id', 'fl_month'],
                  suffixes=('_', '_origin')).merge(features_features_8,                                
                  left_on=['holidate', 'origin_airport_id', 'dest_airport_id'], 
                  right_on=['holidate', 'origin_airport_id', 'dest_airport_id'],
                  suffixes=('_', '_holidate')).merge(features_features_9,                               
                  left_on=['origin_airport_id', 'dest_airport_id', 'fl_dayofweek'], 
                  right_on=['origin_airport_id', 'dest_airport_id', 'fl_dayofweek'],
                  suffixes=('_', '_route'))
    
    df.drop(columns = ['op_unique_carrier',
                       'tail_num',
                       'origin_airport_id',
                       'dest_airport_id',
                                       'fl_month',
                                      'fl_dayofweek'],
              inplace=True)


    return df


In [43]:
features_2 = pd.read_csv('../data/features_tailnum_delay_taxi_multiclass_params.csv')
features_3 = pd.read_csv('../data/tailnum_hourly_delays_multiclass_params_dep.csv')
features_4 = pd.read_csv('../data/tailnum_hourly_delays_multiclass_params_arr.csv')
features_5 = pd.read_csv('../data/carrier_branded_dayofweek_delay_multiclass_params.csv')
features_6 = pd.read_csv('../data/dest_monthly_multiclass_params.csv')
features_7 = pd.read_csv('../data/origin_monthly_multiclass_params.csv')
features_8 = pd.read_csv('../data/holiday_multiclass_params.csv')
features_9 = pd.read_csv('../data/origin_dest_route_dayofweek_multiclass_params.csv')

In [None]:
df.drop(columns = 'Unnamed: 0', inplace=True)
df = feature_generation(df_full, 1)
# this function calls the necessary functions to clean the dataset 
# and generate the aggregate features for model training
# save_features = True saves the files into the local directory, save_features False returns them

## Step 1: Testing on a sample dataset

In [None]:
# reload full dataset
chunk = pd.read_csv('../datasets/flights_train_set.csv', 
#                     usecols = usecols, 
                    chunksize=1000000, 
                    low_memory=False)
df_full = pd.concat(chunk)
df_full.drop(columns = 'Unnamed: 0', inplace=True)

### Preparing the training dataset

In [75]:
# get a sample
df_sample = df_full.sample(n=3000000)

# getting the target and value datasets
tmp_train_batch = preprocessing_dataset(df_sample)
y_batch, X_batch = preparing_training_df(tmp_train_batch)

In [83]:
y_batch.shape, X_batch.shape # check shape

((1323842,), (1323842, 59))

In [84]:
y_batch.value_counts()

carrier_delay          901069
late_aircraft_delay    226213
nas_delay              176164
weather_delay           19408
security_delay            988
Name: target_delay, dtype: int64

Due to the unbalanced classes, we have to strategically create sample data for training.

We will try 2 strategies: 

1. underbalancing the bigger classes or 

2. overbalancing the smaller classes.

In [85]:
# creating filters for the target classes
Y_carrier = y_batch == 'carrier_delay'
Y_aircraft = y_batch == 'late_aircraft_delay'
Y_nas = y_batch == 'nas_delay'
Y_weather = y_batch == 'weather_delay'
Y_security = y_batch == 'security_delay'

# to get the indices of the records
y_carrier_index = y_batch[Y_carrier].index
y_security_index = y_batch[Y_security].index
y_nas_index = y_batch[Y_nas].index
y_weather_index = y_batch[Y_weather].index
y_aircraft_index = y_batch[Y_aircraft].index

#### 1. underbalancing the bigger classes

In [86]:
# resize the sample size of the classes so they're equal
y_security_training_index = y_security_index
y_nas_training_index = np.random.choice(y_nas_index, size = 324)
y_weather_training_index = np.random.choice(y_weather_index, size = 324)
y_carrier_training_index = np.random.choice(y_carrier_index, size = 324)
y_air_training_index = np.random.choice(y_aircraft_index, size = 324)


# add them together and shuffle
shuffle_index = np.concatenate((y_security_training_index,
                  y_nas_training_index,
                  y_weather_training_index,
                  y_carrier_training_index,
                  y_air_training_index))
np.random.shuffle(shuffle_index)
np.random.shuffle(shuffle_index) # to be doubly sure

# make X, the parameters based on the shuffle_index

X_train = X_batch.iloc[shuffle_index]
y_train = y_batch.iloc[shuffle_index] 

In [87]:
y_train.value_counts()

security_delay         988
nas_delay              324
carrier_delay          324
weather_delay          324
late_aircraft_delay    324
Name: target_delay, dtype: int64

**Note**: That a separate test_dataset was reserved before feature generation to avoid data leak. So we will not be using train test split

### Preparing the testing dataset

In [111]:
# prepare test data with train test split
# import numpy as np
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.20, random_state=88)

test_full = preprocessing_dataset(pd.read_csv('../datasets/flights_test_set.csv'))

In [112]:
test_batch = test_full
y_test, X_test = preparing_training_df(test_batch)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4645141, 59), (620613, 59), (4645141,), (620613,))

In [113]:
y_test.value_counts() # checking that it has all the labels we need

carrier_delay          422016
late_aircraft_delay    106405
nas_delay               82603
weather_delay            9091
security_delay            498
Name: target_delay, dtype: int64

* Now that the datasets are ready, model training and testing starts here...
* Our first model is Linear Regression

In [114]:
# scale the data
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

# train model
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=88, max_iter=1500)
clf.fit(X_scaled, y_train)

# testing the test data
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)
clf.score(X_test_scaled, y_test)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#### 2. overbalancing the smaller classes

In [95]:
y_batch.value_counts()

carrier_delay          901069
late_aircraft_delay    226213
nas_delay              176164
weather_delay           19408
security_delay            988
Name: target_delay, dtype: int64

In [107]:
# resize the sample size of the classes so they're equal
y_carrier_training_index = y_carrier_index
# y_nas_training_index = np.random.choice(y_nas_index, size = 102820)
# y_weather_training_index = np.random.choice(y_weather_index, size = 102820)
# y_carrier_training_index = np.random.choice(y_carrier_index, size = 57791)
# y_air_training_index = np.random.choice(y_aircraft_index, size = 57791)
# y_nas_training_index = y_nas_index

tmp = []
for i in range (4):
    tmp.extend(y_aircraft_index)
y_air_training_index = tmp

tmp = []
for i in range (5):
    tmp.extend(y_nas_index)
y_nas_training_index = tmp


tmp = []
for i in range (50):
    tmp.extend(y_weather_index)
y_weather_training_index = tmp

tmp = []
for i in range (1000):
    tmp.extend(y_security_index)
y_security_training_index = tmp

In [108]:
import numpy as np

In [110]:
# add them together and shuffle
shuffle_index = np.concatenate((y_security_training_index,
                  y_nas_training_index,
                  y_weather_training_index,
                  y_carrier_training_index,
                  y_air_training_index))
np.random.shuffle(shuffle_index)
np.random.shuffle(shuffle_index)
# make X, the parameters based on the shuffle_index

X_train = X_batch.iloc[shuffle_index]
y_train = y_batch.iloc[shuffle_index] 
y_train.value_counts()

security_delay         988000
weather_delay          970400
late_aircraft_delay    904852
carrier_delay          901069
nas_delay              880820
Name: target_delay, dtype: int64

In [None]:
# scale and train model
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=88, max_iter=1000)
clf.fit(X_scaled, y_train)

# testing the test data
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)
clf.score(X_test_scaled, y_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

## Step 2: Testing on the entire dataset