# Project Overview

## OSEMN Pipeline

* O - Obtaining our data
* S - Scrubbing/Cleaning our data
* E - Exploring/Visualizing our data
* M - Modeling our data
* N - iNterpreting our data

## Notebook Preparation

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report,confusion_matrix, plot_confusion_matrix, accuracy_score, f1_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb


from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Obtaining our Data

In [2]:
# Load the December on-time reporting extract (path is relative to the notebook's working directory)
df = pd.read_csv('data/ONTIME_REPORTING_12.csv')
# (rows, columns) of the raw extract
df.shape

(625763, 33)

In [3]:
df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,12,8,7,WN,N8651A,3689,15016,STL,"St. Louis, MO",14679,...,245.0,266.0,1557.0,7,0.0,0.0,18.0,0.0,0.0,
1,12,8,7,WN,N939WN,2600,15016,STL,"St. Louis, MO",14683,...,145.0,125.0,786.0,4,,,,,,
2,12,8,7,WN,N7741C,2770,15016,STL,"St. Louis, MO",14683,...,140.0,131.0,786.0,4,,,,,,
3,12,8,7,WN,N550WN,6654,15016,STL,"St. Louis, MO",14747,...,275.0,256.0,1709.0,7,,,,,,
4,12,8,7,WN,N8319F,3402,15016,STL,"St. Louis, MO",14771,...,270.0,256.0,1735.0,7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625758,12,31,2,B6,N193JB,846,13204,MCO,"Orlando, FL",15070,...,163.0,151.0,989.0,4,52.0,0.0,0.0,0.0,0.0,
625759,12,31,2,B6,N304JB,854,11278,DCA,"Washington, DC",10721,...,90.0,79.0,399.0,2,,,,,,
625760,12,31,2,B6,N193JB,860,14100,PHL,"Philadelphia, PA",10721,...,85.0,59.0,280.0,2,,,,,,
625761,12,31,2,B6,N563JB,861,10721,BOS,"Boston, MA",14843,...,242.0,216.0,1674.0,7,,,,,,


In [4]:
df.memory_usage().sum()

165201560

In [10]:
df.dtypes

MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_UNIQUE_CARRIER       object
TAIL_NUM                object
OP_CARRIER_FL_NUM        int64
ORIGIN_AIRPORT_ID        int64
ORIGIN                  object
ORIGIN_CITY_NAME        object
DEST_AIRPORT_ID          int64
DEST                    object
DEST_CITY_NAME          object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY_NEW          float64
DEP_DEL15              float64
DEP_TIME_BLK            object
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY_NEW          float64
ARR_TIME_BLK            object
CANCELLED              float64
CANCELLATION_CODE       object
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
DISTANCE               float64
DISTANCE_GROUP           int64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
Unnamed:

In [5]:
df.columns

Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM',
       'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_AIRPORT_ID', 'DEST', 'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY_NEW', 'DEP_DEL15', 'DEP_TIME_BLK', 'CRS_ARR_TIME',
       'ARR_TIME', 'ARR_DELAY_NEW', 'ARR_TIME_BLK', 'CANCELLED',
       'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME',
       'DISTANCE', 'DISTANCE_GROUP', 'CARRIER_DELAY', 'WEATHER_DELAY',
       'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 32'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,12,8,7,WN,N8651A,3689,15016,STL,"St. Louis, MO",14679,...,245.0,266.0,1557.0,7,0.0,0.0,18.0,0.0,0.0,
1,12,8,7,WN,N939WN,2600,15016,STL,"St. Louis, MO",14683,...,145.0,125.0,786.0,4,,,,,,
2,12,8,7,WN,N7741C,2770,15016,STL,"St. Louis, MO",14683,...,140.0,131.0,786.0,4,,,,,,
3,12,8,7,WN,N550WN,6654,15016,STL,"St. Louis, MO",14747,...,275.0,256.0,1709.0,7,,,,,,
4,12,8,7,WN,N8319F,3402,15016,STL,"St. Louis, MO",14771,...,270.0,256.0,1735.0,7,,,,,,


In [7]:
df.describe()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
count,625763.0,625763.0,625763.0,625763.0,625763.0,625763.0,625763.0,620253.0,620253.0,620253.0,...,625763.0,618612.0,625763.0,625763.0,126945.0,126945.0,126945.0,126945.0,126945.0,0.0
mean,12.0,15.839891,3.937692,2661.786221,12663.122324,12663.160129,1329.26204,1336.141961,15.151851,0.206621,...,144.801207,138.839012,810.330363,3.714075,21.149427,3.067029,15.141581,0.08729,28.177817,
std,0.0,8.977102,2.088807,1816.726749,1525.572508,1525.53956,496.707166,510.127095,50.090133,0.404881,...,72.840669,73.090522,593.150175,2.330364,68.206656,32.129444,35.088202,2.225987,55.918673,
min,12.0,1.0,1.0,1.0,10135.0,10135.0,1.0,1.0,0.0,0.0,...,22.0,16.0,31.0,1.0,0.0,0.0,0.0,0.0,0.0,
25%,12.0,8.0,2.0,1106.0,11292.0,11292.0,912.0,916.0,0.0,0.0,...,91.0,86.0,373.0,2.0,0.0,0.0,0.0,0.0,0.0,
50%,12.0,16.0,4.0,2295.0,12889.0,12889.0,1323.0,1329.0,0.0,0.0,...,127.0,121.0,650.0,3.0,1.0,0.0,2.0,0.0,6.0,
75%,12.0,23.0,6.0,4102.0,14027.0,14027.0,1736.0,1747.0,9.0,0.0,...,175.0,170.0,1048.0,5.0,18.0,0.0,19.0,0.0,34.0,
max,12.0,31.0,7.0,6981.0,16869.0,16869.0,2359.0,2400.0,2710.0,1.0,...,705.0,718.0,5095.0,11.0,2695.0,1847.0,1237.0,366.0,1627.0,


In [8]:
df.isna().sum()

MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
OP_UNIQUE_CARRIER           0
TAIL_NUM                  457
OP_CARRIER_FL_NUM           0
ORIGIN_AIRPORT_ID           0
ORIGIN                      0
ORIGIN_CITY_NAME            0
DEST_AIRPORT_ID             0
DEST                        0
DEST_CITY_NAME              0
CRS_DEP_TIME                0
DEP_TIME                 5510
DEP_DELAY_NEW            5510
DEP_DEL15                5510
DEP_TIME_BLK                0
CRS_ARR_TIME                0
ARR_TIME                 6045
ARR_DELAY_NEW            7151
ARR_TIME_BLK                0
CANCELLED                   0
CANCELLATION_CODE      619970
CRS_ELAPSED_TIME            0
ACTUAL_ELAPSED_TIME      7151
DISTANCE                    0
DISTANCE_GROUP              0
CARRIER_DELAY          498818
WEATHER_DELAY          498818
NAS_DELAY              498818
SECURITY_DELAY         498818
LATE_AIRCRAFT_DELAY    498818
Unnamed: 32            625763
dtype: int

# Scrubbing/Cleaning our Data

In [None]:
def month_cleanup(monthly_data):
    """Filter, bucket and slim down one month of on-time flight records.

    The frame is modified in place and the very same object is returned.

    Steps:
      1. Drop rows with no departure time, no tail number, or CANCELLED == 1.
      2. Collapse the hourly DEP_TIME_BLK labels into six coarse DEP_BLOCK labels.
      3. Drop the columns that are not used downstream.
      4. Cast the remaining fields to the dtypes used by the rest of the notebook.

    Parameters
    ----------
    monthly_data : pandas.DataFrame
        Raw monthly extract; must contain every column referenced below.

    Returns
    -------
    pandas.DataFrame
        ``monthly_data`` itself, after the in-place clean-up.
    """
    # 1. rows we cannot use: never departed, unknown aircraft, or cancelled
    unusable = (
        monthly_data['DEP_TIME'].isna()
        | monthly_data['TAIL_NUM'].isna()
        | (monthly_data['CANCELLED'] == 1)
    )
    monthly_data.drop(index=monthly_data.loc[unusable].index, inplace=True)

    # 2. coarse departure blocks, keyed by label -> hourly blocks they absorb
    block_members = {
        'LATE_NIGHT': ['2100-2159', '2200-2259', '2300-2359'],
        'EARLY_MORNING': ['0001-0559'],
        'MORNING': ['0600-0659', '0700-0759', '0800-0859', '0900-0959'],
        'MIDDAY': ['1000-1059', '1100-1159', '1200-1259'],
        'AFTERNOON': ['1300-1359', '1400-1459', '1500-1559', '1600-1659'],
        'EVENING': ['1700-1759', '1800-1859', '1900-1959', '2000-2059'],
    }
    hourly_block = monthly_data['DEP_TIME_BLK']
    for coarse_label, hourly_labels in block_members.items():
        # rows whose hourly block is not listed anywhere keep a missing DEP_BLOCK
        monthly_data.loc[hourly_block.isin(hourly_labels), 'DEP_BLOCK'] = coarse_label

    # 3. columns that are not used downstream
    unused_columns = [
        'MONTH', 'ORIGIN', 'DEST',
        'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME',
        'CANCELLED', 'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'DISTANCE',
        'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
        'ARR_DELAY_NEW', 'Unnamed: 32', 'DEP_TIME_BLK', 'ARR_TIME_BLK', 'ACTUAL_ELAPSED_TIME',
        'DEST_AIRPORT_ID', 'DEST_CITY_NAME',
    ]
    monthly_data.drop(columns=unused_columns, inplace=True)

    # 4. explicit dtypes: small ints for numeric fields, object for categorical ones
    target_dtypes = {
        'DAY_OF_MONTH': 'int8',
        'DAY_OF_WEEK': 'object',
        'OP_CARRIER_FL_NUM': 'object',
        'DEP_DEL15': 'int8',
        'DISTANCE_GROUP': 'int8',
        'DEP_BLOCK': 'object',
    }
    for column, dtype in target_dtypes.items():
        monthly_data[column] = monthly_data[column].astype(dtype)

    return monthly_data

In [None]:
# Apply the clean-up defined above. month_cleanup mutates its argument and drops
# columns it needs as input, so guard on DEP_BLOCK (which only the clean-up creates)
# to keep this cell safe to re-run.
if 'DEP_BLOCK' not in df.columns:
    df = month_cleanup(df)

df.head()

### Feature Engineering

#### Flight Sequence Order

In [None]:
# Distinct carriers, days of the month and flight numbers present in df
carriers = list(df['OP_UNIQUE_CARRIER'].unique())
days = list(df['DAY_OF_MONTH'].unique())
flight_nums = list(df['OP_CARRIER_FL_NUM'].unique())

# Debugging aid: uncomment to restrict the sequence-rank loop to one carrier and one day
#carriers = ['WN']
#days = [1]

In [None]:
# Apply sequence order to same-day flight-number sequences.
#
# For every (carrier, day of month, flight number) group, rank the rows by actual
# departure time (average rank for ties, NaN departure times stay NaN - the pandas
# defaults). One grouped rank replaces the nested carrier/day/flight loops, each
# iteration of which re-scanned the whole frame with a boolean mask.
df['sequence_rank'] = (
    df.groupby(['OP_UNIQUE_CARRIER', 'DAY_OF_MONTH', 'OP_CARRIER_FL_NUM'])['DEP_TIME']
      .rank()
)

# Spot-check: the flights of the last carrier on the last day in the lists built above
df.loc[(df['OP_UNIQUE_CARRIER'] == carriers[-1]) & (df['DAY_OF_MONTH'] == days[-1])]

In [None]:
# Get Airline ID into main frame in order to join aircraft info

#airline_id = pd.read_csv('data/airline_id.csv')
#airline_id.drop_duplicates(inplace=True, keep='first')
#airline_id

In [None]:
# Merge Airline ID onto main frame

#merge1 = df.merge(airline_id, how="left", left_on='OP_UNIQUE_CARRIER', right_on='UNIQUE_CARRIER')
#merge1

In [None]:
# Check we got all Airline ID
#merge1.isna().sum()

#### Airplane Seat Count

In [None]:
# Load airplane info so we can get seat count
aircraft = pd.read_csv("data/aircraft_type_by_tail_number.csv")
aircraft

In [None]:
# Drop unneeded columns. All we need is the Tail Number and the number of seats.
# errors='ignore' + rebinding (instead of inplace=True) keeps this cell re-runnable.
aircraft = aircraft.drop(columns=['MANUFACTURER', 'MODEL', 'Unnamed: 4'], errors='ignore')

# Merge aircraft info with main frame on tail number.
# The inner join is what discards flights whose tail number has no aircraft record.
# NOTE(review): if a TAIL_NUMBER appears more than once in the aircraft file, the
# matching flights are duplicated by this merge - confirm the file is unique per tail.
final = df.merge(aircraft, how="inner", left_on='TAIL_NUM', right_on='TAIL_NUMBER')

# The join keys are no longer needed once the seat count is attached
final = final.drop(columns=['TAIL_NUM', 'TAIL_NUMBER'])

# simplify data type of number of seats to reduce memory usage
final['NUMBER_OF_SEATS'] = final['NUMBER_OF_SEATS'].astype('int16')

final.head()

#### Airport Coordinates

In [None]:
# load coordinates of airports

coords = pd.read_csv('data/airport_coordinates.csv')
# errors="ignore" + rebinding keeps the cell re-runnable if the column is already gone
coords = coords.drop(columns="Unnamed: 3", errors="ignore")

# one dict per airport row, e.g. {'AIRPORT_ID': ..., 'LATITUDE': ..., 'LONGITUDE': ...}
coords_dict = coords.to_dict(orient='records')

# preview only - the full list is long
coords_dict[:5]

In [None]:
# NOTE: this binds a second name to the same DataFrame object as `final`; it is NOT a copy,
# so the coordinate columns added through `df_with_coords` also appear on `final`.

df_with_coords = final

# Placeholder values for the origin coordinate columns
df_with_coords['ORIGIN_LAT'] = 0
df_with_coords['ORIGIN_LONG'] = 0

In [None]:
# Attach origin coordinates with one vectorised lookup per column instead of
# re-masking the whole frame once per airport.

# airport id -> coordinate; for a repeated AIRPORT_ID the later record wins,
# exactly as it would when assigning record by record
lat_by_airport = {item['AIRPORT_ID']: item['LATITUDE'] for item in coords_dict}
long_by_airport = {item['AIRPORT_ID']: item['LONGITUDE'] for item in coords_dict}

origin_ids = df_with_coords['ORIGIN_AIRPORT_ID']
has_coords = origin_ids.isin(list(lat_by_airport))

# airports missing from the coordinates file keep the placeholder value 0
df_with_coords['ORIGIN_LAT'] = origin_ids.map(lat_by_airport).where(has_coords, 0)
df_with_coords['ORIGIN_LONG'] = origin_ids.map(long_by_airport).where(has_coords, 0)

#### Concurrent Flights

In [None]:
# Number of flights per (origin airport, day of month, departure block).
# count() tallies non-null OP_UNIQUE_CARRIER values within each group.
block_keys = ['ORIGIN_AIRPORT_ID', 'DAY_OF_MONTH', 'DEP_BLOCK']
airport_business = (
    final.groupby(block_keys)['OP_UNIQUE_CARRIER']
    .count()
    .reset_index(name='FLIGHTS_IN_BLOCK')
)
airport_business

In [None]:
def find_flights(line):
    """Return the number of flights sharing this row's origin airport, day and departure block.

    Looks the row's (ORIGIN_AIRPORT_ID, DAY_OF_MONTH, DEP_BLOCK) combination up in the
    module-level ``airport_business`` table.

    Parameters
    ----------
    line : mapping / pandas.Series
        One flight record providing 'ORIGIN_AIRPORT_ID', 'DAY_OF_MONTH' and 'DEP_BLOCK'.

    Returns
    -------
    scalar
        The FLIGHTS_IN_BLOCK value of the first matching row.

    Raises
    ------
    IndexError
        If ``airport_business`` has no row for this combination.
    """
    same_slot = (
        (airport_business['ORIGIN_AIRPORT_ID'] == line['ORIGIN_AIRPORT_ID'])
        & (airport_business['DAY_OF_MONTH'] == line['DAY_OF_MONTH'])
        & (airport_business['DEP_BLOCK'] == line['DEP_BLOCK'])
    )
    # Select the column by name (not by a one-element list) so that .iloc[0]
    # yields the count itself rather than a one-element Series.
    counts = airport_business.loc[same_slot, 'FLIGHTS_IN_BLOCK']
    return counts.iloc[0]

In [None]:
# Flights departing from the same origin airport, on the same day, in the same
# departure block (the row itself included). transform('count') broadcasts each
# group's count back onto its rows in one pass, instead of filtering a lookup
# table once per row through DataFrame.apply.
final['CONCURRENT_FLIGHTS'] = (
    final.groupby(['ORIGIN_AIRPORT_ID', 'DAY_OF_MONTH', 'DEP_BLOCK'])['OP_UNIQUE_CARRIER']
    .transform('count')
)

In [None]:
final

In [None]:
# SAVE FILE

#final.to_pickle("final_flights.pkl")


In [None]:
# LOAD FILE
# NOTE(review): this rebinds `final` to the saved frame, discarding whatever the cells above built.
# Pickle files can execute arbitrary code when loaded - only load files you created yourself.
final = pd.read_pickle("final_flights.pkl")

In [None]:
final

In [None]:
wn = final.loc[final['OP_UNIQUE_CARRIER']=='WN']
wn

In [None]:
wn.groupby('OP_CARRIER_FL_NUM').count()

In [None]:
wn.loc[(wn['OP_CARRIER_FL_NUM']==1) & (wn['DAY_OF_WEEK']==4)]

# Exploring/Visualizing Data

## Train/Test Split

In [None]:
randomstate = 42
testsize = .2

In [None]:
# Define appropriate X and y
# Target is DEP_DEL15; every other column of `final` is used as a predictor.
# NOTE(review): if `final` still carries DEP_TIME (the actual departure time), confirm
# that it is acceptable as a predictor and does not leak the outcome.

predictors = final.drop('DEP_DEL15', axis=1)
y = final['DEP_DEL15']

# stratify=y keeps the class proportions of DEP_DEL15 the same in both splits
x_train, x_test, y_train, y_test = train_test_split(predictors, y, test_size=testsize, random_state=randomstate, stratify=y)

In [None]:
print('Raw counts: \n')
print(final['DEP_DEL15'].value_counts())
print('-----------------------------------')
print('Normalized counts: \n')
print(final['DEP_DEL15'].value_counts(normalize=True))

In [None]:
final

In [None]:
sns.countplot(x=y, data=predictors)
plt.show()

In [None]:
# Column means for on-time vs. delayed flights. numeric_only=True skips the
# string/object columns, which newer pandas versions refuse to average.
final.groupby('DEP_DEL15').mean(numeric_only=True)

In [None]:
plt.figure(figsize=(25,10))

sns.scatterplot(data=final, x="ORIGIN_LONG", y="ORIGIN_LAT", hue="DEP_DEL15", palette="magma_r");

In [None]:
# Keep origins inside a longitude/latitude bounding box (-130..-50, 20..50)
temp = final.loc[(final['ORIGIN_LONG'] > -130) & (final['ORIGIN_LONG'] < -50) & (final['ORIGIN_LAT'] > 20) & (final['ORIGIN_LAT'] < 50)]

# Per-airport means of the numeric columns (object columns cannot be averaged)
temp2 = temp.groupby('ORIGIN_AIRPORT_ID').mean(numeric_only=True)

# Leave out airports where every flight was delayed (mean DEP_DEL15 == 1);
# rebinding instead of an in-place drop keeps the cell re-runnable
temp2 = temp2.loc[temp2['DEP_DEL15'] != 1]
temp2

In [None]:
plt.figure(figsize=(30,20))

# One point per origin airport: position = coordinates, colour = mean DEP_DEL15,
# size = mean CONCURRENT_FLIGHTS.
# (The former `sns.scale=2` line only set an unused attribute on the seaborn module
# and had no effect on the plot, so it has been removed.)
sns.scatterplot(data=temp2, x='ORIGIN_LONG', y="ORIGIN_LAT", hue="DEP_DEL15", palette="plasma_r", size='CONCURRENT_FLIGHTS');

In [None]:
sns.set(font_scale = 1.5) #increasing our font size a bit


# Bar height is the mean of DEP_DEL15 for each carrier.
# NOTE(review): the y label says "Percent", but the mean of a 0/1 flag is a 0-1 fraction.
g = sns.catplot(
    data=final, kind="bar",
    x="OP_UNIQUE_CARRIER", y="DEP_DEL15",
    ci=None, palette="dark", alpha=.6, height=8, aspect=2
)
g.despine(left=True)
g.set_axis_labels("Airline", "Percent Delayed Flights")


In [None]:
sns.set(font_scale = 1.5) #increasing our font size a bit


# Bar height is the mean of DEP_DEL15 for each day of the week.
# NOTE(review): the y label says "Percent", but the mean of a 0/1 flag is a 0-1 fraction.
g = sns.catplot(
    data=final, kind="bar",
    x="DAY_OF_WEEK", y="DEP_DEL15",
    ci=None, palette="dark", alpha=.6, height=8, aspect=2
)
g.despine(left=True)
g.set_axis_labels("Day of Week", "Percent Delayed Flights")


In [None]:
sns.set(font_scale = 1.5) #increasing our font size a bit

# Mean of DEP_DEL15 per carrier, split into one bar per day of the week (hue).
# NOTE(review): the y label says "Percent", but the mean of a 0/1 flag is a 0-1 fraction.
g = sns.catplot(
    data=final, kind="bar",
    x="OP_UNIQUE_CARRIER", y="DEP_DEL15", hue='DAY_OF_WEEK',
    ci=None, palette="dark", alpha=.6, height=8, aspect=2
)
g.despine(left=True)
g.set_axis_labels("Airline", "Percent Delayed Flights")


In [None]:
sns.set(font_scale = 1.5) #increasing our font size a bit

# Mean of DEP_DEL15 for each origin city
g = sns.catplot(
    data=final, kind="bar",
    x="ORIGIN_CITY_NAME", y="DEP_DEL15",
    ci=None, palette="dark", alpha=.6, height=8, aspect=2
)
g.despine(left=True)
# x is the origin city and y the mean of the 0/1 delay flag, so label them as such
g.set_axis_labels("Origin City", "Share of Delayed Flights (mean DEP_DEL15)")
# one tick per city: rotate the labels so they do not overlap
g.set_xticklabels(rotation=90)


In [None]:
# Column means per carrier. numeric_only=True skips the string/object columns,
# which newer pandas versions refuse to average.
final.groupby('OP_UNIQUE_CARRIER').mean(numeric_only=True)

## Construct Pipelines

In [None]:
# Columns stored with object dtype: these are one-hot encoded by the preprocessing pipeline
features_cat = [col for col in x_train.columns if x_train[col].dtype in [object]]

# Columns with a float64 or signed-integer dtype (int8/16/32/64)
cont_features = [col for col in x_train.columns if x_train[col].dtype in [np.float64, np.int64, np.int32, np.int16, np.int8]]

features_cat, cont_features

In [None]:
def pipeline_fit_preprocessor(x):
    """Build the preprocessing pipeline, fit it on ``x`` and return the transformed data.

    The pipeline one-hot encodes the columns listed in the module-level ``features_cat``
    (categories unseen at fit time are ignored at transform time), passes every other
    column through unchanged, then applies ``StandardScaler(with_mean=False)``.

    Parameters
    ----------
    x : training features containing every column named in ``features_cat``.

    Returns
    -------
    tuple
        ``(transformed_data, pipe)`` - the transformed training data and the fitted pipeline.
    """
    print("Creating One-hot Transformer")
    one_hot = OneHotEncoder(handle_unknown="ignore")
    cat_features_encoded = ColumnTransformer(
        transformers=[("ohe", one_hot, features_cat)],
        remainder="passthrough",
    )

    print('Setting up processing pipeline')
    steps = [
        ('encoder', cat_features_encoded),
        ('scl', StandardScaler(with_mean=False)),
    ]
    pipe = Pipeline(steps)

    print('Fitting pipeline on train data')
    return pipe.fit_transform(x), pipe


def pipeline_transform_preprocessor(x, pipe):
    """Apply an already-fitted preprocessing pipeline to ``x``.

    Parameters
    ----------
    x : features to transform.
    pipe : object exposing ``transform(x)`` (the fitted pipeline).

    Returns
    -------
    Whatever ``pipe.transform(x)`` returns.
    """
    print('Transforming test data')
    return pipe.transform(x)


In [None]:
processed_train, pipe = pipeline_fit_preprocessor(x_train)
processed_test = pipeline_transform_preprocessor(x_test, pipe)

In [None]:
processed_train.shape

# Modeling

### Plan:

Model Plans:

* Basic Log Reg
* KNN
* Decision Tree
* Naive Bayes
* Random Forest
* XGBoost
* CatBoost

Feature Selection Plans:

* Basic Correlation
* RFECV
* Permutation Importance

Add Pipelines

In [None]:
# prepare dictionary to store results
models = {}
models['Models'] = []
models['f1'] = []
models['accuracy'] = []

In [None]:
def make_model(model, train, test, y_train, y_test, title):
    """Fit a classifier, report its scores, plot diagnostics and record the results.

    Parameters
    ----------
    model : estimator following the scikit-learn fit/predict API.
    train, test : feature matrices for fitting and evaluation.
    y_train, y_test : binary labels matching ``train`` / ``test``.
    title : str
        Name under which the scores are stored in the module-level ``models`` dict.

    Side effects
    ------------
    Fits ``model`` in place, prints the 5-fold CV accuracy plus test accuracy / F1 / AUC,
    shows a confusion matrix and a ROC curve, and appends to ``models['Models']``,
    ``models['f1']`` and ``models['accuracy']``.
    """
    # fit model to train data
    model.fit(train, y_train)

    # get accuracy cross val score for cv 5
    cv_5 = cross_val_score(model, train, y_train, scoring='accuracy', cv=5)
    accs = round(cv_5.mean()*100, 2)
    print("CV 5 Accuracy Train Score: {}".format(accs))

    # make predictions on test data
    preds = model.predict(test)

    # test-set scores (computed once, then both printed and stored)
    accuracy = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print('Testing Accuracy: ', accuracy)
    print('f1 Score: ', f1)

    # Visualize Confusion Matrix
    plot_confusion_matrix(model, test, y_test,
                     cmap=plt.cm.Blues)
    plt.show()

    # A ROC curve needs a continuous score per sample. Hard 0/1 predictions give a
    # single operating point, so prefer probabilities or decision values when the
    # model offers them and fall back to the predictions only as a last resort.
    if hasattr(model, 'predict_proba'):
        # second column = probability of the larger class label (1 for 0/1 targets)
        scores = model.predict_proba(test)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(test)
    else:
        scores = preds

    # Get false positive, true positive, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    print('AUC: {}'.format(roc_auc))

    # Visualize AUC
    sns.set_style('darkgrid', {'axes.facecolor': '0.9'})
    plt.figure(figsize=(15,10))
    plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Baseline')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.yticks([i/20.0 for i in range(21)])
    plt.xticks([i/20.0 for i in range(21)])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()

    # append our results to our lists
    models['Models'].append(title)
    models['f1'].append(f1)
    models['accuracy'].append(accuracy)

In [None]:
model = LogisticRegression(random_state=randomstate, fit_intercept=False, C=1e12)

make_model(model, processed_train, processed_test, y_train, y_test, "Log Reg Unbalanced")

In [None]:
model = LogisticRegression(class_weight='balanced', random_state=randomstate, fit_intercept=False, C=1e12)

make_model(model, processed_train, processed_test, y_train, y_test, "Log Reg")

In [None]:
models

In [None]:
'''#Parameter Tuning

param_grid = {'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [10, 50, 100, 1000],
        'max_depth': [5,10,25,50],
        'min_samples_split': [10, 50, 100, 1000],
        }

clf = DecisionTreeClassifier()

grid_search = GridSearchCV(clf, param_grid, verbose=10, scoring='f1', cv=5, n_jobs=-1)

grid_search.fit(processed_train, y_train)

grid_search.best_estimator_


# Best accuracy
print('Best f1: %.3f' % grid_search.best_score_)

print("Best parameters set found on train set: \n")
print(grid_search.best_params_)
print("\nGrid scores on train set:\n")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))'''

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best f1: 0.207
Best parameters set found on train set: 

{'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 10}


In [None]:
# Hyper-parameters are the best set reported by the grid search above.
# random_state makes the tree reproducible between runs.
model = DecisionTreeClassifier(criterion = 'gini', max_depth= 50, min_samples_leaf= 10, min_samples_split= 10,
                               random_state=randomstate)

make_model(model, processed_train, processed_test, y_train, y_test, "Decision Tree Classifier")

### KNN

In [None]:
### KNN CLASSIFIER: choose k by accuracy, then evaluate the chosen model

acc_by_k = []  # accuracy for k = 1..20, in order

# NOTE(review): k is selected on the test split, so the score of the chosen model is
# optimistic; a validation split or cross-validation on the training data would avoid that.
for K in range(1, 21):

    # set up the KNN classifier
    model = KNeighborsClassifier(n_neighbors = K)

    model.fit(processed_train, y_train)   # fit the model
    pred = model.predict(processed_test)  # make prediction on test set
    score = accuracy_score(y_test, pred)  # share of correct predictions
    acc_by_k.append(score)
    print('Accuracy for k= ' , K , 'is:', score)

# accuracy is "higher is better", so take the arg-MAX; +1 because k starts at 1
best_k = int(np.argmax(acc_by_k)) + 1

# makes model and fits using optimal k
model = KNeighborsClassifier(n_neighbors = best_k)

make_model(model, processed_train, processed_test, y_train, y_test, 'KNN')

### Decision Tree

### Random Forest

### XGBoost

## Estimator Helper Class

This code is by David S. Batista, from http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

In [None]:
class EstimatorSelectionHelper:
    """Run a GridSearchCV per estimator and summarise all results in one table.

    Parameters
    ----------
    models : dict
        Maps a display name to an (unfitted) estimator.
    params : dict
        Maps the same names to the parameter grid for that estimator.

    Raises
    ------
    ValueError
        If any key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> fitted GridSearchCV, filled by fit()
        self.grid_searches = {}

    def fit(self, X, y, cv=5, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Fit one GridSearchCV per registered estimator and keep it in ``grid_searches``."""
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter set) with min/mean/max/std test score.

        Parameters
        ----------
        sort_by : str
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns 'estimator', 'min_score', 'mean_score', 'max_score', 'std_score',
            followed by one column per tuned parameter.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            search = self.grid_searches[k]
            params = search.cv_results_['params']
            scores = []
            # n_splits_ is the number of folds actually used; unlike the `cv` argument
            # it is always an int, even when cv was None or a splitter object.
            for i in range(search.n_splits_):
                key = "split{}_test_score".format(i)
                r = search.cv_results_[key]
                scores.append(r.reshape(len(params),1))

            # shape (n_parameter_sets, n_splits): one row of fold scores per parameter set
            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
# Candidate estimators, keyed by the display name used in the score summary
models1 = {
    'Logistic Regression': LogisticRegression(),
    #'KNN' : KNeighborsClassifier(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Naive Bayes Classifier' : GaussianNB(),
    #'Random Forest Classifier': RandomForestClassifier(),
    #'Extra Trees Classifier': ExtraTreesClassifier(),
    #'AdaBoost Classifier': AdaBoostClassifier(),
    #'Gradient Boosting Classifier': GradientBoostingClassifier(),
    #'SVC': SVC(),
    #'XGBoost' : xgb.XGBClassifier()
}

# Parameter grids; every key of models1 must also appear here (EstimatorSelectionHelper checks this).
# NOTE: several commented-out keys below are spelled differently from their models1
# counterparts (e.g. 'RandomForestClassifier' vs 'Random Forest Classifier') and
# would have to be renamed to match before being enabled.
params1 = {
    
    'Logistic Regression': {'solver': ['liblinear', 'lbfgs']},
    #'KNN' : {'n_neighbors' : [1,5,10]},
    'Decision Tree Classifier' : {'criterion' : ['gini', 'entropy'], 'max_depth':[1,3,5,7]},
    'Naive Bayes Classifier' : {},
    #'RandomForestClassifier': { 'max_depth':[1,3,5,7] },    
    #'ExtraTreesClassifier': { 'max_depth':[1,3,5,7] },
    #'AdaBoostClassifier':  { 'max_depth':[1,3,5,7] },
    #'GradientBoostingClassifier': { 'max_depth':[1,3,5,7], 'learning_rate': [0.8, 1.0] },
    #'SVC': {'kernel': ['rbf', 'linear'], 'C': [1, 10], 'gamma': [0.001]},
    #'XGBoost' : {'max_depth':[1,3,5,7]},    
    
}

In [None]:
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(processed_train, y_train, scoring='accuracy', n_jobs=-1)

In [None]:
helper1.score_summary(sort_by='max_score')

In [None]:
# prepare dictionary to store results
# NOTE(review): this rebinds the same `models` name that make_model appends to, replacing
# its 'f1'/'accuracy' lists with regression keys; calling make_model after this cell
# would fail with a KeyError on models['f1'].
models = {}
models['Models'] = []
models['r2'] = []
models['mae'] = []
models['rmse'] = []

In [None]:
def make_model_log(model, train, test, y_train, y_test, title):
    """Fit a model on a log-transformed target and report errors on the original scale.

    Parameters
    ----------
    model : estimator following the scikit-learn fit/predict API.
    train, test : feature matrices for fitting and evaluation.
    y_train, y_test : targets on the log scale (both predictions and ``y_test`` are
        passed through ``np.exp`` before errors are computed).
    title : str
        Name under which the scores are stored in the module-level ``models`` dict.

    Side effects
    ------------
    Fits ``model`` in place, shows a residual scatter plot, prints the scores and appends
    to ``models['Models']``, ``models['r2']``, ``models['mae']`` and ``models['rmse']``.
    """
    # These two metrics are not part of the notebook's import cell, so bring them
    # into scope here; without this the function fails with a NameError.
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    model.fit(train, y_train)
    # mean 5-fold CV score with the estimator's default scorer, as a percentage
    cv_5 = cross_val_score(model, train, y_train, cv=5)
    r2 = round(cv_5.mean()*100,2)

    test_predictions = model.predict(test)

    # reverse log transform our predicted values (truncated to whole numbers)
    test_predictions_unscaled = np.exp(test_predictions).astype(int)
    test_predictions_unscaled = test_predictions_unscaled.flatten()

    test_actual = np.exp(y_test)

    # get residuals
    residuals = test_actual - test_predictions_unscaled

    fig = plt.figure(figsize=(20,15))
    plt.scatter(test_predictions_unscaled, residuals)
    plt.show()

    # Calculate our errors
    mae = round(mean_absolute_error(test_actual, test_predictions_unscaled), 2)
    rmse = round(np.sqrt(mean_squared_error(test_actual, test_predictions_unscaled)), 2)

    # append our results to our lists
    models['Models'].append(title)
    models['r2'].append(r2)
    models['mae'].append(mae)
    models['rmse'].append(rmse)

    print("R2: ", r2, "\nMAE: ", mae, "\nRMSE: ", rmse, "\n{} predictors used for this model".format(train.shape[1]))

In [None]:
print(classification_report(y_test, y_hat_test))

In [None]:
# NOTE(review): `fpr` and `tpr` are not assigned by any earlier cell in this section;
# they are only produced further down by the SMOTE logistic-regression cell, so this
# cell fails on a fresh top-to-bottom run.
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10, 8))
lw = 2
# model ROC curve
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
# diagonal reference line (random guessing)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# ticks every 0.05 on both axes
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# confusion_matrix and plot_confusion_matrix are already imported in the notebook's
# first cell, so the repeated imports were removed from here.
# NOTE(review): `y_hat_test` and `logreg` are only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run.

# raw counts: rows = actual class, columns = predicted class
matrix = confusion_matrix(y_test, y_hat_test)

# Visualize your confusion matrix
plot_confusion_matrix(logreg, x_test, y_test,
                     cmap=plt.cm.Blues)
plt.show()


In [None]:
# Fit SMOTE to training data.
# SMOTE interpolates between numeric feature vectors, so it has to be given the
# encoded/scaled matrix (processed_train); the raw x_train still holds string columns.
# random_state makes the synthetic samples reproducible.
smote = SMOTE(random_state=randomstate)
X_train_resampled, y_train_resampled = smote.fit_resample(processed_train, y_train)

# Preview synthetic sample class distribution
print('\n')
print(pd.Series(y_train_resampled).value_counts())

In [None]:
# Instantiate the model
logreg = LogisticRegression(class_weight='balanced', fit_intercept=False, C=1e12, solver='liblinear')

# Fit the model
logreg.fit(X_train_resampled, y_train_resampled)

# Predict on the pipeline-transformed test split: the raw x_test still contains
# string columns, which the fitted model cannot score.
y_hat_test = logreg.predict(processed_test)

# continuous decision values, used as the ranking score for the ROC curve
y_score = logreg.decision_function(processed_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)

print('AUC: {}'.format( auc(fpr, tpr)))

In [None]:
# ROC curve for the current fpr / tpr, drawn with seaborn's darkgrid style
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))

lw = 2
tick_positions = [i/20.0 for i in range(21)]  # every 0.05 from 0 to 1

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
# diagonal reference line (random guessing)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_yticks(tick_positions)
ax.set_xticks(tick_positions)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
print(classification_report(y_test, y_hat_test))

# Above and Beyond Plan

* Unit testing
* stacked ensemble model 
* 

## PCA

In [None]:
# Full PCA: as many components as there are processed features.
# NOTE(review): if processed_train is a scipy sparse matrix, check that the installed
# scikit-learn's PCA accepts sparse input - confirm before relying on this cell.
pca_all = PCA(n_components=processed_train.shape[1], random_state=randomstate)
x_pca = pca_all.fit_transform(processed_train)

# per-component share of variance in percent, and its running total
variance_pct = pca_all.explained_variance_ratio_ * 100
running_pct = np.cumsum(variance_pct)

print("Explained variance with all components is ", sum(variance_pct))
print("1 component explains ", running_pct[0])
print("1-2 principal component explains ", running_pct[1])
print("1-3 components explains ", running_pct[2])
print("1-4 components explains ", running_pct[3])

# cumulative explained-variance curve (as a fraction, not percent)
plt.plot(np.cumsum(pca_all.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.ylabel("Explained Variance")
#plt.savefig('images/pca.png')

In [None]:
# Reduce the processed training data to its first two principal components
pca_2 = PCA(random_state=randomstate, n_components=2)
x_pca_2 = pca_2.fit_transform(processed_train)

# Fraction of total variance carried by each of the two components
variance_per_component = pca_2.explained_variance_ratio_
print(f'Explained variation per principal component: {variance_per_component}')

In [None]:
# Percentage of total variance captured by the two plotted components.
# Taken from the fitted PCA so the title always matches the data; the value
# was previously hard-coded as 23%.
variance_pct_2d = pca_2.explained_variance_ratio_.sum() * 100

plt.figure(figsize=(10,10))
# One point per training row, coloured by its y_train value
sns.scatterplot(x=x_pca_2[:,0], y=x_pca_2[:,1], s=70, hue=y_train, palette=['green', 'blue'])
plt.title(f"2D Plot of {variance_pct_2d:.0f}% Variability")
plt.xlabel("First Component")
plt.ylabel("Second Component")

In [None]:
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share of the variance (95% here).
pca_95th = PCA(random_state=randomstate, n_components=0.95)
x_pca_95 = pca_95th.fit_transform(processed_train)

# Shape of the reduced matrix: (rows, retained components)
x_pca_95.shape

## Multicollinearity

In [None]:
# Heatmap of the pairwise correlations in `final`, used to look for
# multicollinear features. The colour scale is fixed to [-1, 1], centred on 0.
correlation_matrix = final.corr()

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('PEARSON CORRELATION MATRIX', fontsize=18)

plt.show()

In [None]:
# Display the correlation matrix of `final` as a table
# ('pearson' is the default method, spelled out here for clarity)
final.corr(method='pearson')

In [None]:
# Build a table of feature pairs ranked by absolute correlation.

# Absolute correlation matrix of the features in `final`
corr_matrix = final.corr().abs()

# Keep only the strict upper triangle of the matrix. This removes self-pairs
# (the diagonal) by position instead of testing correlation == 1.0 on floats,
# and lists each unordered pair once rather than as both (A, B) and (B, A).
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Stack the row/column pairs into long form, drop the masked (NaN) cells,
# and sort. Column 0 is the value column that stacking generates.
df_correlations = (
    corr_matrix.where(upper_triangle)
    .stack()
    .dropna()
    .reset_index()
    .sort_values(0, ascending=False)
)

# Combine the two variable-name columns into a single "pairs" tuple column
df_correlations['pairs'] = list(zip(df_correlations.level_0, df_correlations.level_1))

# Index by pair and give the value column a readable name
df_correlations = (
    df_correlations
    .set_index('pairs')
    .rename(columns={0: "correlation"})
)

# View pairs above 75% and below 95% correlation
# (engineered features will correlate with each other above 95%)
df_correlations[(df_correlations.correlation > .75) & (df_correlations.correlation < .95)]


In [None]:
# Rank every feature by its absolute correlation with the DEP_DEL15 column
df_correlations = (
    final.corr()
    .abs()
    .stack()
    .reset_index()
    .sort_values(0, ascending=False)
)
target_rows = df_correlations['level_0'] == 'DEP_DEL15'
df_correlations.loc[target_rows].sort_values(0, ascending=False)

## DEPRECATED

In [None]:
# Remove, in place, every row that has no DEP_TIME, has no TAIL_NUM,
# or has CANCELLED equal to 1.
unusable_rows = (
    df['DEP_TIME'].isna()
    | df['TAIL_NUM'].isna()
    | (df['CANCELLED'] == 1)
)
df.drop(index=df.index[unusable_rows], inplace=True)

In [None]:
# Collapse the DEP_TIME_BLK labels into six coarser DEP_BLOCK categories.
# Rows whose DEP_TIME_BLK matches none of the listed labels are not assigned.
dep_block_groups = {
    'LATE_NIGHT': ['2100-2159', '2200-2259', '2300-2359'],
    'EARLY_MORNING': ['0001-0559'],
    'MORNING': ['0600-0659', '0700-0759', '0800-0859', '0900-0959'],
    'MIDDAY': ['1000-1059', '1100-1159', '1200-1259'],
    'AFTERNOON': ['1300-1359', '1400-1459', '1500-1559', '1600-1659'],
    'EVENING': ['1700-1759', '1800-1859', '1900-1959', '2000-2059'],
}

for block_name, time_blocks in dep_block_groups.items():
    df.loc[df['DEP_TIME_BLK'].isin(time_blocks), 'DEP_BLOCK'] = block_name


In [None]:
# Display the frame (e.g. to check the DEP_BLOCK column assigned in the preceding cell)
df

In [None]:
# Columns removed from the frame before modelling, grouped loosely by theme
columns_to_drop = [
    # date / airport identifiers
    'MONTH', 'ORIGIN', 'DEST',
    # scheduled and actual times, delay minutes
    'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME',
    # cancellation info, scheduled duration, distance
    'CANCELLED', 'CANCELLATION_CODE', 'CRS_ELAPSED_TIME', 'DISTANCE',
    # delay-cause breakdown
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
    # arrival delay, stray unnamed column, time blocks, actual duration
    'ARR_DELAY_NEW', 'Unnamed: 32', 'DEP_TIME_BLK', 'ARR_TIME_BLK', 'ACTUAL_ELAPSED_TIME',
    # destination details
    'DEST_AIRPORT_ID', 'DEST_CITY_NAME',
]

df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Total memory footprint of the frame in bytes (sum of the per-column usage)
bytes_per_column = df.memory_usage()
bytes_per_column.sum()

In [None]:
# Show the dtype of every column in df
df.dtypes

In [None]:
# Target dtype for each column: small integers are stored as int8, while
# identifier-like columns are cast to object.
column_dtypes = {
    'DAY_OF_MONTH': 'int8',
    'DAY_OF_WEEK': 'object',
    'OP_CARRIER_FL_NUM': 'object',
    #'ORIGIN_AIRPORT_ID': 'object',
    #'DEST_AIRPORT_ID': 'object',
    'DEP_DEL15': 'int8',
    'DISTANCE_GROUP': 'int8',
}

for column_name, dtype_name in column_dtypes.items():
    df[column_name] = df[column_name].astype(dtype_name)