In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import classification_report
from joblib import dump, load
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
data_path = 'C:/Users/Jesse/Desktop/midterm_data/data/'
flights_df = pd.read_csv(data_path + 'flights.csv')

# columns we are allowed to use
cols = ['fl_date', 
        'mkt_unique_carrier', 
        'branded_code_share', 
        'mkt_carrier', 
        'mkt_carrier_fl_num', 
        'op_unique_carrier', 
       'tail_num', 
       'op_carrier_fl_num', 
       'origin_airport_id', 
       'origin', 
       'origin_city_name', 
       'dest_airport_id', 
       'dest', 
       'dest_city_name', 
       'crs_dep_time', 
       'crs_arr_time', 
       'dup', 
       'crs_elapsed_time', 
       'flights', 
       'distance', 
       'cancelled']

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# n=10000
# flights = flights_df.loc[flights_df['arr_delay'].notna(), cols].sample(n=n)
flights = flights_df.loc[flights_df['arr_delay'].notna(), cols]
flights.shape

(15615741, 21)

In [4]:
cancelled = flights_df.loc[flights_df['cancelled'] == 1]
notcancelled = flights_df.loc[flights_df['cancelled'] == 0]

cancelled_sampled = cancelled.sample(n=50000)
notcancelled = notcancelled.sample(n=100000)

In [5]:
df = pd.concat([cancelled_sampled, notcancelled])
df = df[cols]
df.shape

(150000, 21)

In [6]:
def preprocess_df(df):
    df.drop(['tail_num',
            'dest_airport_id',
            'dup',
            'flights'], axis=1, inplace=True)
    
    df['fl_date'] = pd.to_datetime(df['fl_date'], format='%Y-%m-%d')
    df['month'] = df['fl_date'].dt.month
    df['day'] = df['fl_date'].dt.day
    df['weekday'] = df['fl_date'].dt.dayofweek
    
    # Drop fl_date, not needed anymore
    df.drop('fl_date', axis=1, inplace=True)
    
    # Get orgin/dest statename
    df['origin_state'] = df['origin_city_name'].apply(lambda x: x.split(', ')[1])
    df['dest_state'] = df['dest_city_name'].apply(lambda x: x.split(', ')[1])
    
    df.drop(['origin_city_name', 'dest_city_name'], axis=1, inplace=True)
    
    
    df['origin'] = df['origin'].astype('category')
    df['dest'] = df['dest'].astype('category')
    
    # Define new function to convert crs time
    def convert_time_to_string(x):
        if np.isnan(x):
            return '00:00'

        if x == 2400:
            return '23:59'

        string = str(int(x))

        if len(string) < 4:
            for _ in range(4-len(string)):
                string = '0' + string

        return string[:2] + ':' + string[2:]
    
    df['crs_dep_time'] = df['crs_dep_time'].apply(lambda x: convert_time_to_string(x))
    df['crs_arr_time'] = df['crs_arr_time'].apply(lambda x: convert_time_to_string(x))
    
    # Convert to datetimes
    df['crs_dep_time'] = pd.to_datetime(df['crs_dep_time'], format='%H:%M')  
    df['crs_arr_time'] = pd.to_datetime(df['crs_arr_time'], format='%H:%M')
    
    # Create hr column, minute col
    df['dep_hr'] = df['crs_dep_time'].dt.hour
    df['dep_min'] = df['crs_dep_time'].dt.minute   
    df['arr_hr'] = df['crs_arr_time'].dt.hour
    df['arr_min'] = df['crs_arr_time'].dt.minute                                                
    
    # Drop crs
    df.drop(['mkt_unique_carrier', 'branded_code_share', 'origin_state', 'dest_state', 'op_unique_carrier'], axis=1, inplace=True)
    
    return df
    
    

In [7]:
df = preprocess_df(df)

In [8]:
df.dtypes

mkt_carrier                   object
mkt_carrier_fl_num             int64
op_carrier_fl_num              int64
origin_airport_id              int64
origin                      category
dest                        category
crs_dep_time          datetime64[ns]
crs_arr_time          datetime64[ns]
crs_elapsed_time             float64
distance                       int64
cancelled                      int64
month                          int64
day                            int64
weekday                        int64
dep_hr                         int64
dep_min                        int64
arr_hr                         int64
arr_min                        int64
dtype: object

In [9]:
# Using historical data
flights_df['fl_date'] = pd.to_datetime(flights_df['fl_date'], format='%Y-%m-%d')
flights_df['month'] = flights_df['fl_date'].dt.month

history = dict(flights_df.groupby(['origin', 'mkt_carrier', 'month'])['cancelled'].mean())

# Create a new column
def get_cancelled_rate(row):
    if (not pd.isnull(row['origin'])) and (not pd.isnull(row['mkt_carrier'])) and (not pd.isnull(row['month'])):
        cancel_rate = history[(row['origin'], row['mkt_carrier'], row['month'])]
        return cancel_rate
    else:
        return 0

df['cancel_rate'] = df.apply(lambda row: get_cancelled_rate(row), axis=1)

In [None]:
df.drop(['crs_dep_time', 'crs_arr_time'], axis=1, inplace=True)


In [46]:
df.dropna(inplace=True)

In [None]:
df.to_csv('C:/Users/Jesse/Desktop/midterm_data/data/binary_df.csv', index=False)

In [47]:
X = df.drop('cancelled', axis=1)
y = df['cancelled']

In [48]:
# Label encode origin and destination
airports_le = LabelEncoder()
X['origin'] = airports_le.fit_transform(X['origin'])
X['dest'] = airports_le.transform(X['dest'])

# label encode mkt_carrier
mkt_carrier_le = LabelEncoder()
X['mkt_carrier'] = mkt_carrier_le.fit_transform(X['mkt_carrier'])

In [49]:
X.isna().sum()

mkt_carrier           0
mkt_carrier_fl_num    0
op_carrier_fl_num     0
origin_airport_id     0
origin                0
dest                  0
crs_elapsed_time      0
distance              0
month                 0
day                   0
weekday               0
dep_hr                0
dep_min               0
arr_hr                0
arr_min               0
cancel_rate           0
dtype: int64

In [55]:
X.head(30)

Unnamed: 0,mkt_carrier,mkt_carrier_fl_num,op_carrier_fl_num,origin_airport_id,origin,dest,crs_elapsed_time,distance,month,day,weekday,dep_hr,dep_min,arr_hr,arr_min,cancel_rate
9224999,0,1676,1676,14492,296,195,334.0,2239,5,26,6,19,0,21,34,0.034725
9870947,8,4710,4710,11292,94,293,74.0,300,4,24,2,18,45,19,59,0.017959
8712055,0,4839,4839,10431,26,74,64.0,91,5,3,4,12,36,13,40,0.030181
10832251,0,2878,2878,14100,270,45,143.0,675,7,7,6,18,10,19,33,0.060779
14507128,0,4813,4813,14524,299,270,73.0,198,6,20,3,19,6,20,19,0.043344
14103966,0,1274,1274,11298,95,184,228.0,1391,6,3,0,9,6,13,54,0.046293
8312775,0,4052,4052,10721,47,184,87.0,187,7,17,1,14,33,16,0,0.050322
6427264,8,3485,3485,11618,119,93,84.0,199,2,17,5,20,36,22,0,0.028059
3478971,0,856,856,12478,184,47,74.0,187,12,2,0,23,10,0,24,0.009709
1707363,0,3625,3625,13930,256,76,54.0,135,11,11,0,11,55,12,49,0.031983


In [50]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)

In [51]:
xgb_clf = XGBClassifier()
rf_clf = RandomForestClassifier()


xgb_clf.fit(xtrain, ytrain)
rf_clf.fit(xtrain, ytrain)


print('xgb Accuracy: ', xgb_clf.score(xtest, ytest))
print('rf_clf Accuracy: ', rf_clf.score(xtest, ytest))

xgb Accuracy:  0.7327777777777778
rf_clf Accuracy:  0.7556888888888889


In [54]:
ypreds_xg = xgb_clf.predict(xtest)
ypreds_rf = rf_clf.predict(xtest)

In [56]:
print('XG BOOST CLASSIFIER')

print(classification_report(ytest, ypreds_xg))

XG BOOST CLASSIFIER
              precision    recall  f1-score   support

           0       0.76      0.87      0.81     30098
           1       0.64      0.45      0.53     14902

    accuracy                           0.73     45000
   macro avg       0.70      0.66      0.67     45000
weighted avg       0.72      0.73      0.72     45000



In [58]:
print('RANDOM FOREST CLASSIFIER')

print(classification_report(ytest, ypreds_rf))

RANDOM FOREST CLASSIFIER
              precision    recall  f1-score   support

           0       0.78      0.88      0.83     30098
           1       0.67      0.51      0.58     14902

    accuracy                           0.76     45000
   macro avg       0.73      0.69      0.70     45000
weighted avg       0.75      0.76      0.75     45000



In [62]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf_random = RandomizedSearchCV(estimator = rf_clf, 
                               param_distributions = random_grid, 
                               n_iter = 10, cv = 2, verbose=2, 
                               random_state=42, n_jobs = -1)

rf_random.fit(X,y)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed: 10.5min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 14.2min finished


RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [63]:
print(rf_random.best_score_)
print(rf_random.best_estimator_)

0.7570567607568102
RandomForestClassifier(bootstrap=False, max_depth=90, max_features='sqrt',
                       min_samples_leaf=4, min_samples_split=10,
                       n_estimators=600)


In [64]:
param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'subsample': [0.7, 0.8, 0.9]
}

rs = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=param_grid, cv=2, verbose=True, n_iter=4)

rs.fit(X, y)

print(rs.best_score_)
print(rs.best_estimator_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 12.1min finished


-0.9998596576260552
XGBRegressor(colsample_bytree=0.8, max_depth=15, n_estimators=700,
             reg_alpha=1.2, reg_lambda=1.1, subsample=0.8)
