In [1]:
import glob, os, string
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline 
import seaborn as sns  
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
# This Function takes as input a custom palette
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

1.0.5


In [2]:
#Set working directory
os.chdir("C:\Python")

In [3]:
dfFlight = pd.read_csv('Flight Delays Data.csv')
print(dfFlight.shape)
dfFlight.head()

(2719418, 14)


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,4,19,5,DL,11433,13303,837,-3.0,0.0,1138,1.0,0.0,0.0
1,2013,4,19,5,DL,14869,12478,1705,0.0,0.0,2336,-8.0,0.0,0.0
2,2013,4,19,5,DL,14057,14869,600,-4.0,0.0,851,-15.0,0.0,0.0
3,2013,4,19,5,DL,15016,11433,1630,28.0,1.0,1903,24.0,1.0,0.0
4,2013,4,19,5,DL,11193,12892,1615,-6.0,0.0,1805,-11.0,0.0,0.0


In [4]:
# Excluding some dataset columns that are possible target leakers like DepDelay, DepDel15, ArrDel15, Cancelled, Year.
dfFlight.drop(['DepDelay', 'DepDel15', 'ArrDel15', 'Cancelled', 'Year'], axis = 1, inplace = True)
print(dfFlight.shape)
dfFlight.head()

(2719418, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,CRSArrTime,ArrDelay
0,4,19,5,DL,11433,13303,837,1138,1.0
1,4,19,5,DL,14869,12478,1705,2336,-8.0
2,4,19,5,DL,14057,14869,600,851,-15.0
3,4,19,5,DL,15016,11433,1630,1903,24.0
4,4,19,5,DL,11193,12892,1615,1805,-11.0


In [5]:
#checks the number of missing values in each column as isnull() 
dfFlight.apply(lambda x: sum(x.isnull()),axis=0) 

Month                  0
DayofMonth             0
DayOfWeek              0
Carrier                0
OriginAirportID        0
DestAirportID          0
CRSDepTime             0
CRSArrTime             0
ArrDelay           29033
dtype: int64

There are about **29,000 rows** missing. This isn't a lot compared to the total flight of over **2.7 million rows** just small percentage of the flights. Therefore, I can just remove them.

In [6]:
# Removing rows with missing values    
dfFlight = dfFlight[~pd.isnull(dfFlight).any(axis=1)]
print(dfFlight.shape)

(2690385, 9)


In [7]:
dfFlight.describe()

Unnamed: 0,Month,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,CRSDepTime,CRSArrTime,ArrDelay
count,2690385.0,2690385.0,2690385.0,2690385.0,2690385.0,2690385.0,2690385.0,2690385.0
mean,6.985231,15.7972,3.901087,12741.99,12742.32,1325.569,1504.45,6.637688
std,1.985857,8.806017,1.987864,1502.799,1502.908,471.3835,493.9494,38.64881
min,4.0,1.0,1.0,10140.0,10140.0,1.0,1.0,-94.0
25%,5.0,8.0,2.0,11292.0,11292.0,920.0,1119.0,-11.0
50%,7.0,16.0,4.0,12892.0,12892.0,1320.0,1526.0,-3.0
75%,9.0,23.0,6.0,14057.0,14057.0,1725.0,1918.0,10.0
max,10.0,31.0,7.0,15376.0,15376.0,2359.0,2359.0,1845.0


## Model prediction
I'll use XGBoost regreessor to predict flight delay time in minutes.

In [18]:
#import model libraries
import scipy.stats as st
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import joblib

ImportError: cannot import name 'mean_absolute_percentage_error' from 'sklearn.metrics' (C:\Users\Trayan\Anaconda3\lib\site-packages\sklearn\metrics\__init__.py)

In [9]:
dfFlight.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,CRSArrTime,ArrDelay
0,4,19,5,DL,11433,13303,837,1138,1.0
1,4,19,5,DL,14869,12478,1705,2336,-8.0
2,4,19,5,DL,14057,14869,600,851,-15.0
3,4,19,5,DL,15016,11433,1630,1903,24.0
4,4,19,5,DL,11193,12892,1615,1805,-11.0


In [10]:
Y = dfFlight.ArrDelay
X = dfFlight.drop(['ArrDelay'], axis=1)

## Applying Ordinal Encoding to Categoricals
We need to convert some features into categorical group to make processing simpler. The columns OriginAirportID and DestAirportID represent categorical features. However, because they are integers, they are initially parsed as continuous numbers. It is also required to encode features like Carrier with a string category since XGBoost (like all of the other machine learning algorithms in Python) requires every feature vector to include only digits. 

In [11]:
CategLs = ['Month', 'DayofMonth', 'DayOfWeek', 'Carrier', 'OriginAirportID', 'DestAirportID', 'CRSDepTime', 'CRSArrTime'] # Categorical features
for fea in X[CategLs]: # Loop through all columns in the dataframe
    X[fea] = pd.Categorical(X[fea]).codes # Convert to categorical features

#Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.35, random_state=123)
    

In [17]:
one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {  
    "n_estimators": st.randint(3, 40),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive
}

xgbreg = XGBRegressor(nthread=-1,objective ='reg:squarederror')


rsCV = RandomizedSearchCV(xgbreg, params, n_jobs=1)  
rsCV.fit(X_train, Y_train)
rsCV.best_params_, rsCV.best_score_

clf = XGBRegressor(**rsCV.best_params_)
clf.fit(X_train, Y_train)

print("MAE: %.4f" % mean_absolute_error(Y_test, clf.predict(X_test)))

MAE: 18.4700


In [25]:
y_Pred = clf.predict(X_test)

In [27]:
df_eval = pd.read_csv('Test-Flights.csv')
print(df_eval.shape)
df_eval.head()

(25, 15)


Unnamed: 0,Row ID,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,1,2013,6,10,1,YV,14107,10800,1415,2,0,1534,0,0,0
1,2,2013,10,14,1,DL,10397,13244,2030,-2,0,2055,0,0,0
2,3,2013,6,19,3,AS,13891,14747,615,-3,0,848,0,0,0
3,4,2013,10,13,7,EV,10693,12953,950,-7,0,1302,0,0,0
4,5,2013,9,13,5,WN,13232,12191,1930,10,0,2155,0,0,0


In [29]:
df_eval.drop(['Row ID', 'DepDelay', 'DepDel15', 'ArrDel15', 'Cancelled', 'Year','ArrDelay'], axis = 1, inplace = True)
print(df_eval.shape)
df_eval.head()

KeyError: "['Row ID' 'DepDelay' 'DepDel15' 'ArrDel15' 'Cancelled' 'Year'] not found in axis"

In [33]:
CategLs1 = ['Month', 'DayofMonth', 'DayOfWeek', 'Carrier', 'OriginAirportID', 'DestAirportID', 'CRSDepTime', 'CRSArrTime'] # Categorical features
for fea in df_eval[CategLs1]: # Loop through all columns in the dataframe
    df_eval[fea] = pd.Categorical(df_eval[fea]).codes # Convert to categorical features

In [34]:
eval_pred = clf.predict(df_eval)

In [35]:
eval_pred

array([ 21.095118  ,   3.6571546 ,   8.9120035 ,   4.7319484 ,
         9.501804  ,   2.4647238 ,  17.167149  ,   7.668087  ,
         6.7583075 , -11.399771  ,  -2.5048888 ,   5.044197  ,
        17.88213   ,   5.659784  ,  -4.8704624 ,  -1.08533   ,
        30.00022   ,   5.0467434 ,  -2.6089723 ,  34.0347    ,
        31.53879   ,  -3.7182899 ,  -1.1765642 ,   0.16893667,
         2.7657006 ], dtype=float32)

In [36]:
df_eval['del_pred'] = eval_pred

In [37]:
df_eval

Unnamed: 0,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,CRSArrTime,del_pred
0,1,7,0,9,16,2,13,15,21.095118
1,5,9,0,3,0,12,17,20,3.657155
2,1,12,2,2,14,15,1,4,8.912004
3,5,8,6,4,1,8,7,12,4.731948
4,4,8,4,8,11,5,16,21,9.501804
5,2,0,0,8,5,9,8,11,2.464724
6,3,10,4,8,10,0,6,8,17.167149
7,1,16,2,4,7,4,0,3,7.668087
8,5,14,2,6,13,6,11,14,6.758307
9,5,18,2,4,9,13,0,2,-11.399771
