# Airline on-time regressor

In [82]:
import numpy
import pandas
import math
import os
import random
import time


Read files into data frame.

In [83]:
path = os.getcwd()
files = os.listdir(path)
print files

['airports.dat', '459805012_T_ONTIME.csv', 'airport_delay.ipynb', '.ipynb_checkpoints', 'airport_delay-Copy1.ipynb']


In [84]:
data_file = files[1]
airports = files[0]
data = pandas.read_csv(data_file)


Inspect the data frame. I chose to download features that can be easily inferred from a ticket to choose for prediction. The others are for possible responces or to help clean and select the data for training.  

In [85]:
print 'original data length:',len(data)
data.head(3)

original data length: 469968


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,AIRLINE_ID,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE,CARRIER_DELAY,Unnamed: 16
0,1,1,4,AA,19805,1,JFK,LAX,900,1230,7,0,0,0,2475,,
1,1,2,5,AA,19805,1,JFK,LAX,900,1230,-19,0,0,0,2475,,
2,1,3,6,AA,19805,1,JFK,LAX,900,1230,-39,0,0,0,2475,,


Remove records of cancelled or diverted flights

In [86]:
data = data[data['CANCELLED']==0]
data = data[data['DIVERTED']==0]
print 'data length without canceled and diverted flights:',len(data)

data length without canceled and diverted flights: 457013


Drop column 16 and now constant "CANCELLED", and "DIVERTED"

In [87]:
cols_to_drop = data.columns[[12,13,16]]
data.drop(cols_to_drop,axis = 1,inplace = True)
data.head(2)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,AIRLINE_ID,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,ARR_DEL15,DISTANCE,CARRIER_DELAY
0,1,1,4,AA,19805,1,JFK,LAX,900,1230,7,0,2475,
1,1,2,5,AA,19805,1,JFK,LAX,900,1230,-19,0,2475,


Split the data into train, validation, and test set

In [88]:
n = len(data)
n1 = len(data)*6/10
n2 = len(data)*8/10
s= random.sample(xrange(n), n)
train_ind = s[0:n1]
validation_ind = s[n1:n2]
test_ind = s[n2:]

train = data.iloc[train_ind]
validate = data.iloc[validation_ind]
test = data.iloc[test_ind]


Try to convert airport codes into numeric long/lat coordinats using downloaded table airports.dat from openflights.com: https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat.

In [89]:
airport_codes = pandas.read_csv(airports,header = None)
airport_codes.head(3)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,Goroka,Goroka,Papua New Guinea,GKA,AYGA,-6.081689,145.391881,5282,10,U,Pacific/Port_Moresby
1,2,Madang,Madang,Papua New Guinea,MAG,AYMD,-5.207083,145.7887,20,10,U,Pacific/Port_Moresby
2,3,Mount Hagen,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826789,144.295861,5388,10,U,Pacific/Port_Moresby


See how many values categorical variables contain 

In [90]:
print len(data.groupby('AIRLINE_ID'))
print len(data.groupby('UNIQUE_CARRIER'))
print len(data.groupby(['AIRLINE_ID','UNIQUE_CARRIER']))
print len(data.groupby('FL_NUM'))
print data.groupby(['AIRLINE_ID','UNIQUE_CARRIER']).groups.keys()

14
14
14
6334
[(21171L, 'VX'), (20409L, 'B6'), (19393L, 'WN'), (19805L, 'AA'), (20366L, 'EV'), (20304L, 'OO'), (20436L, 'F9'), (19977L, 'UA'), (20398L, 'MQ'), (19690L, 'HA'), (20416L, 'NK'), (19790L, 'DL'), (20355L, 'US'), (19930L, 'AS')]


AIRLINE_ID and UNIQUE_CARRIER  are one to one and onto, so will use just one category UNIQUE_CARRIER. The FL_NUM has too many categories and seem to be redundant, as route and time should specify it uniquely. Month, and day may use as numeric, time as well. With those FL_DATE seem to be redundant.

In [91]:
carriers = data.groupby('UNIQUE_CARRIER').groups.keys()

Function for processing that will transform UNIQUE_CARRIER into numeric columns, remove rest of categorical columns
and substitute ORIGIN and DEST codes with their coordinats.

In [92]:
def process(df):
    global airport_codes
    global carriers
    
    for c in carriers:
        df.loc[:,c] = (df['UNIQUE_CARRIER']==c)+0
        
    df = df.merge(airport_codes[[4,6,7]],left_on = 'ORIGIN',
                             right_on = 4,how = 'left')
    if len(df[df['ORIGIN'] != df[4]])>0 :
        print 'Unidentified ORIGIN airport'
    df.rename(columns={6: 'Origin_lat', 7: 'Origin_long'}, inplace=True)
    df.drop([4],axis = 1,inplace = True)
    
    df = df.merge(airport_codes[[4,6,7]],left_on = 'DEST',
                             right_on = 4,how = 'left')
    if len(df[df['DEST'] != df[4]])>0 :
        print 'Unidentified DEST airport'
    df.rename(columns={6: 'Dest_lat', 7: 'Dest_long'}, inplace=True)
    df.drop([4],axis = 1,inplace = True)
    cols_to_drop = df.columns[[3,4,5,6,7,13]]
    df.drop(cols_to_drop,axis = 1,inplace = True)
    
    cols = list(df.columns)
    cols.remove('ARR_DELAY')
    cols.remove('ARR_DEL15')
    cols = cols+['ARR_DELAY','ARR_DEL15']
    df = df[cols]
        
    return df
    

Process data sets and split them into features and responce.

In [93]:
train = process(train)
validate = process(validate)
test = process(test)

train_x = train[train.columns[0:24]]
train_y = train[train.columns[24]]
validate_x = validate[validate.columns[0:24]]
validate_y = validate[validate.columns[24]]
test_x = test[test.columns[0:24]]
test_y = test[test.columns[24]]

In [94]:
print 'Columns used for training and prediction:'
print train_x.dtypes

Columns used for training and prediction:
MONTH             int64
DAY_OF_MONTH      int64
DAY_OF_WEEK       int64
CRS_DEP_TIME      int64
CRS_ARR_TIME      int64
DISTANCE        float64
AA                int32
OO                int32
DL                int32
HA                int32
F9                int32
B6                int32
US                int32
AS                int32
NK                int32
MQ                int32
WN                int32
VX                int32
EV                int32
UA                int32
Origin_lat      float64
Origin_long     float64
Dest_lat        float64
Dest_long       float64
dtype: object


Now (after trying some othe models: linear regression, svm,random forest), it seems that GBR, althouth slower, produces better fit. So train using Gradiant Boosting Regressor.

In [95]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor


n_est = [10,50,100,200,500]
learn_r = [0.001,0.01, 0.05, 0.1]
max_d = [2,5,10]



val_results = numpy.zeros((5,4,3))

for i in [0,1,2,3,4]:
    for j in [0,1,2,3]:
        for k in [0,1,2]:
            mod = GradientBoostingRegressor(n_estimators=n_est[i], learning_rate=learn_r[j],
                                max_depth=max_d[k])
            start = time.time()
            mod.fit(train_x, train_y)
            pred_y = mod.predict(validate_x)
            val_results[i,j,k] = mean_squared_error(validate_y, pred_y)
            end = time.time()
            print 'Predicting time gbc model',n_est[i],learn_r[j],max_d[k],': ', end-start, \
            ', MSE = ', val_results[i,j,k] 

min_ind = numpy.where(val_results == val_results.min())
print min_ind, val_results[min_ind]
n_est = n_est[min_ind[0][0]]
learn_r = learn_r[min_ind[1][0]]
max_d = max_d[min_ind[2][0]]
print 'Best parameters: '
print 'n estimators: ',n_est
print 'learnin _rate: ', learn_r
print 'maximum depth: ',max_d

print 'Retraining on training and validation sets combined'
mod = GradientBoostingRegressor(n_estimators=n_est, learning_rate=learn_r,
                                max_depth=max_d)
mod.fit(pandas.concat([train_x,validate_x]),pandas.concat([train_y,validate_y]))
pred_y = mod.predict(test_x)
print 'Test MSE:', mean_squared_error(test_y, pred_y)
print 'Test RSE:', mean_squared_error(test_y, numpy.ones(len(pred_y))*test_y.mean())



Predicting time gbc model 10 0.001 2 :  4.38242912292 , MSE =  1506.20610302
Predicting time gbc model 10 0.001 5 :  13.2448580265 , MSE =  1505.19375481
Predicting time gbc model 10 0.001 10 :  69.1505608559 , MSE =  1502.89388129
Predicting time gbc model 10 0.01 2 :  4.40344285965 , MSE =  1493.43004423
Predicting time gbc model 10 0.01 5 :  13.3378608227 , MSE =  1484.47801468
Predicting time gbc model 10 0.01 10 :  68.0230691433 , MSE =  1463.94866832
Predicting time gbc model 10 0.05 2 :  4.3187110424 , MSE =  1455.19164583
Predicting time gbc model 10 0.05 5 :  13.3671009541 , MSE =  1422.46042039
Predicting time gbc model 10 0.05 10 :  68.9375660419 , MSE =  1357.33668159
Predicting time gbc model 10 0.1 2 :  4.16742801666 , MSE =  1428.32328942
Predicting time gbc model 10 0.1 5 :  14.1260919571 , MSE =  1368.59668091
Predicting time gbc model 10 0.1 10 :  69.7418630123 , MSE =  1296.65122693
Predicting time gbc model 50 0.001 2 :  19.3440830708 , MSE =  1500.27245066
Predicti