In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import forest
import numpy as np
import time
from sklearn.metrics import f1_score

In [25]:
airlineData = pd.read_csv("C:/Users/ntihish/Documents/IUB/HPC/Project/2008.csv.bz2")
pd.set_option('display.max_columns', 500)


In [26]:
airlineData.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [4]:
airlineData['Dest'].nunique()

304

In [20]:
airlineData['ArrTime'].isna().sum()/airlineData['DepTime'].shape[0]

0.0

In [6]:
airlineData['ArrDelay'].quantile(0.25)

-10.0

In [7]:
airlineData.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,N612SW,88.0,90.0,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


In [8]:
airlineData = airlineData.drop(['Origin','Dest','TailNum','UniqueCarrier','FlightNum','Cancelled','Diverted','CancellationCode', 'CarrierDelay','WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],axis = 1)

In [9]:
airlineData.shape

(7009728, 16)

In [10]:
airlineData.dropna().shape[0]/airlineData.shape[0]

0.9779308127219772

In [11]:
airlineData = airlineData.dropna()

In [12]:
airlineData.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut
0,2008,1,3,4,2003.0,1955,2211.0,2225,128.0,150.0,116.0,-14.0,8.0,810,4.0,8.0
1,2008,1,3,4,754.0,735,1002.0,1000,128.0,145.0,113.0,2.0,19.0,810,5.0,10.0
2,2008,1,3,4,628.0,620,804.0,750,96.0,90.0,76.0,14.0,8.0,515,3.0,17.0
3,2008,1,3,4,926.0,930,1054.0,1100,88.0,90.0,78.0,-6.0,-4.0,515,3.0,7.0
4,2008,1,3,4,1829.0,1755,1959.0,1925,90.0,90.0,77.0,34.0,34.0,515,3.0,10.0


In [13]:
airlineData['DepTimeInMins'] = airlineData['DepTime'].apply(lambda x : (int(x/100)*60 + x%100))
airlineData['CRSDepTimeInMins'] = airlineData['CRSDepTime'].apply(lambda x : (int(x/100)*60 + x%100))

In [14]:
airlineData['ArrTimeInMins'] = airlineData['ArrTime'].apply(lambda x : (int(x/100)*60 + x%100))
airlineData['CRSArrTimeInMins'] = airlineData['CRSArrTime'].apply(lambda x : (int(x/100)*60 + x%100))

In [15]:
airlineData['IsDelayed'] = airlineData['ArrDelay'] > 18
airlineData['IsDelayed'] = airlineData['IsDelayed'].map(int)

In [16]:
airlineData['IsDelayed'].value_counts()

0    5541790
1    1313239
Name: IsDelayed, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(airlineData.loc[:, ~airlineData.columns.isin(['IsDelayed'])], airlineData['IsDelayed'], test_size=0.25, random_state=42)

In [18]:
airlineData.to_csv("AirlineReduced",index = False)

In [19]:
pd.read_csv("AirlineReduced").head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,DepTimeInMins,CRSDepTimeInMins,ArrTimeInMins,CRSArrTimeInMins,IsDelayed
0,2008,1,3,4,2003.0,1955,2211.0,2225,128.0,150.0,116.0,-14.0,8.0,810,4.0,8.0,1203.0,1195,1331.0,1345,0
1,2008,1,3,4,754.0,735,1002.0,1000,128.0,145.0,113.0,2.0,19.0,810,5.0,10.0,474.0,455,602.0,600,0
2,2008,1,3,4,628.0,620,804.0,750,96.0,90.0,76.0,14.0,8.0,515,3.0,17.0,388.0,380,484.0,470,0
3,2008,1,3,4,926.0,930,1054.0,1100,88.0,90.0,78.0,-6.0,-4.0,515,3.0,7.0,566.0,570,654.0,660,0
4,2008,1,3,4,1829.0,1755,1959.0,1925,90.0,90.0,77.0,34.0,34.0,515,3.0,10.0,1109.0,1075,1199.0,1165,1


In [86]:
clf = RandomForestClassifier(n_estimators=10, max_depth=2,random_state=0)
clf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [89]:
y_pred = clf.predict(X_train)

In [91]:
roc_auc_score(y_train, y_pred)

1.0

In [102]:
myForest = forest.forest(maxDepth=5,numTrees= 10,verbose = True)
TrainX,TrainY = np.array(X_test),np.array(y_test)
Xtest,yTest = np.array(X_test),np.array(y_test)
start = time.time()
myForest.trainForest(TrainX,TrainY)

finalPredictions = myForest.predict(Xtest)

print("Time elapsed is ", time.time()-start)
print("Accuracy is: " ,sum(finalPredictions==yTest)/len(yTest))

Built tree 1
Built tree 2
Built tree 3
Built tree 4
Built tree 5
Built tree 6
Built tree 7
Built tree 8
Built tree 9
Built tree 10

Predicting....
Time elapsed is  336.0638976097107
Accuracy is:  0.9934844867063922


In [104]:
f1_score(yTest,finalPredictions)

0.9827114235283189