In [139]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [109]:
# Load the labelled training set and the unlabelled test set from the
# current working directory.
train_df = pd.read_csv("flight_delays_train.csv")
test_df = pd.read_csv("flight_delays_test.csv")

In [110]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
Month                100000 non-null object
DayofMonth           100000 non-null object
DayOfWeek            100000 non-null object
DepTime              100000 non-null int64
UniqueCarrier        100000 non-null object
Origin               100000 non-null object
Dest                 100000 non-null object
Distance             100000 non-null int64
dep_delayed_15min    100000 non-null object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [111]:
# Per-column missing-value report: absolute count and percentage (1 decimal),
# both sorted from most to least missing.
null_mask = train_df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

Unnamed: 0,Total,%
dep_delayed_15min,0,0.0
Distance,0,0.0
Dest,0,0.0
Origin,0,0.0
UniqueCarrier,0,0.0
DepTime,0,0.0
DayOfWeek,0,0.0
DayofMonth,0,0.0
Month,0,0.0


In [112]:
# Show the first training row to see the raw value formats (e.g. the 'c-8' style codes).
train_df.head(1)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N


In [113]:
# Show the first test row; the test file has the same features but no target column.
test_df.head(1)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598


In [114]:
# Separate the target from the features and encode it as 0 ('N') / 1 ('Y').
# `pop` removes the column from train_df in place, so this cell cannot be
# re-run without reloading the data first.
label_codes = {'N': 0, 'Y': 1}
train_y = train_df.pop('dep_delayed_15min').map(label_codes)
train_y

0        0
1        0
2        0
3        0
4        1
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: dep_delayed_15min, Length: 100000, dtype: int64

In [115]:
# Remember how many rows belong to the training set, then stack train on top
# of test so the feature engineering below is applied to both in one pass.
# The original row labels are kept, so both halves start again at 0.
train_split = len(train_df)
full_df = pd.concat([train_df, test_df])

In [116]:
# Hour and minute: DepTime is an HHMM integer, so integer division / modulo
# by 100 separates the two parts. Hours 24 and 25 are wrapped to 0 and 1.
dep_time = full_df['DepTime']
full_df['hour'] = (dep_time // 100).replace({24: 0, 25: 1})
full_df['minute'] = dep_time % 100

In [117]:
# Season: one 0/1 indicator column per season, derived from the raw 'c-<n>'
# month codes (must run before 'Month' is converted to integers).
season_months = {
    'summer': ['c-6', 'c-7', 'c-8'],
    'autumn': ['c-9', 'c-10', 'c-11'],
    'winter': ['c-12', 'c-1', 'c-2'],
    'spring': ['c-3', 'c-4', 'c-5'],
}
for season, month_codes in season_months.items():
    full_df[season] = full_df['Month'].isin(month_codes).astype(np.int32)

In [118]:
# Daytime: bucket the departure hour into four intervals with edges at
# 0, 6, 12, 18 and 23. include_lowest=True closes the first interval on the
# left so that hour 0 is not dropped.
hour_bins = [0, 6, 12, 18, 23]
full_df['daytime'] = pd.cut(full_df['hour'], bins=hour_bins, include_lowest=True)

In [119]:
# String to numerical: take the number after the dash in codes such as 'c-8'
# and shift it to be zero-based. Not re-runnable: after the first run the
# columns hold integers, not strings.
for col in ('Month', 'DayofMonth', 'DayOfWeek'):
    number_part = full_df[col].str.split('-').str[1]
    full_df[col] = number_part.astype(np.int32) - 1

In [120]:
# Label encoding: replace every distinct value with an integer code assigned
# in order of first appearance.
for col in ('Origin', 'Dest', 'UniqueCarrier', 'daytime'):
    codes, _uniques = pd.factorize(full_df[col])
    full_df[col] = codes

In [121]:
# Columns that are treated as categorical features in the following cell.
cat_cols = [
    'Month', 'DayofMonth', 'DayOfWeek',
    'Origin', 'Dest', 'UniqueCarrier',
    'hour',
    'summer', 'autumn', 'winter', 'spring',
    'daytime',
]

In [122]:
# Cast every column listed in cat_cols to pandas' 'category' dtype.
# The original note says this is required by LGBM; no LightGBM model is
# visible in this part of the notebook -- TODO confirm it is still needed.
full_df = full_df.astype({name: 'category' for name in cat_cols})

In [123]:
# Split the engineered frame back into its train and test halves using the
# row count saved before concatenation, then show the shapes as a sanity check.
train_df = full_df.iloc[:train_split]
test_df = full_df.iloc[train_split:]
train_df.shape, train_y.shape, test_df.shape

((100000, 15), (100000,), (100000, 15))

In [124]:
# Display the engineered training features (pandas truncates the middle rows).
train_df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,minute,summer,autumn,winter,spring,daytime
0,7,20,6,1934,0,0,0,732,19,34,1,0,0,0,0
1,3,19,2,1548,1,1,1,834,15,48,0,0,0,1,1
2,8,1,4,1422,2,2,2,416,14,22,0,1,0,0,1
3,10,24,5,1015,3,3,3,872,10,15,0,1,0,0,2
4,9,6,5,1828,4,4,4,423,18,28,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,3,2,1618,3,28,143,199,16,18,0,0,0,1,1
99996,0,17,2,804,11,60,163,884,8,4,0,0,1,0,2
99997,0,23,1,1901,5,66,12,1076,19,1,0,0,1,0,0
99998,3,26,3,1515,10,12,226,140,15,15,0,0,0,1,1


In [125]:
# Display the engineered test features (pandas truncates the middle rows).
test_df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,minute,summer,autumn,winter,spring,daytime
0,6,24,2,615,14,139,17,598,6,15,1,0,0,0,3
1,3,16,1,739,4,11,91,1235,7,39,0,0,0,1,2
2,11,1,6,651,10,29,11,577,6,51,0,0,1,0,3
3,2,24,6,1614,4,42,138,377,16,14,0,0,0,1,1
4,5,5,2,1505,9,23,8,258,15,5,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5,4,1,852,4,44,91,187,8,52,1,0,0,0,2
99996,10,23,5,1446,9,23,31,1515,14,46,0,1,0,0,1
99997,0,29,1,1509,3,23,90,438,15,9,0,0,1,0,1
99998,0,4,4,804,6,27,13,761,8,4,0,0,1,0,2


In [126]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a 100-tree random forest on the
# training data. random_state pins the forest's bootstrap and feature
# sampling so the printed scores are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf, train_df, train_y, cv=10, scoring="accuracy")

print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.81931807 0.82121788 0.82151785 0.81991801 0.8185     0.8177
 0.81778178 0.8179818  0.82038204 0.82078208]
Mean: 0.8195099495588994
Standard Deviation: 0.0013837845205111761


In [127]:
# Fit a 100-tree random-forest classifier on the full training set and
# predict hard 0/1 labels for the test set. random_state makes the fit,
# the predictions and the importances below reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(train_df, train_y)

Y_prediction = random_forest.predict(test_df)

# Accuracy (in percent) on the same rows the model was fitted on, so it is an
# optimistic figure rather than a hold-out estimate. score() is called once;
# the earlier duplicate call whose result was discarded has been removed.
acc_random_forest = round(random_forest.score(train_df, train_y) * 100, 2)

# Impurity-based feature importances, rounded to 3 decimals and sorted from
# most to least important.
importances = pd.DataFrame({
    'feature': train_df.columns,
    'importance': np.round(random_forest.feature_importances_, 3),
})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(15)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
DepTime,0.148
Distance,0.138
Dest,0.116
Origin,0.112
minute,0.108
DayofMonth,0.103
UniqueCarrier,0.08
DayOfWeek,0.059
hour,0.053
Month,0.045


In [128]:
# Remove the four season indicator columns from both frames before fitting
# the models in the cells below.
season_cols = ["summer", "winter", "spring", "autumn"]
train_df = train_df.drop(columns=season_cols)
test_df = test_df.drop(columns=season_cols)

In [129]:
# Fill the sample submission with whatever Y_prediction currently holds and
# write it to 'rfc0.csv'.
sub = pd.read_csv("sample_submission.csv")
# Item assignment always writes a DataFrame column; attribute-style assignment
# would silently create a plain attribute if the column were missing.
sub['dep_delayed_15min'] = Y_prediction
sub.to_csv('rfc0.csv', index=False)
sub

Unnamed: 0,id,dep_delayed_15min
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
99995,99995,0
99996,99996,0
99997,99997,0
99998,99998,0


In [134]:
# Random-forest *regressor* fitted on the 0/1 target, so its predictions are
# continuous scores rather than class labels.
# Only non-default hyperparameters are passed. The original call also spelled
# out criterion='mse', max_features='auto' and min_impurity_split=None, which
# were the defaults at the time and are rejected by current scikit-learn;
# omitting them keeps the same model on both old and new versions.
random_forest = RandomForestRegressor(
    n_estimators=2000,
    max_depth=30,
    min_samples_split=28,
    oob_score=True,
    n_jobs=-1,
    random_state=42,  # reproducible bootstrap / feature sampling
)
random_forest.fit(train_df, train_y)

Y_prediction = random_forest.predict(test_df)

# For a regressor, score() is the R^2 on the given data, not an accuracy; the
# variable name is kept because a later cell displays it. Computed once on the
# training rows (the duplicate, discarded call has been removed).
acc_random_forest = round(random_forest.score(train_df, train_y) * 100, 2)


In [135]:
# Display the predictions produced by the most recently executed model cell.
Y_prediction

array([0.03724661, 0.02575653, 0.05091585, ..., 0.19526576, 0.1165716 ,
       0.14886793])

In [136]:
# Display the stored training-set score (score() * 100, rounded to 2 decimals).
acc_random_forest

45.38

In [137]:
# Fill the sample submission with whatever Y_prediction currently holds and
# write it to 'rfr3.csv'.
sub = pd.read_csv("sample_submission.csv")
# Item assignment always writes a DataFrame column; attribute-style assignment
# would silently create a plain attribute if the column were missing.
sub['dep_delayed_15min'] = Y_prediction
sub.to_csv('rfr3.csv', index=False)
sub

Unnamed: 0,id,dep_delayed_15min
0,0,0.037247
1,1,0.025757
2,2,0.050916
3,3,0.299793
4,4,0.215473
...,...,...
99995,99995,0.032759
99996,99996,0.117124
99997,99997,0.195266
99998,99998,0.116572


In [147]:
# Gradient-boosting regressor fitted on the 0/1 target, so its predictions are
# continuous scores rather than class labels.
# The original call listed every constructor argument at its default value,
# including loss='ls', min_impurity_split=None and presort='auto', which are
# rejected by current scikit-learn. Passing only the main hyperparameters
# keeps the same model on both old and new versions.
gbr = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    max_depth=3,
    random_state=42,  # reproducible tie-breaking between equally good splits
)
gbr.fit(train_df, train_y)

Y_prediction = gbr.predict(test_df)

# For a regressor, score() is the R^2 on the given data, not an accuracy.
# Computed once on the training rows (the duplicate, discarded call has been
# removed).
acc_gradient_boost_regr = round(gbr.score(train_df, train_y) * 100, 2)


In [145]:
# Display the predictions produced by the most recently executed model cell.
Y_prediction

array([0.05829017, 0.05626619, 0.02197654, ..., 0.19641385, 0.11400273,
       0.08593312])

In [148]:
# Display the stored training-set score of the gradient-boosting model
# (score() * 100, rounded to 2 decimals).
acc_gradient_boost_regr

11.01

In [146]:
# Fill the sample submission with whatever Y_prediction currently holds and
# write it to 'gbr1.csv'.
sub = pd.read_csv("sample_submission.csv")
# Item assignment always writes a DataFrame column; attribute-style assignment
# would silently create a plain attribute if the column were missing.
sub['dep_delayed_15min'] = Y_prediction
sub.to_csv('gbr1.csv', index=False)
sub

Unnamed: 0,id,dep_delayed_15min
0,0,0.058290
1,1,0.056266
2,2,0.021977
3,3,0.237348
4,4,0.232228
...,...,...
99995,99995,0.064740
99996,99996,0.221511
99997,99997,0.196414
99998,99998,0.114003
