## Using SciKit-Learn MLPRegressor Neural Net class for Arrival Delay

This notebook will use the same dataset as the EDx *principles of machine learning* to predict flight delays with the scikit learn MLPRegressor class.

See the jupyter notebook in :  *edx/priciples_of_ml/Principles Of Machine Learning DAT203.2* 


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('../data/azureml/dat203/Flight-Delays-Data.csv')

In [3]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,4,19,5,DL,11433,13303,837,-3.0,0.0,1138,1.0,0.0,0.0
1,2013,4,19,5,DL,14869,12478,1705,0.0,0.0,2336,-8.0,0.0,0.0
2,2013,4,19,5,DL,14057,14869,600,-4.0,0.0,851,-15.0,0.0,0.0
3,2013,4,19,5,DL,15016,11433,1630,28.0,1.0,1903,24.0,1.0,0.0
4,2013,4,19,5,DL,11193,12892,1615,-6.0,0.0,1805,-11.0,0.0,0.0


In [4]:
df.dropna(subset=['ArrDelay'], how='any', inplace=True)

In [5]:
# One Hot Encoding / dummies
# Represent categorical variables as binary vectors
# prefix=['Carrier_','DestAirportID_','OriginAirportID_'],
df = pd.get_dummies(df,  columns=['Carrier','DestAirportID','OriginAirportID'],sparse=False)
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,...,OriginAirportID_14747,OriginAirportID_14771,OriginAirportID_14831,OriginAirportID_14843,OriginAirportID_14869,OriginAirportID_14893,OriginAirportID_14908,OriginAirportID_15016,OriginAirportID_15304,OriginAirportID_15376
0,2013,4,19,5,837,-3.0,0.0,1138,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2013,4,19,5,1705,0.0,0.0,2336,-8.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,2013,4,19,5,600,-4.0,0.0,851,-15.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2013,4,19,5,1630,28.0,1.0,1903,24.0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,2013,4,19,5,1615,-6.0,0.0,1805,-11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Drop all columns not used in the modeling process
df.drop(['Year', 'DayofMonth', 'CRSDepTime', 'CRSArrTime', 'ArrDel15', 'Cancelled', 'DepDel15'], axis=1, inplace=True)
df.head()

Unnamed: 0,Month,DayOfWeek,DepDelay,ArrDelay,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,Carrier_EV,...,OriginAirportID_14747,OriginAirportID_14771,OriginAirportID_14831,OriginAirportID_14843,OriginAirportID_14869,OriginAirportID_14893,OriginAirportID_14908,OriginAirportID_15016,OriginAirportID_15304,OriginAirportID_15376
0,4,5,-3.0,1.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4,5,0.0,-8.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,4,5,-4.0,-15.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,5,28.0,24.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,4,5,-6.0,-11.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X = df.drop(['ArrDelay'], axis=1)
y = df['ArrDelay']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2017788, 159)
(2017788,)
(672597, 159)
(672597,)


## MLPRegressor

In [11]:
mlp = MLPRegressor(hidden_layer_sizes=(5,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)

In [12]:
mlp.fit(X_train, y_train)

MLPRegressor(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5,), learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [13]:
score = mlp.score(X_test, y_test)

In [14]:
print(score)

0.8910055150665968


In [17]:
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)
mlp.fit(X_train, y_train)
score = mlp.score(X_test, y_test)
print(score)


0.8928705629593798


In [15]:
mlp = MLPRegressor(hidden_layer_sizes=(5,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)
scores = cross_val_score(mlp, X, y)

In [20]:
print(scores)

[0.88605624 0.89930171 0.86303822]


## Make Predictions on the Test Set


In [21]:
df_test_set = pd.read_csv('../data/azureml/dat203/Test-Flights.csv')
# create dummies out of the test set columns
df_test_set = pd.get_dummies(df_test_set,  columns=['Carrier','DestAirportID','OriginAirportID'],sparse=False)
df_test_set.drop(['Year', 'DayofMonth', 'CRSDepTime', 'CRSArrTime', 'ArrDel15', 'Cancelled', 'DepDel15'], axis=1, inplace=True)
X2 = df_test_set.drop(['ArrDelay'], axis=1)
# for everyone column that is in the training data, that is NOT in the test set, add a column of zeros.
# we can do this because of the dummies, and if the column did not exist, it has a zero.
for c_x in X.columns:
    if c_x not in X2.columns:
        #print(f"X column not in test columns: {c_x}")
        X2[c_x] = 0
        
X2.drop(['Row ID'], axis=1, inplace=True)

In [22]:
mlp = MLPRegressor(hidden_layer_sizes=(5,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)
mlp.fit(X, y)

MLPRegressor(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5,), learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [23]:
y2_predictions = mlp.predict(X2)
for i, y2_pred in enumerate(y2_predictions):
    print(f"{i+1}: {y2_pred}")


1: 7.286335786244964
2: -2.2999834361367655
3: -3.4537685709923474
4: -9.646549394836173
5: 7.28948143809011
6: 19.202464436173706
7: -3.0607936987908793
8: 282.1602055638549
9: -0.9689804843834935
10: -14.552563143327399
11: -5.785599411847075
12: 11.383298770669613
13: -11.625006714276445
14: -7.089062994488142
15: -5.304671630175276
16: -2.072817224616957
17: -0.6247614865973747
18: -10.275622998351189
19: -1.0433361116026232
20: -3.750247907818547
21: 1.5288362009686218
22: 3.6611511282128166
23: -9.672438892794828
24: 6.491374681647733
25: 8.675642783992632


### Scored Results from original submission

3.7725156569882055

-3.8938305993689077 - Wrong

-2.8607141466171035

-8.559298023634756

10.995361938712417 - Wrong

21.022128070393833

-2.805496001663778 - Wrong

279.5788357328236

-0.8499371728121856

-12.53619955079647 - Wrong

-9.474685085304962

14.173694794031123

-11.367471700298589

-8.385155635822157

-1.1477647726290232

-1.2994777816885998

-2.078893274211619

-10.332268058516082

-2.4180271470933983

-2.397893311691361

1.2571905723881875

3.1713591494909856 - Wrong

-10.105779900346644

8.049489003493031

11.093320782452446


First Attempt was 20 /25

