In [2]:
import pandas as pd
import numpy as np
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from matplotlib import pyplot as plt

from keras.layers import Input, Dense
from keras.models import Model

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

%matplotlib inline

In [4]:
# Load the raw flight-delay data.
# The first CSV column is an unnamed, previously saved row index (it shows up as
# "Unnamed: 0" when read naively), so use it as the index instead of keeping it
# as a spurious data column.
df0 = pd.read_csv("flightDelay.csv", index_col=0)
df0.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,8,3,6,DL,10397,14747,1754,-3.0,0.0,2013,-5.0,0,0
1,2013,4,21,7,DL,11193,13204,905,-7.0,0.0,1112,2.0,0,0
2,2013,7,9,2,AA,11298,13487,2025,35.0,1.0,2245,25.0,1,0
3,2013,5,23,4,B6,11057,10721,1000,-9.0,0.0,1208,-18.0,0,0
4,2013,5,2,4,AA,13930,11298,740,-3.0,0.0,1010,-6.0,0,0


In [5]:
# Keep only the two predictors and the response, and drop rows with missing
# values (NA values lead to a NaN loss when fitting the network).
df = df0[["CRSDepTime", "DepDelay", "ArrDelay"]].dropna()
n = len(df)

# Positional 50/50 split: the first half is the training sample and the second
# half is the test sample. The test range starts exactly where the training
# range ends, so every row belongs to exactly one of the two samples.
n_half = n // 2
n_train = range(n_half)
n_test = range(n_half, n)

Re-scaling is indeed a tricky part.
It is recommended to estimate the mean and the scale from the training data only, and then use them to transform the test data.

In [6]:
# Training sample: the rows selected by the positional range n_train.
df_train = df.iloc[n_train]

# Fit the scaler ONCE on the training data and standardize it in the same call.
sc = StandardScaler()
df_tr = sc.fit_transform(df_train)

# Per-column mean and scale estimated from the training data; they are reused
# later to transform the test data with the training statistics.
df_mean = sc.mean_
df_dev = sc.scale_

In [7]:
# Wrap the standardized training array in a DataFrame so that the features and
# the response can be selected by column name.
df1 = pd.DataFrame(df_tr, columns=["CRSDepTime", "DepDelay", "ArrDelay"])

# Features: scheduled departure time and departure delay (two columns).
x_ = df1[["CRSDepTime", "DepDelay"]]
# Response: arrival delay (kept as a one-column DataFrame).
y_ = df1[["ArrDelay"]]

In [8]:
# A NaN loss during training was first suspected to be the issue explained at
# https://stackoverflow.com/questions/37232782/nan-loss-when-training-regression-network
# (about "loss: nan").

# Not really. The NaNs appeared because the data contained NA values.
# Use .dropna() to remove the NAs.

inputs = Input(shape=(2,)) # the number of columns of the training data X
preds = Dense(1, activation='linear') (inputs)

model = Model(inputs=inputs,outputs=preds)

sgd=keras.optimizers.SGD(lr = .0005)
# The learning rate is important.
# If it is set to a big number:
#   - the results are infinity or nan with lr = 0.5
#   - the results are very bad with lr = 0.1
# With lr = .0005 and epochs = 10, the number of epochs is too small, so each run returns different results.

model.compile(optimizer=sgd, loss='mse')

# According to ESL, the fitting uses the training period only,
# and it does not seek the global optimizer (such as OLS).
# This is true because so far the test data set has not been defined yet;
# instead, it uses an iterative algorithm between two parts of the parameters.
history = model.fit(x_, y_, batch_size=1, verbose=1, epochs=30, shuffle=False) 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### My Thought

Of course, this is still reasonable when the model is over-parametrized.
However, if the model is over-parametrized, a statistician will regularize the model
but will still seek the global minimizer of the regularized model.

Anyway, the iterative algorithm here is still trying to approach the global minimum.
It just terminates when it reaches the early-stopping number of iterations.
Asymptotically, this number still goes to infinity.
It only appears different because of the finite sample. (Am I just comforting myself?)

In summary, the global minimum is not needed, either in finite samples or asymptotically.
Remember Andrews' paper about k-bootstrap? It is sufficient if the optimizer
is close to the global minimizer up to a term of small order.

In this respect, OLS is an uncomfortable example because its global minimizer is too trivial.

In [9]:
# Symbolic trainable weight tensors of the model: the Dense layer's kernel and bias.
weights = model.trainable_weights
print(weights)
import keras.backend as K
# Fetch the numeric values of the weight tensors as a list of NumPy arrays.
# batch_get_value is the backend-agnostic way to do this; it does not require
# reaching for the TensorFlow session directly.
K.batch_get_value(weights)
# The parameters from such an algorithm have no clear interpretation.

[<tf.Variable 'dense_1/kernel:0' shape=(2, 1) dtype=float32_ref>, <tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32_ref>]


[array([[5.504938e-04],
        [9.126350e-01]], dtype=float32), array([-0.00030193], dtype=float32)]

Compared with OLS, the results are indeed very close. Moreover, the out-of-sample MSE is slightly smaller than that of OLS. This is expected because the parameters in Keras are chosen to minimize the out-of-sample MSE.

In [10]:
# In-sample (training) MSE of the fitted model.
score_train = model.evaluate(x_, y_, verbose=0) # report the MSE
print(score_train) # the global minimum in-sample MSE is 0.1358457

# Do not confuse test data with validation data:
# - test data is the final block of data that the researcher pretends
#   to be unobservable when fitting the model (see ESL page 222);
# - validation data is the block used to determine tuning parameters.
#
# This model is simple enough to have no tuning parameters, therefore a
# sample split between training data and validation data during fitting
# is unnecessary.

# Standardize the test sample with the mean and scale estimated from the
# TRAINING data, then evaluate the out-of-sample MSE.
df_test  = df.iloc[n_test]
df_te = (df_test - df_mean)/df_dev
x_te  = df_te[ ["CRSDepTime", "DepDelay"] ]
y_te  = df_te[ ["ArrDelay"] ]
score_test = model.evaluate(x_te, y_te, verbose=0) # report the MSE
print(score_test)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" to the output).
model.summary()

0.13735204420431896
0.17894379683394931
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 3         
Total params: 3
Trainable params: 3
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# NOTE(review): disabled plotting code. X_train, y_train and X_test are not defined
# in the cells shown above (the fitted data there are x_, y_ and x_te) -- TODO:
# update the names before re-enabling.
# plt.scatter(X_train[["DepDelay"]], y_train,color='black')
# plt.scatter(X_test[["DepDelay"]], model.predict(X_test), color='blue', linewidth=1)