<a href="https://colab.research.google.com/github/NicoleLund/flight_delay_prediction/blob/model_prep_210819/data_manipulation_modeling/investigate_models/e_nn_all_y.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# e_nn_all_y
----

Written in Google Colab

By Nicole Lund 

This workbook builds a neural net model for 2017 flight performance.

In [2]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [3]:
# Set the seed value for the notebook, so the results are reproducible
from numpy.random import seed
seed(1)

# Read in the csv model files

In [4]:
# Read the CSV files from AWS to Pandas Dataframe
X_train = pd.read_csv("https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_X_train.csv")
X_test = pd.read_csv("https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_X_test.csv")
y_train = pd.read_csv("https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_y_train.csv")
y_test = pd.read_csv("https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_y_test.csv")

In [5]:
X_train.head(3)

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,DL,OO,UA,WN,AA,EV,AS,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,ATL,DEN,DFW,HOU,IAH,JFK,LAS,LAX,MDW,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
0,5538,1120,1647,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5538,1219,1650,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2068,1625,2040,1440,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
y_train.head(3)

Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,0
2,0,0,0


In [7]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10784 entries, 0 to 10783
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OP_CARRIER_FL_NUM  10784 non-null  int64  
 1   CRS_DEP_TIME       10784 non-null  int64  
 2   CRS_ARR_TIME       10784 non-null  int64  
 3   DISTANCE           10784 non-null  int64  
 4   DL                 10784 non-null  float64
 5   OO                 10784 non-null  float64
 6   UA                 10784 non-null  float64
 7   WN                 10784 non-null  float64
 8   AA                 10784 non-null  float64
 9   EV                 10784 non-null  float64
 10  AS                 10784 non-null  float64
 11  Sunday             10784 non-null  float64
 12  Monday             10784 non-null  float64
 13  Tuesday            10784 non-null  float64
 14  Wednesday          10784 non-null  float64
 15  Thursday           10784 non-null  float64
 16  Friday             107

# Hyperparameter Tuning

In [8]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Function to create model, required for KerasClassifier
def create_model(neurons=40):
	# create model
	model = Sequential()
	model.add(Dense(neurons, input_dim=X_train.shape[1], activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(units=y_train.shape[1], activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [13]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Use scikit-learn to grid search the batch size and epochs

# create model
grid_model = KerasClassifier(build_fn=create_model, verbose=0)

# define the grid search parameters
batch_size = [100, 1000]
epochs = [100, 500]
neurons = [2500, 5000, 10000]
param_grid = dict(batch_size=batch_size, epochs=epochs, neurons=neurons)

# Apply GridSearchCV
grid = GridSearchCV(estimator=grid_model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("--------------------------")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



--------------------------
Best: 0.897904 using {'batch_size': 100, 'epochs': 500, 'neurons': 10000}
0.099685 (0.004806) with: {'batch_size': 100, 'epochs': 100, 'neurons': 2500}
0.032731 (0.042558) with: {'batch_size': 100, 'epochs': 100, 'neurons': 5000}
0.601484 (0.422773) with: {'batch_size': 100, 'epochs': 100, 'neurons': 10000}
0.066389 (0.044564) with: {'batch_size': 100, 'epochs': 500, 'neurons': 2500}
0.330737 (0.401140) with: {'batch_size': 100, 'epochs': 500, 'neurons': 5000}
0.897904 (0.005248) with: {'batch_size': 100, 'epochs': 500, 'neurons': 10000}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 2500}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 5000}
0.332489 (0.398381) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 10000}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 500, 'neurons': 2500}
0.069365 (0.047673) with: {'batch_size': 1000, 'epochs': 500, 'neurons': 5000}
0.099685 (0.004806) with: {'batch

## Neural Network Model

In [10]:
# Best: 0.897904 using {'batch_size': 100, 'epochs': 500, 'neurons': 10000}

# Create model
nn_model = Sequential()

# Define first layer
nn_model.add(Dense(units=10000,
                activation='relu', input_dim=X_train.shape[1]))

# Define output layer
nn_model.add(Dense(units=y_train.shape[1], activation='softmax'))    

# Review Model
print(nn_model.summary())

# Compile Model
nn_model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train model
nn_model.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=100,
    shuffle=True,
    verbose=0
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 10000)             380000    
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 30003     
Total params: 410,003
Trainable params: 410,003
Non-trainable params: 0
_________________________________________________________________
None


<keras.callbacks.History at 0x7f9d91d83b90>

# Score Model

In [11]:
# Evaluate the model using the testing data
model_loss, model_accuracy = nn_model.evaluate(
    X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

145/145 - 0s - loss: 33130.4531 - accuracy: 0.8912
Loss: 33130.453125, Accuracy: 0.8911726474761963


# Make **Predictions**

In [14]:
predictions = nn_model.predict(X_test)

In [16]:
pd.DataFrame({"Prediction": predictions[:,0], "Actual": y_test.CANCELLED})

Unnamed: 0,Prediction,Actual
0,1.0,0
1,1.0,0
2,1.0,0
3,1.0,0
4,1.0,0
...,...,...
4617,1.0,0
4618,1.0,0
4619,1.0,0
4620,1.0,0


In [17]:
pd.DataFrame({"Prediction": predictions[:,1], "Actual": y_test.DIVERTED})

Unnamed: 0,Prediction,Actual
0,0.0,0
1,0.0,0
2,0.0,0
3,0.0,0
4,0.0,0
...,...,...
4617,0.0,0
4618,0.0,0
4619,0.0,0
4620,0.0,0


In [15]:
pd.DataFrame({"Prediction": predictions[:,2], "Actual": y_test.DELAY})

Unnamed: 0,Prediction,Actual
0,0.0,0
1,0.0,0
2,0.0,0
3,0.0,0
4,0.0,1
...,...,...
4617,0.0,0
4618,0.0,0
4619,0.0,0
4620,0.0,1


In [18]:
from google.colab import files
nn_model.save("neuralNetwork_allY_model.h5")
files.download('neuralNetwork_allY_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# This model is not suitable. Visual inspection of the predictions shows that it does not perform well despite having reasonable accuracy.