# e_nn_all_y_dec_hrs
----

Written in Google Colab

By Nicole Lund 

This workbook builds a neural net model for 2017 flight performance using decimal hours for input.

In [1]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [2]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Tensorflow version 2.6.0
Running on TPU  ['10.14.230.186:8470']
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.14.230.186:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.14.230.186:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [3]:
# Set the seed value for the notebook, so the results are reproducible
from numpy.random import seed
seed(1)

# Read in the csv model files

In [4]:
# Read the CSV files from AWS to Pandas Dataframe
X_train = pd.read_csv("https://raw.githubusercontent.com/NicoleLund/flight_delay_prediction/model_refine_210821/data_manipulation_modeling/feature_assessment/2017_TUS_X_train_dec_hrs.csv")
X_test = pd.read_csv("https://raw.githubusercontent.com/NicoleLund/flight_delay_prediction/model_refine_210821/data_manipulation_modeling/feature_assessment/2017_TUS_X_test_dec_hrs.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/NicoleLund/flight_delay_prediction/model_refine_210821/data_manipulation_modeling/feature_assessment/2017_TUS_y_train_dec_hrs.csv")
y_test = pd.read_csv("https://raw.githubusercontent.com/NicoleLund/flight_delay_prediction/model_refine_210821/data_manipulation_modeling/feature_assessment/2017_TUS_y_test_dec_hrs.csv")

In [5]:
X_train.head(3)

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_hours,CRS_ARR_hours,DISTANCE,DL,OO,UA,WN,AA,EV,AS,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,ATL,DEN,DFW,HOU,IAH,JFK,LAS,LAX,MDW,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
0,5538,11.333333,16.783333,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5538,12.316667,16.833333,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2068,16.416667,20.666667,1440,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
y_train.head(3)

Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,0
2,0,0,0


In [7]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10784 entries, 0 to 10783
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OP_CARRIER_FL_NUM  10784 non-null  int64  
 1   CRS_DEP_hours      10784 non-null  float64
 2   CRS_ARR_hours      10784 non-null  float64
 3   DISTANCE           10784 non-null  int64  
 4   DL                 10784 non-null  float64
 5   OO                 10784 non-null  float64
 6   UA                 10784 non-null  float64
 7   WN                 10784 non-null  float64
 8   AA                 10784 non-null  float64
 9   EV                 10784 non-null  float64
 10  AS                 10784 non-null  float64
 11  Sunday             10784 non-null  float64
 12  Monday             10784 non-null  float64
 13  Tuesday            10784 non-null  float64
 14  Wednesday          10784 non-null  float64
 15  Thursday           10784 non-null  float64
 16  Friday             107

# Hyperparameter Tuning

In [8]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Function to create model, required for KerasClassifier
def create_model(neurons=40):
	# create model
	model = Sequential()
	model.add(Dense(neurons, input_dim=X_train.shape[1], activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(units=y_train.shape[1], activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [10]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Use scikit-learn to grid search the batch size and epochs

# create model
with tpu_strategy.scope():
  grid_model = KerasClassifier(build_fn=create_model, verbose=0)

# define the grid search parameters
batch_size = [100, 1000]
epochs = [100, 500]
neurons = [2500, 5000, 10000]
param_grid = dict(batch_size=batch_size, epochs=epochs, neurons=neurons)

# Apply GridSearchCV
grid = GridSearchCV(estimator=grid_model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("--------------------------")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

--------------------------
Best: 0.633555 using {'batch_size': 100, 'epochs': 500, 'neurons': 10000}
0.100242 (0.005289) with: {'batch_size': 100, 'epochs': 100, 'neurons': 2500}
0.066389 (0.044564) with: {'batch_size': 100, 'epochs': 100, 'neurons': 5000}
0.035707 (0.047941) with: {'batch_size': 100, 'epochs': 100, 'neurons': 10000}
0.596838 (0.420653) with: {'batch_size': 100, 'epochs': 500, 'neurons': 2500}
0.099685 (0.004806) with: {'batch_size': 100, 'epochs': 500, 'neurons': 5000}
0.633555 (0.375445) with: {'batch_size': 100, 'epochs': 500, 'neurons': 10000}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 2500}
0.035707 (0.047941) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 5000}
0.332489 (0.398381) with: {'batch_size': 1000, 'epochs': 100, 'neurons': 10000}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 500, 'neurons': 2500}
0.099685 (0.004806) with: {'batch_size': 1000, 'epochs': 500, 'neurons': 5000}
0.340112 (0.401988) with: {'batch

## Neural Network Model

In [12]:
# Best: 0.633555 using {'batch_size': 100, 'epochs': 500, 'neurons': 10000}

with tpu_strategy.scope():

  # Create model
  nn_model = Sequential()

  # Define first layer
  nn_model.add(Dense(units=10000,
                  activation='relu', input_dim=X_train.shape[1]))

  # Define output layer
  nn_model.add(Dense(units=y_train.shape[1], activation='softmax'))    

  # Review Model
  print(nn_model.summary())

  # Compile Model
  nn_model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

  # Train model
  nn_model.fit(
      X_train,
      y_train,
      epochs=500,
      batch_size=100,
      shuffle=True,
      verbose=0
  )

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 10000)             380000    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 30003     
Total params: 410,003
Trainable params: 410,003
Non-trainable params: 0
_________________________________________________________________
None


# Score Model

In [13]:
# Evaluate the model using the testing data
model_loss, model_accuracy = nn_model.evaluate(
    X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

145/145 - 3s - loss: 13295.6055 - accuracy: 0.1069
Loss: 13295.60546875, Accuracy: 0.10688013583421707


In [15]:
from google.colab import files
nn_model.save("neuralNetwork_allY_dec_hrs_model.h5")
files.download('neuralNetwork_allY_dec_hrs_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Make **Predictions**

In [14]:
predictions = nn_model.predict(X_test)

In [16]:
results_cancelled = pd.DataFrame({ \
            "CANCELLED": y_test.CANCELLED,"CANCELLED_PREDICT": predictions[:,0]})
results_diverted = pd.DataFrame({ \
            "DIVERTED": y_test.DIVERTED,"DIVERTED_PREDICT": predictions[:,1]})
results_delayed = pd.DataFrame({ \
            "DELAY": y_test.DELAY, "DELAY_PREDICT": predictions[:,2]})

In [17]:
cancel_summary = results_cancelled.apply(pd.value_counts)
cancel_summary

Unnamed: 0,CANCELLED,CANCELLED_PREDICT
0.0,4590,4622.0
1.0,32,


In [18]:
print(classification_report(y_test.CANCELLED, predictions[:,0]))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4590
           1       0.00      0.00      0.00        32

    accuracy                           0.99      4622
   macro avg       0.50      0.50      0.50      4622
weighted avg       0.99      0.99      0.99      4622



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
diverted_summary = results_diverted.apply(pd.value_counts)
diverted_summary

Unnamed: 0,DIVERTED,DIVERTED_PREDICT
0.0,4613,4622.0
1.0,9,


In [20]:
print(classification_report(y_test.DIVERTED, predictions[:,1]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4613
           1       0.00      0.00      0.00         9

    accuracy                           1.00      4622
   macro avg       0.50      0.50      0.50      4622
weighted avg       1.00      1.00      1.00      4622



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
delay_summary = results_delayed.apply(pd.value_counts)
delay_summary

Unnamed: 0,DELAY,DELAY_PREDICT
0.0,4087,
1.0,535,4622.0


In [22]:
print(classification_report(y_test.DELAY, predictions[:,2]))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4087
           1       0.12      1.00      0.21       535

    accuracy                           0.12      4622
   macro avg       0.06      0.50      0.10      4622
weighted avg       0.01      0.12      0.02      4622



  _warn_prf(average, modifier, msg_start, len(result))
