<a href="https://colab.research.google.com/github/NicoleLund/flight_delay_prediction/blob/model_prep_210819/data_manipulation_modeling/investigate_models/c_logistic_regression_delays.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# c_logistic_regression_delays
----

Written in Google Colab

By Nicole Lund 

This notebook builds a logistic regression model that predicts flight delays from 2017 flight performance data.

In [21]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
import math
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [2]:
# Seed NumPy's global random number generator so that the results of the
# notebook are reproducible from run to run. `np` is already imported in the
# dependencies cell, so no additional import is needed in this cell.
SEED = 1
np.random.seed(SEED)

# Read in the CSV model files

In [3]:
# Load the train/test feature tables (X_*) and label tables (y_*) from the
# project's S3 bucket into pandas DataFrames, one read per file, in order.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_url)
    for csv_url in (
        "https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_X_train.csv",
        "https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_X_test.csv",
        "https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_y_train.csv",
        "https://flight-delay-prediction.s3.us-west-1.amazonaws.com/2017_TUS_y_test.csv",
    )
]

In [4]:
# Preview the first three rows of the training features
X_train.head(n=3)

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,DL,OO,UA,WN,AA,EV,AS,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,ATL,DEN,DFW,HOU,IAH,JFK,LAS,LAX,MDW,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
0,5538,1120,1647,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5538,1219,1650,1437,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2068,1625,2040,1440,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first three rows of the training labels
y_train.head(n=3)

Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,0
2,0,0,0


## Logistic Regression Classifier Method

In [6]:
# Build the logistic regression classifier (lbfgs solver, up to 1000
# iterations) and fit it on the training features against the DELAY label.
# fit() returns the fitted estimator, so construction and fitting are chained.
# NOTE(review): in the recorded run, the classification report at the end of
# this notebook shows recall 0.00 for class 1 - class weighting and feature
# scaling may be worth investigating.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(
    X_train, y_train["DELAY"]
)

# Hyperparameter Tuning

In [12]:
# Set up a cross-validated grid search over the solver used by the model.
# Full candidate list that was tried:
# param_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
# Note: newton-cg, sag and saga did not converge, so only the two solvers
# below are searched.
param_grid = {
    'solver': ['lbfgs', 'liblinear'],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [13]:
# Run the grid search: every solver candidate is fitted and scored on the
# training data; the fitted search object is shown as the cell output.
grid.fit(X_train, y_train["DELAY"])

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] solver=lbfgs ....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................ solver=lbfgs, score=0.892, total=   0.2s
[CV] solver=lbfgs ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ........................ solver=lbfgs, score=0.893, total=   0.3s
[CV] solver=lbfgs ....................................................
[CV] ........................ solver=lbfgs, score=0.893, total=   0.2s
[CV] solver=lbfgs ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ........................ solver=lbfgs, score=0.892, total=   0.3s
[CV] solver=lbfgs ....................................................
[CV] ........................ solver=lbfgs, score=0.893, total=   0.2s
[CV] solver=liblinear ................................................
[CV] .................... solver=liblinear, score=0.892, total=   0.1s
[CV] solver=liblinear ................................................
[CV] .................... solver=liblinear, score=0.892, total=   0.1s
[CV] solver=liblinear ................................................
[CV] .................... solver=liblinear, score=0.892, total=   0.1s
[CV] solver=liblinear ................................................
[CV] .................... solver=liblinear, score=0.892, total=   0.1s
[CV] solver=liblinear ................................................
[CV] .................... solver=liblinear, score=0.892, total=   0.1s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.5s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'solver': ['lbfgs', 'liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [14]:
# Report the winning parameter combination and its cross-validation score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'solver': 'lbfgs'}
0.8928041592158422


# Score Model

In [15]:
# Score the fitted model on the held-out test set; the value is displayed as
# the cell output.
# NOTE(review): in the recorded run 4087 of the 4622 test rows are class 0,
# so always predicting "no delay" would already score about 0.884 - read this
# number against that baseline.
print('Logistic Regression Model Score:')
model.score(X_test, y_test["DELAY"])

Logistic Regression Model Score:


0.8836001730852445

# Make Predictions

In [28]:
# Predict the delay class for every test flight, then tabulate side by side
# how often each class occurs in the actual labels (DELAY) and in the
# model's predictions (DELAY_PREDICT).
predictions = model.predict(X_test)
results_delayed = pd.DataFrame(
    {"DELAY": y_test.DELAY, "DELAY_PREDICT": predictions}
)
# The top-level pd.value_counts helper is deprecated in recent pandas; the
# Series method yields the same per-column counts.
# A class that never occurs in one of the columns comes back as NaN, which
# would also turn the counts into floats - report it as an integer 0 instead.
delay_summary = (
    results_delayed.apply(pd.Series.value_counts).fillna(0).astype(int)
)
delay_summary

Unnamed: 0,DELAY,DELAY_PREDICT
0,4087,4617
1,535,5


In [30]:
# Per-class precision, recall and F1 for the test-set predictions
report_text = classification_report(y_test["DELAY"], predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      4087
           1       0.20      0.00      0.00       535

    accuracy                           0.88      4622
   macro avg       0.54      0.50      0.47      4622
weighted avg       0.81      0.88      0.83      4622

