<a href="https://colab.research.google.com/github/NicoleLund/flight_delay_prediction/blob/model_prep_210819/data_manipulation_modeling/feature_assessment/model_preparation_2015to2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# model_preparation_2015to2017
----

Written in Google Colab

By Nicole Lund 

This workbook investigates prepares 2017 flight performance for models.

In [1]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib
from google.colab import files

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [2]:
# Set the seed value for the notebook, so the results are reproducible
from numpy.random import seed
seed(1)

# Read the clean CSV

In [40]:
dtype_definition={ \
    'CRS_DEP_TIME': int, \
    # 'DEP_TIME': int, \
    # 'DEP_DELAY': int, \
    # 'TAXI_OUT': int, \
    # 'WHEELS_OFF': int, \
    # 'WHEELS_ON': int, \
    # 'TAXI_IN': int, \
    'CRS_ARR_TIME': int, \
    # 'ARR_TIME': int, \
    # 'ARR_DELAY': int, \
    'CANCELLED': int, \
    'CANCELLATION_CODE': str, \
    'DIVERTED': int, \
    'CRS_ELAPSED_TIME': int, \
    # 'ACTUAL_ELAPSED_TIME': int, \
    # 'AIR_TIME': int, \
    'DISTANCE': int, \
    # 'CARRIER_DELAY': int, \
    # 'WEATHER_DELAY': int, \
    # 'NAS_DELAY': int, \
    # 'SECURITY_DELAY': int, \
    # 'LATE_AIRCRAFT_DELAY': int, \
    'DELAY': int}

In [43]:
# Read the CSV file from AWS to Pandas Dataframe
df_2015 = pd.read_csv("https://finalproject-3.s3.us-west-1.amazonaws.com/2015_TUS.csv", dtype=dtype_definition)
df_2016 = pd.read_csv("https://finalproject-3.s3.us-west-1.amazonaws.com/2016_TUS.csv", dtype=dtype_definition)
df_2017 = pd.read_csv("https://finalproject-3.s3.us-west-1.amazonaws.com/2017_TUS.csv", dtype=dtype_definition)

In [44]:
# Append DataFrames together
df_1 = df_2015.append(df_2016)
df = df_1.append(df_2017)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47917 entries, 0 to 15405
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   origin_city_name     47917 non-null  object 
 1   dest_city_name       47917 non-null  object 
 2   FL_DATE              47917 non-null  object 
 3   OP_CARRIER           47917 non-null  object 
 4   OP_CARRIER_FL_NUM    47917 non-null  int64  
 5   ORIGIN               47917 non-null  object 
 6   DEST                 47917 non-null  object 
 7   CRS_DEP_TIME         47917 non-null  int64  
 8   DEP_TIME             47538 non-null  float64
 9   DEP_DELAY            47538 non-null  float64
 10  TAXI_OUT             47516 non-null  float64
 11  WHEELS_OFF           47516 non-null  float64
 12  WHEELS_ON            47503 non-null  float64
 13  TAXI_IN              47503 non-null  float64
 14  CRS_ARR_TIME         47917 non-null  int64  
 15  ARR_TIME             47503 non-null 

# Pre-processing

In [46]:
# Split dataframe into X and y
# Define model variables

# Model input
X_categorical_df = df[['OP_CARRIER', 'day_of_week', 'DEST']]
X_numeric_df = df[['OP_CARRIER_FL_NUM', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'DISTANCE']]
X_numeric_df.DISTANCE = X_numeric_df.DISTANCE.astype(int)

# Model output
y_df = df[['CANCELLED', 'DIVERTED', 'DELAY']]
y_df.CANCELLED = y_df.CANCELLED.astype(int)
y_df.DIVERTED = y_df.DIVERTED.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [47]:
# Review model output
print(y_df.info())
y_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47917 entries, 0 to 15405
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   CANCELLED  47917 non-null  int64
 1   DIVERTED   47917 non-null  int64
 2   DELAY      47917 non-null  int64
dtypes: int64(3)
memory usage: 1.5 MB
None


Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,0
2,0,0,0


In [48]:
# Review model numeric input
print(X_numeric_df.info())
X_numeric_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47917 entries, 0 to 15405
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   OP_CARRIER_FL_NUM  47917 non-null  int64
 1   CRS_DEP_TIME       47917 non-null  int64
 2   CRS_ARR_TIME       47917 non-null  int64
 3   DISTANCE           47917 non-null  int64
dtypes: int64(4)
memory usage: 1.8 MB
None


Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE
0,1240,1358,1923,1541
1,1240,1358,1923,1541
2,1487,700,1221,1541


In [49]:
# Review model categorical input
print(X_categorical_df.info())
X_categorical_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47917 entries, 0 to 15405
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OP_CARRIER   47917 non-null  object
 1   day_of_week  47917 non-null  object
 2   DEST         47917 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB
None


Unnamed: 0,OP_CARRIER,day_of_week,DEST
0,DL,Thursday,ATL
1,DL,Friday,ATL
2,DL,Friday,ATL


In [50]:
# Get categories for one hot encoder
carriers = pd.unique(X_categorical_df.OP_CARRIER)
day = pd.unique(X_categorical_df.day_of_week)
destination = pd.unique(X_categorical_df.DEST)
print(carriers)
print(day)
print(destination)

['DL' 'OO' 'WN' 'EV' 'UA' 'AA' 'US' 'AS']
['Thursday' 'Friday' 'Saturday' 'Sunday' 'Monday' 'Tuesday' 'Wednesday']
['ATL' 'DEN' 'DFW' 'HOU' 'IAH' 'LAS' 'LAX' 'MDW' 'MSP' 'ORD' 'PDX' 'PHX'
 'SAN' 'SEA' 'SFO' 'SLC' 'JFK' 'OAK' 'SJC']


In [51]:
column_names = ['DL', 'OO', 'WN', 'EV', 'UA', 'AA', 'US', 'AS', \
 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', \
 'ATL', 'DEN', 'DFW', 'HOU', 'IAH', 'LAS', 'LAX', 'MDW', 'MSP', 'ORD', 'PDX', 'PHX', \
 'SAN', 'SEA', 'SFO', 'SLC', 'JFK', 'OAK', 'SJC']

In [52]:
# One-hot-encode text object fields
encoder = OneHotEncoder(categories=[carriers,day,destination])
encoder.fit(X_categorical_df)
X_encoded = encoder.transform(X_categorical_df)
X_encoded.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [53]:
X_encoded_df = pd.DataFrame(X_encoded.toarray(), columns=column_names)
X_encoded_df.head(3)

Unnamed: 0,DL,OO,WN,EV,UA,AA,US,AS,Thursday,Friday,Saturday,Sunday,Monday,Tuesday,Wednesday,ATL,DEN,DFW,HOU,IAH,LAS,LAX,MDW,MSP,ORD,PDX,PHX,SAN,SEA,SFO,SLC,JFK,OAK,SJC
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
X_df = X_numeric_df
X_df = X_df.join(X_encoded_df)
X_df.head(3)

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,DL,OO,WN,EV,UA,AA,US,AS,Thursday,Friday,Saturday,Sunday,Monday,Tuesday,Wednesday,ATL,DEN,DFW,HOU,IAH,LAS,LAX,MDW,MSP,ORD,PDX,PHX,SAN,SEA,SFO,SLC,JFK,OAK,SJC
0,1240,1358,1923,1541,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1240,1425,1951,1541,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2066,1430,2003,1541,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Split X and y into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.3, random_state=42)

In [56]:
# Display training data
X_train.head()

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,DL,OO,WN,EV,UA,AA,US,AS,Thursday,Friday,Saturday,Sunday,Monday,Tuesday,Wednesday,ATL,DEN,DFW,HOU,IAH,LAS,LAX,MDW,MSP,ORD,PDX,PHX,SAN,SEA,SFO,SLC,JFK,OAK,SJC
15094,4769,1114,1405,601,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6930,1973,1840,1955,365,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7179,1334,630,705,451,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1689,5479,800,1059,639,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3251,2304,1355,1805,813,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Scale the data with MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the output to csv files

In [58]:
X_train.to_csv('2015to2017_TUS_X_train.csv', index = False)
files.download('2015to2017_TUS_X_train.csv')
y_train.to_csv('2015to2017_TUS_y_train.csv', index = False)
files.download('2015to2017_TUS_y_train.csv')
X_test.to_csv('2015to2017_TUS_X_test.csv', index = False)
files.download('2015to2017_TUS_X_test.csv')
y_test.to_csv('2015to2017_TUS_y_test.csv', index = False)
files.download('2015to2017_TUS_y_test.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>