<a href="https://colab.research.google.com/github/NicoleLund/flight_delay_prediction/blob/model_prep_210819/data_manipulation_modeling/feature_assessment/model_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# model_preparation_dec_hrs
----

Written in Jupyter Notebook, Python

By Nicole Lund 

This workbook prepares 2017 flight performance data for modeling.

In [11]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
import math
import datetime
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib
# from google.colab import files

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [3]:
# Set the seed value for the notebook, so the results are reproducible
# NOTE: this seeds only NumPy's global random generator; the train/test split
# further down passes its own explicit random_state.
from numpy.random import seed
seed(1)

# Read the clean CSV

In [16]:
# Import data
# Read the cleaned 2017_TUS.csv file from the project's GitHub repository
# (raw.githubusercontent.com) into a pandas DataFrame
url = "https://raw.githubusercontent.com/NicoleLund/flight_delay_prediction/main/data_manipulation_modeling/data_clean/2017_TUS.csv"
df = pd.read_csv(url)

# Preview the first three rows
df.head(3)

Unnamed: 0,origin_city_name,dest_city_name,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,CRS_DEP_HM,DEP_TIME_HM,WHEELS_OFF_HM,WHEELS_ON_HM,CRS_ARR_TIME_HM,ARR_TIME_HM,day_of_week,DELAY
0,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-01,DL,2066,TUS,ATL,1430,1459.0,29.0,...,0.0,14.0,14:30,14:59,15:09,20:11,20:03,20:28,Sunday,0
1,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,1127,TUS,ATL,600,637.0,37.0,...,0.0,0.0,06:00,06:37,06:47,11:57,11:29,12:03,Monday,1
2,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,2066,TUS,ATL,1430,1447.0,17.0,...,,,14:30,14:47,14:57,19:43,20:05,19:52,Monday,0


# Pre-processing

In [19]:
def time_conv(mil_time):
  """Convert numeric HHMM clock times into decimal hours, hours, and minutes.

  Each value is read as a 24-hour clock time written without zero padding
  (e.g. 5 -> 00:05, 130 -> 01:30, 1430 -> 14:30). The value 2400 is treated
  as midnight (00:00). NaN entries are passed through unchanged to all three
  outputs.

  The split into hours and minutes is done arithmetically. Parsing the
  unpadded digits with strptime('%H%M') is ambiguous for early-morning times
  (e.g. "130" is read as 13:00 rather than 01:30).

  Parameters
  ----------
  mil_time : iterable of numbers
      Clock times encoded as HHMM numbers; may contain NaN.

  Returns
  -------
  tuple of three lists
      (decimal hours, hour component, minute component), each the same
      length as the input.

  Raises
  ------
  ValueError
      If a value is negative, or its hour/minute part is out of range.
  """
  hours = []
  hh = []
  mm = []
  for time in mil_time:
    if math.isnan(time):
      # Keep missing values missing in every output column
      hours.append(time)
      hh.append(time)
      mm.append(time)
      continue

    # 2400 denotes midnight; fold it onto 00:00
    clock = 0 if time == 2400 else int(time)
    hour, minute = divmod(clock, 100)
    if clock < 0 or hour > 23 or minute > 59:
      raise ValueError('invalid HHMM time value: {!r}'.format(time))

    hours.append(hour + minute/60)
    hh.append(hour)
    mm.append(minute)
  return hours, hh, mm

In [20]:
# Scheduled departure: decimal hours plus separate hour/minute columns
dep_hours, dep_hh, dep_mm = time_conv(df['CRS_DEP_TIME'])
df['CRS_DEP_hours'] = dep_hours
df['CRS_DEP_HH'] = dep_hh
df['CRS_DEP_MM'] = dep_mm

# Scheduled arrival: same three derived columns
arr_hours, arr_hh, arr_mm = time_conv(df['CRS_ARR_TIME'])
df['CRS_ARR_hours'] = arr_hours
df['CRS_ARR_HH'] = arr_hh
df['CRS_ARR_MM'] = arr_mm

In [21]:
# Preview the first three rows, including the newly added time columns
df.head(3)

Unnamed: 0,origin_city_name,dest_city_name,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,CRS_ARR_TIME_HM,ARR_TIME_HM,day_of_week,DELAY,CRS_DEP_hours,CRS_DEP_HH,CRS_DEP_MM,CRS_ARR_hours,CRS_ARR_HH,CRS_ARR_MM
0,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-01,DL,2066,TUS,ATL,1430,1459.0,29.0,...,20:03,20:28,Sunday,0,14.5,14,30,20.05,20,3
1,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,1127,TUS,ATL,600,637.0,37.0,...,11:29,12:03,Monday,1,6.0,6,0,11.483333,11,29
2,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,2066,TUS,ATL,1430,1447.0,17.0,...,20:05,19:52,Monday,0,14.5,14,30,20.083333,20,5


In [22]:
# Split dataframe into X and y
# Define model variables

# Model input
X_categorical_df = df[['OP_CARRIER', 'day_of_week', 'DEST']]
# Cast via astype on the selection (returns a new frame) instead of assigning
# to an attribute of a slice, which raises SettingWithCopyWarning.
X_numeric_df = (
    df[['OP_CARRIER_FL_NUM', 'CRS_DEP_hours', 'CRS_ARR_hours', 'DISTANCE']]
    .astype({'DISTANCE': int})
)

# Model output
y_df = (
    df[['CANCELLED', 'DIVERTED', 'DELAY']]
    .astype({'CANCELLED': int, 'DIVERTED': int})
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [23]:
# Review model output
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
y_df.info()
y_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15406 entries, 0 to 15405
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   CANCELLED  15406 non-null  int32
 1   DIVERTED   15406 non-null  int32
 2   DELAY      15406 non-null  int64
dtypes: int32(2), int64(1)
memory usage: 240.8 KB
None


Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,1
2,0,0,0


In [24]:
# Review model numeric input
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
X_numeric_df.info()
X_numeric_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15406 entries, 0 to 15405
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OP_CARRIER_FL_NUM  15406 non-null  int64  
 1   CRS_DEP_hours      15406 non-null  float64
 2   CRS_ARR_hours      15406 non-null  float64
 3   DISTANCE           15406 non-null  int32  
dtypes: float64(2), int32(1), int64(1)
memory usage: 421.4 KB
None


Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_hours,CRS_ARR_hours,DISTANCE
0,2066,14.5,20.05,1541
1,1127,6.0,11.483333,1541
2,2066,14.5,20.083333,1541


In [25]:
# Review model categorical input
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
X_categorical_df.info()
X_categorical_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15406 entries, 0 to 15405
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OP_CARRIER   15406 non-null  object
 1   day_of_week  15406 non-null  object
 2   DEST         15406 non-null  object
dtypes: object(3)
memory usage: 361.2+ KB
None


Unnamed: 0,OP_CARRIER,day_of_week,DEST
0,DL,Sunday,ATL
1,DL,Monday,ATL
2,DL,Monday,ATL


In [26]:
# Collect the distinct values of each categorical column, in order of first
# appearance, for use as the one-hot encoder's category lists
carriers = X_categorical_df['OP_CARRIER'].unique()
day = X_categorical_df['day_of_week'].unique()
destination = X_categorical_df['DEST'].unique()
print(carriers, day, destination, sep='\n')

['DL' 'OO' 'UA' 'WN' 'AA' 'EV' 'AS']
['Sunday' 'Monday' 'Tuesday' 'Wednesday' 'Thursday' 'Friday' 'Saturday']
['ATL' 'DEN' 'DFW' 'HOU' 'IAH' 'JFK' 'LAS' 'LAX' 'MDW' 'MSP' 'OAK' 'ORD'
 'PDX' 'PHX' 'SAN' 'SEA' 'SFO' 'SJC' 'SLC']


In [27]:
# Build the one-hot column labels from the discovered categories instead of a
# hand-typed list, so the labels cannot drift from the data. The order
# (carriers, then days, then destinations) matches the order in which the
# category lists are passed to the OneHotEncoder.
column_names = [*carriers, *day, *destination]

In [28]:
# One-hot-encode the categorical text fields using the explicit category lists
encoder = OneHotEncoder(categories=[carriers, day, destination])
X_encoded = encoder.fit_transform(X_categorical_df)

# Show the encoded matrix as a dense array
X_encoded.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [29]:
# Wrap the dense one-hot matrix in a labelled DataFrame. Reuse the index of
# the frame that was encoded so the later index-based join with the numeric
# features lines up row-for-row even if the source index is not 0..n-1.
X_encoded_df = pd.DataFrame(
    X_encoded.toarray(),
    columns=column_names,
    index=X_categorical_df.index,
)
X_encoded_df.head(3)

Unnamed: 0,DL,OO,UA,WN,AA,EV,AS,Sunday,Monday,Tuesday,...,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Combine numeric features with the one-hot columns (joined on the row index)
X_df = X_numeric_df.join(X_encoded_df)
X_df.head(3)

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_hours,CRS_ARR_hours,DISTANCE,DL,OO,UA,WN,AA,EV,...,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
0,2066,14.5,20.05,1541,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1127,6.0,11.483333,1541,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2066,14.5,20.083333,1541,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Partition features and targets into training and testing sets:
# 30% of the rows go to the test set, with a fixed random_state for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.3,
    random_state=42,
)

In [32]:
# Display the first rows of the training features (head() defaults to five rows)
X_train.head()

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_hours,CRS_ARR_hours,DISTANCE,DL,OO,UA,WN,AA,EV,...,MSP,OAK,ORD,PDX,PHX,SAN,SEA,SFO,SJC,SLC
10939,5538,11.333333,16.783333,1437,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10779,5538,12.316667,16.833333,1437,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10092,2068,16.416667,20.666667,1440,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8774,1251,5.083333,6.583333,451,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14968,4769,11.5,14.35,601,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
# Scale the data with MinMaxScaler: the scaler is fitted on the training
# features only and then applied unchanged to the test features
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the output to csv files

In [107]:
# Write each (unscaled) split to its own CSV file, without the index column
splits_to_save = {
    '2017_TUS_X_train_dec_hrs.csv': X_train,
    '2017_TUS_y_train_dec_hrs.csv': y_train,
    '2017_TUS_X_test_dec_hrs.csv': X_test,
    '2017_TUS_y_test_dec_hrs.csv': y_test,
}
for csv_name, split_df in splits_to_save.items():
    split_df.to_csv(csv_name, index=False)
    # files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>