In [19]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dropout, Dense
from sklearn.metrics import f1_score
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D

In [20]:
data = pd.read_csv("../On_Time_Marketing_Carrier_On_Time_Performance_2022_12_final.csv")

In [21]:
data.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,Origin,OriginCityName,...,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings
0,2022,12,19,1,12/19/2022,DL,19790,4628,BDL,"Hartford, CT",...,1,,,,,,,,,0
1,2022,12,20,2,12/20/2022,DL,19790,4628,BDL,"Hartford, CT",...,1,,,,,,,,,0
2,2022,12,21,3,12/21/2022,DL,19790,4628,BDL,"Hartford, CT",...,1,,,,,,,,,0
3,2022,12,22,4,12/22/2022,DL,19790,4628,BDL,"Hartford, CT",...,1,,,,,,,,,0
4,2022,12,23,5,12/23/2022,DL,19790,4628,BDL,"Hartford, CT",...,1,,,,,,,,,0


In [22]:
data.shape

(576827, 48)

In [23]:
data['FlightDate'] = pd.to_datetime(data['FlightDate'])

In [24]:
data["DelayGroup"] = None
data.loc[data["DepDelayMinutes"] == 0, "DelayGroup"] = "OnTime_Early"
data.loc[
    (data["DepDelayMinutes"] > 0) & (data["DepDelayMinutes"] <= 15), "DelayGroup"
] = "Small_Delay"
data.loc[
    (data["DepDelayMinutes"] > 15) & (data["DepDelayMinutes"] <= 45), "DelayGroup"
] = "Medium_Delay"
data.loc[data["DepDelayMinutes"] > 45, "DelayGroup"] = "Large_Delay"
data.loc[data["Cancelled"], "DelayGroup"] = "Cancelled"

In [25]:
train = data[data['FlightDate']<='2022-12-25']
test = data[data['FlightDate']>'2022-12-25']

In [26]:
train.loc[train['DelayGroup'].isnull()==True,'DelayGroup'] = 'OnTime_Early'

In [27]:
test.loc[test['DelayGroup'].isnull()==True,'DelayGroup'] = 'OnTime_Early'

In [28]:
train.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Marketing_Airline_Network', 'DOT_ID_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Origin', 'OriginCityName',
       'OriginStateName', 'Dest', 'DestCityName', 'DestStateName',
       'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes', 'DepDel15',
       'DepartureDelayGroups', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn',
       'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes', 'ArrDel15',
       'ArrivalDelayGroups', 'Cancelled', 'CancellationCode', 'Diverted',
       'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance',
       'DistanceGroup', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
       'SecurityDelay', 'LateAircraftDelay', 'FirstDepTime', 'TotalAddGTime',
       'LongestAddGTime', 'DivAirportLandings', 'DelayGroup'],
      dtype='object')

In [29]:
train[['DayOfWeek','Marketing_Airline_Network','Origin','Dest','CRSDepTime','TaxiOut','WheelsOff', 'WheelsOn', 'TaxiIn','CRSArrTime','Diverted',
       'CRSElapsedTime','AirTime', 'Flights', 'Distance']].isnull().sum()

DayOfWeek                        0
Marketing_Airline_Network        0
Origin                           0
Dest                             0
CRSDepTime                       0
TaxiOut                      18178
WheelsOff                    18178
WheelsOn                     18178
TaxiIn                       18178
CRSArrTime                       0
Diverted                         0
CRSElapsedTime                   0
AirTime                      18178
Flights                          0
Distance                         0
dtype: int64

In [42]:
train_data = train[['DayOfWeek','CRSDepTime','TaxiOut','WheelsOff', 'WheelsOn', 'TaxiIn','CRSArrTime','Diverted',
       'CRSElapsedTime','AirTime', 'Flights', 'Distance','DelayGroup']]

In [31]:
train_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.dropna(inplace=True)


In [32]:
train_data.head()

Unnamed: 0,DayOfWeek,Marketing_Airline_Network,Origin,Dest,CRSDepTime,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,Diverted,CRSElapsedTime,AirTime,Flights,Distance,DelayGroup
0,1,DL,BDL,LGA,523,16.0,534.0,559.0,25.0,630,0,67,25.0,1,101,Cancelled
1,2,DL,BDL,LGA,523,11.0,529.0,557.0,18.0,630,0,67,28.0,1,101,Cancelled
2,3,DL,BDL,LGA,523,15.0,536.0,606.0,10.0,630,0,67,30.0,1,101,OnTime_Early
3,4,DL,BDL,LGA,523,14.0,532.0,603.0,8.0,630,0,67,31.0,1,101,OnTime_Early
11,4,DL,JFK,MSP,735,56.0,924.0,1113.0,7.0,1014,0,219,169.0,1,1029,Large_Delay


In [33]:
X_train = train_data[['DayOfWeek','CRSDepTime','TaxiOut','WheelsOff', 'WheelsOn', 'TaxiIn','CRSArrTime','Diverted',
       'CRSElapsedTime','AirTime', 'Flights', 'Distance']]
Y_train = train_data['DelayGroup']

In [34]:
test_data = test[['DayOfWeek','CRSDepTime','TaxiOut','WheelsOff', 'WheelsOn', 'TaxiIn','CRSArrTime','Diverted',
       'CRSElapsedTime','AirTime', 'Flights', 'Distance','DelayGroup']]
test_data.dropna(inplace=True)
X_test = test_data[['DayOfWeek','CRSDepTime','TaxiOut','WheelsOff', 'WheelsOn', 'TaxiIn','CRSArrTime','Diverted',
       'CRSElapsedTime','AirTime', 'Flights', 'Distance']]
Y_test = test_data['DelayGroup']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.dropna(inplace=True)


In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(Y_train)
y_train_enc = le.transform(Y_train)
y_test_enc = le.transform(Y_test)

In [36]:
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

In [38]:
model = Sequential()

model.add(Convolution1D(64, 10, padding='valid', activation='relu',input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=3))

# model.add(Convolution1D(128, 10, padding='valid', activation='relu'))
# model.add(MaxPooling1D(pool_size=3))


model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512, activation='relu')) 
 
model.add(Dense(1, activation='sigmoid'))

In [39]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Fit data to model
model_fit = model.fit(X_train, y_train_enc,
            batch_size=32,
            epochs=8,
            verbose=1)

Epoch 1/8


2023-04-24 11:50:02.591009: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [40]:
accuracy = model.evaluate(X_test, y_test_enc, verbose=0)
print('Accuracy is:' + str(accuracy[1] * 100)+'%')
y_pred = model.predict(X_test, verbose=0)

Accuracy is:15.5641108751297%


In [45]:
train_data.dropna(how='any',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.dropna(how='any',inplace=True)


In [47]:
test_data.dropna(how='any',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.dropna(how='any',inplace=True)


In [48]:
train_data.to_csv("train_data.csv",index=None)

In [49]:
test_data.to_csv("test_data.csv",index=None)