In [1]:
import pandas as pd

flight_df = pd.read_csv("flights_historical/train_1.csv")

# reformat departure_time
flight_df['departure_time'] = flight_df['departure_time'].apply(str).apply(lambda x: '0'+x if len(x) != 4 else x)

# reformat the dates
flight_df['month'] = flight_df['month'].apply(lambda x: '0'+x[2:] if len(x) != 4 else x[2:])
flight_df['day_of_month'] = flight_df['day_of_month'].apply(lambda x: '0'+x[2:] if len(x) != 4 else x[2:])
flight_df['day_of_week'] = flight_df['day_of_week'].apply(lambda x: x[2:])
flight_df['is_delayed'] = flight_df['is_delayed'].astype('category').cat.codes

flight_df[['departure_time', 'month', 'day_of_month', 'day_of_week']] = flight_df[['departure_time', 'month', 'day_of_month', 'day_of_week']].apply(pd.to_numeric)

flight_df

Unnamed: 0,unique_carrier,origin,destination,is_delayed,departure_time,month,day_of_month,day_of_week,distance
0,RQ,ULX,TVQ,0,1143,12,16,5,1587
1,C6,URX,DIE,0,2356,3,22,3,2454
2,WT,HSR,VWP,1,2012,1,14,5,1437
3,NB,UGR,CNY,0,1626,10,14,6,328
4,WT,HTE,RJB,0,1112,5,18,3,957
...,...,...,...,...,...,...,...,...,...
89995,WP,ULQ,MQV,0,2056,6,20,1,448
89996,OZ,FSS,QFT,0,630,12,16,6,618
89997,AG,FPH,MQV,0,1525,1,31,2,83
89998,JG,TVQ,TCA,0,1310,12,6,2,146


In [2]:
# one-hot encoding/dummy variables for categorical data
dummy_df = flight_df.copy()
dummy_df = pd.get_dummies(dummy_df, columns=['unique_carrier', 'origin', 'destination'], drop_first=True)
dummy_df = dummy_df.rename(columns = {'is_delayed': 'target'})

numeric_cols = ['departure_time', 'month', 'day_of_month', 'day_of_week', 'distance']
category_cols = list(set(dummy_df.columns) - set(numeric_cols) - {'target'})
category_cols.sort()

dummy_df

Unnamed: 0,target,departure_time,month,day_of_month,day_of_week,distance,unique_carrier_BN,unique_carrier_C6,unique_carrier_HH,unique_carrier_HL,...,destination_YMV,destination_YPH,destination_YPU,destination_YQF,destination_YQO,destination_YRT,destination_YTL,destination_YTR,destination_YUQ,destination_YVH
0,0,1143,12,16,5,1587,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2356,3,22,3,2454,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2012,1,14,5,1437,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1626,10,14,6,328,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1112,5,18,3,957,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,0,2056,6,20,1,448,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89996,0,630,12,16,6,618,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89997,0,1525,1,31,2,83,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89998,0,1310,12,6,2,146,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# train-test split
from sklearn.model_selection import train_test_split

random_seed = 1147

train_df, test_df = train_test_split(dummy_df,test_size=0.20,random_state=random_seed, stratify=dummy_df['target'])

train_df

Unnamed: 0,target,departure_time,month,day_of_month,day_of_week,distance,unique_carrier_BN,unique_carrier_C6,unique_carrier_HH,unique_carrier_HL,...,destination_YMV,destination_YPH,destination_YPU,destination_YQF,destination_YQO,destination_YRT,destination_YTL,destination_YTR,destination_YUQ,destination_YVH
7439,0,2118,2,14,1,139,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5518,1,1925,5,12,4,1123,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13990,0,700,7,11,1,239,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64850,0,1252,2,21,1,460,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53230,0,937,9,1,5,525,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37756,0,835,4,17,1,954,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46193,0,536,8,8,1,453,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1423,0,1537,7,29,6,340,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
30221,0,747,2,15,3,496,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# scaling numeric columns

from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()
scaler.fit(train_df[numeric_cols])

def get_features_and_target_arrays(df, numeric_cols, cat_cols, scaler):
    X_numeric_scaled = scaler.transform(df[numeric_cols])
    X_categorical = df[cat_cols].to_numpy()
    X = np.hstack((X_categorical, X_numeric_scaled))
    y = df['target']
    return X, y

X_train, y_train = get_features_and_target_arrays(train_df, numeric_cols, category_cols, scaler)
X_test, y_test = get_features_and_target_arrays(test_df, numeric_cols, category_cols, scaler)

In [5]:
X_train.shape

(72000, 602)

In [6]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

model = Sequential([
  Dense(100, activation='tanh', input_shape=(602,)),
  Dense(80, activation='relu'),
  Dense(60, activation='tanh'),
#   Dense(70, activation='relu'),
#   Dense(60, activation='tanh'),
  Dense(40, activation='relu'),
  Dense(30, activation='tanh'),
  Dense(20, activation='relu'),
  Dense(10, activation='tanh'),
  Dense(1, activation='sigmoid'),
])

# model.summary()

model.compile(
  optimizer=Adam(lr=0.1),
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

model.fit(
  X_train,
  y_train,
  epochs=20,
  batch_size=10,
  validation_split=0.1
)

y_pred_nn = model.predict(X_test)

model.evaluate(
  X_test,
  y_test
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.48801469802856445, 0.8095555305480957]

In [7]:
y_pred_nn[1]

array([0.17211014], dtype=float32)

In [8]:
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix, confusion_matrix

def get_performance_metrics(y_test, y_pred, title):
    print('Performance metrics for', title)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred)
    
    print('Accuracy = {:.5f}'.format(acc))
    print('Precision = {:.5f}'.format(precision))
    print('Recall = {:.5f}'.format(recall))
    print('F1 score = {:.5f}'.format(f1score))
    print()
          
    return [acc, precision, recall, f1score]

new_preds = np.zeros(y_pred_nn.shape)
i = 0

for y in y_pred_nn:
    if y > 0.5:
        new_preds[i] = 1
    else:
        new_preds[i] = 0
    i += 1
    
nn_perf = get_performance_metrics(y_test, new_preds, 'Simple Neural Network')

Performance metrics for Simple Neural Network
Accuracy = 0.80956
Precision = 0.00000
Recall = 0.00000
F1 score = 0.00000



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
for x in new_preds:
    if x == 1:
        print("ook")