### Load Data & Preprocessing

In [None]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, model_selection, metrics, neural_network, preprocessing
import warnings
warnings.filterwarnings('ignore')

# Locations of the three input CSV files, relative to the notebook.
path_airlines = "./datasets/airlines.csv"
path_airport = "./datasets/airports.csv"
path_flights = "./datasets/flights.csv"

# Read every table with low_memory=False so pandas parses each file in one pass
# instead of inferring column dtypes chunk by chunk.
df_airlines, df_airport, df_flights = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (path_airlines, path_airport, path_flights)
)

In [None]:
# Per-cause delay columns of the flights table.
delay_details = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]
# Columns that are one-hot encoded before modelling.
columns_categorical = [
    'MONTH',
    'DAY_OF_WEEK',
    'AIRLINE',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
]
# Columns used as plain numbers; ARRIVAL_DELAY is the prediction target.
columns_numerical = [
    'SCHEDULED_DEPARTURE',
    'DEPARTURE_DELAY',
    'SCHEDULED_TIME',
    'ARRIVAL_DELAY',
]


Given the information available before the flight, we want to predict (1) whether the flight is delayed at arrival (by more than 15 minutes) and (2) how long the delay is.

### Classification
In this part, we use the following flight information to predict whether a flight is delayed at arrival:
'MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'SCHEDULED_TIME'.
A flight counts as delayed when 'ARRIVAL_DELAY' > 15 (minutes).

Here, we cannot include the `delay details` columns; otherwise even the baseline logistic model easily reaches an accuracy of 1.00.

In [None]:
# Work on a random subset of the flights table to keep model fitting tractable.
# The draw is seeded (same random_state=0 as the train/test split and the resampling
# further down) so that a fresh kernel reproduces the same rows, and the sample size is
# capped so the cell also runs on a table with fewer than 100000 rows.
df_flights_cal = df_flights.sample(min(100000, len(df_flights)), random_state=0)

# A missing per-cause delay value is replaced by 0 before the missing-value filter below.
df_flights_cal[delay_details] = df_flights_cal[delay_details].fillna(0)
# Drop every column with >=10% missing values, then every row that still has a missing value.
missing_columns = df_flights_cal.loc[:, df_flights_cal.isna().mean() >= 0.1].columns
df_flights_cal = df_flights_cal.drop(missing_columns, axis=1).dropna()

# Discard cancelled or diverted flights. The explicit copy makes the frame independent
# of the filtered view so the .loc assignments below modify it directly.
is_completed = (df_flights_cal['CANCELLED'] != 1) & (df_flights_cal['DIVERTED'] != 1)
df_flights_cal = df_flights_cal[is_completed].copy()

# transform df_airlines to dict
dict_airlines = df_airlines.set_index('IATA_CODE')['AIRLINE'].to_dict()

# Airline / origin / destination codes that do not appear in the IATA lookup tables
# are collapsed into a single 'OTHERS' category.
known_codes_by_column = {
    'AIRLINE': df_airlines['IATA_CODE'].values,
    'ORIGIN_AIRPORT': df_airport['IATA_CODE'].values,
    'DESTINATION_AIRPORT': df_airport['IATA_CODE'].values,
}
for code_column, known_codes in known_codes_by_column.items():
    df_flights_cal.loc[~df_flights_cal[code_column].isin(known_codes), code_column] = 'OTHERS'
# transform HHMM into minutes.
def mintues(formatted_time):
    """Convert a clock time encoded as an HHMM number into minutes since midnight.

    Parameters
    ----------
    formatted_time : int or float
        Time of day written as HHMM (e.g. 1430 for 14:30). The value 2400 is
        treated as 0000.

    Returns
    -------
    int
        ``hours * 60 + minutes``.
    """
    hhmm = 0 if formatted_time == 2400 else int(formatted_time)
    # Zero-pad to four digits so the hour and minute digits can be sliced by position.
    digits = "{0:04d}".format(hhmm)
    hours, minutes = int(digits[:2]), int(digits[2:4])
    return hours * 60 + minutes
# Convert the HHMM-encoded schedule columns into minutes since midnight.
for time_column in ('SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'):
    df_flights_cal[time_column] = df_flights_cal[time_column].apply(mintues)

# Keep only the modelling columns (see eda.ipynb for the reason)
df_flights_cal_keep = df_flights_cal[columns_categorical + columns_numerical].copy()

# Binary target: 1 when the flight arrives more than 15 minutes late, otherwise 0.
df_flights_cal_keep['IS_DELAY'] = (df_flights_cal_keep['ARRIVAL_DELAY'] > 15).astype(int)

# Correlate the numeric columns only. AIRLINE and the airport codes hold strings, and
# DataFrame.corr() no longer drops non-numeric columns silently on pandas >= 2.0.
sns.heatmap(df_flights_cal_keep.select_dtypes(include='number').corr())

# convert categorical to onehot
df_flights_cal_dummies = pd.get_dummies(df_flights_cal_keep[columns_categorical].astype(str))
df_flights_cal_keep = df_flights_cal_keep.drop(columns_categorical, axis=1)
df_flights_cal_keep = pd.concat([df_flights_cal_keep, df_flights_cal_dummies], axis=1)
df_flights_cal_keep.shape

In [None]:
# Class balance of the binary target: number of rows for each IS_DELAY value.
df_flights_cal_keep['IS_DELAY'].value_counts()

The dataset is unbalanced. Below, we create the training dataset and balance it by resampling. We keep the test dataset untouched, since in practice we wouldn't have access to the target variable to perform resampling. 

In [None]:
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler

# Features are every column except the regression target and the label derived from it.
X_imbalanced = df_flights_cal_keep.drop(['ARRIVAL_DELAY', 'IS_DELAY'], axis=1)
X_imbalanced_columns = X_imbalanced.columns
y_imbalanced = df_flights_cal_keep['IS_DELAY']

# Split BEFORE scaling: fitting the scaler on all rows would let the min/max of the
# held-out test rows leak into the training features.
X_train_raw, X_test_raw, y_train_imbalanced, y_test = model_selection.train_test_split(
    X_imbalanced, y_imbalanced, test_size=0.2, random_state=0)

scaler = MinMaxScaler()
X_train_imbalanced = scaler.fit_transform(X_train_raw)
# The test set is only transformed, and keeps the column names so that it matches the
# DataFrame the models are fitted on.
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X_imbalanced_columns)

# Oversample the delayed class (label 1) with replacement until it has as many rows
# as the not-delayed class (label 0). Only the training split is resampled.
delayed_mask = (y_train_imbalanced == 1).to_numpy()
on_time_mask = (y_train_imbalanced == 0).to_numpy()
X_train_oversampled, y_train_oversampled = resample(
    X_train_imbalanced[delayed_mask], y_train_imbalanced[delayed_mask],
    replace=True, n_samples=X_train_imbalanced[on_time_mask].shape[0], random_state=0)
X_train = np.vstack((X_train_imbalanced[on_time_mask], X_train_oversampled))
y_train = np.hstack((y_train_imbalanced[on_time_mask], y_train_oversampled))
X_train = pd.DataFrame(X_train, columns=X_imbalanced_columns)


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with scikit-learn's default settings,
# fitted on the balanced training set and evaluated on the untouched test set.
logit = LogisticRegression()
logit.fit(X_train, y_train)
y_pred = logit.predict(X_test)
print(
    metrics.classification_report(
        y_test,
        y_pred,
        target_names=["NOT DELAYED", "DELAYED"],
    )
)
# Keep the precision-recall display so it can be re-plotted in the comparison figure.
disp_logit = metrics.PrecisionRecallDisplay.from_estimator(
    logit, X_test, y_test, name="Logistic Regression"
)


In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# The tree builder is randomised, so the seed is fixed (same random_state=0 as the
# train/test split) to make the reported metrics reproducible across runs.
dt = DecisionTreeClassifier(max_depth=100, random_state=0).fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=["NOT DELAYED", "DELAYED"]))
# Keep the precision-recall display so it can be re-plotted in the comparison figure.
disp_dt = metrics.PrecisionRecallDisplay.from_estimator(dt, X_test, y_test, name="Decision Tree")

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Bootstrapping and feature sub-sampling are random, so the seed is fixed (same
# random_state=0 as the train/test split) to make the reported metrics reproducible.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=["NOT DELAYED", "DELAYED"]))
# Keep the precision-recall display so it can be re-plotted in the comparison figure.
disp_rf = metrics.PrecisionRecallDisplay.from_estimator(rf, X_test, y_test, name="Random Forest")

In [None]:
# MLP
from sklearn.neural_network import MLPClassifier
# Weight initialisation and batch shuffling are random, so the seed is fixed (same
# random_state=0 as the train/test split) to make the reported metrics reproducible.
mlp = MLPClassifier(random_state=0).fit(X_train, y_train)
y_pred = mlp.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=["NOT DELAYED", "DELAYED"]))
# Keep the precision-recall display of the MLP alongside those of the other classifiers.
disp_mlp = metrics.PrecisionRecallDisplay.from_estimator(mlp, X_test, y_test, name="MLP classifier")

In [None]:
# Overlay the precision-recall curves of the fitted classifiers on one set of axes.
fig, ax = plt.subplots()
disp_rf.plot(ax=ax, label='RandomForest')
disp_logit.plot(ax=ax, label='Logistic Regression')
disp_dt.plot(ax=ax, label='Decision Tree')
# The MLP display was computed like the others, so it belongs in the comparison too.
disp_mlp.plot(ax=ax, label='MLP classifier')
ax.set_title('Precision-recall curves on the test set');


We find that logistic regression is the best algorithm. Then, we analyze the effect of airport and airline. 

In [None]:
# Map every input feature name to its fitted logistic-regression coefficient.
feature_names = logit.feature_names_in_.tolist()
feature_coefficients = logit.coef_[0].tolist()
logit_feature_dict = {
    name: coefficient for name, coefficient in zip(feature_names, feature_coefficients)
}

In [None]:
# (feature name, exp(coefficient)) pairs for every feature whose name contains
# 'AIRLINE', ordered from the smallest to the largest exp(coefficient).
airline_keys = sorted(
    (
        (feature, np.exp(coefficient))
        for feature, coefficient in logit_feature_dict.items()
        if 'AIRLINE' in feature
    ),
    key=lambda pair: pair[1],
)

In [None]:
# Display the (feature name, exp(coefficient)) pairs of the airline features, sorted in ascending order of exp(coefficient).
airline_keys

In [None]:
# (feature name, exp(coefficient)) pairs for every feature whose name contains
# 'ORIGIN_AIRPORT', ordered from the smallest to the largest exp(coefficient).
origin_airport_keys = sorted(
    (
        (feature, np.exp(coefficient))
        for feature, coefficient in logit_feature_dict.items()
        if 'ORIGIN_AIRPORT' in feature
    ),
    key=lambda pair: pair[1],
)

In [None]:
# The ten origin-airport features with the largest exp(coefficient) (the list is sorted in ascending order).
origin_airport_keys[-10:]

In [None]:
# Display the full mapping from feature name to logistic-regression coefficient.
logit_feature_dict

### Regression
In this part, we predict the exact value of the flight delay from the flight information, i.e.
'MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'SCHEDULED_TIME',
as well as the flight `delay details`, i.e. 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'.

In [None]:
# Work on a random subset of the flights table to keep model fitting tractable.
# The draw is seeded (same random_state=0 as the train/test splits in this notebook)
# so that a fresh kernel reproduces the same rows, and the sample size is capped so
# the cell also runs on a table with fewer than 100000 rows.
df_flights_reg = df_flights.sample(min(100000, len(df_flights)), random_state=0)

# A missing per-cause delay value is replaced by 0 before the missing-value filter below.
df_flights_reg[delay_details] = df_flights_reg[delay_details].fillna(0)
# Drop every column with >=10% missing values, then every row that still has a missing value.
missing_columns = df_flights_reg.loc[:, df_flights_reg.isna().mean() >= 0.1].columns
df_flights_reg = df_flights_reg.drop(missing_columns, axis=1).dropna()
# Discard cancelled or diverted flights. The explicit copy makes the frame independent
# of the filtered view so the .loc assignments below modify it directly.
is_completed = (df_flights_reg['CANCELLED'] != 1) & (df_flights_reg['DIVERTED'] != 1)
df_flights_reg = df_flights_reg[is_completed].copy()

# transform df_airlines to dict
dict_airlines = df_airlines.set_index('IATA_CODE')['AIRLINE'].to_dict()

# Airline / origin / destination codes that do not appear in the IATA lookup tables
# are collapsed into a single 'OTHERS' category.
known_codes_by_column = {
    'AIRLINE': df_airlines['IATA_CODE'].values,
    'ORIGIN_AIRPORT': df_airport['IATA_CODE'].values,
    'DESTINATION_AIRPORT': df_airport['IATA_CODE'].values,
}
for code_column, known_codes in known_codes_by_column.items():
    df_flights_reg.loc[~df_flights_reg[code_column].isin(known_codes), code_column] = 'OTHERS'

# transform HHMM into minutes.
def mintues(formatted_time):
    """Return the number of minutes since midnight for an HHMM-encoded time.

    Parameters
    ----------
    formatted_time : int or float
        Time of day written as HHMM (e.g. 1430 for 14:30). The value 2400 is
        treated as 0000.

    Returns
    -------
    int
        ``hours * 60 + minutes``.
    """
    hhmm = 0 if formatted_time == 2400 else int(formatted_time)
    # Zero-pad to four digits so the hour and minute digits can be sliced by position.
    digits = "{0:04d}".format(hhmm)
    hours, minutes = int(digits[:2]), int(digits[2:4])
    return hours * 60 + minutes
# Convert the HHMM-encoded schedule columns into minutes since midnight.
for time_column in ('SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'):
    df_flights_reg[time_column] = df_flights_reg[time_column].apply(mintues)

# Keep only the modelling columns (see eda.ipynb for the reason); unlike the
# classification frame, this selection also includes the per-cause delay columns.
regression_columns = columns_categorical + columns_numerical + delay_details
df_flights_reg_keep = df_flights_reg[regression_columns].copy()

# One-hot encode the categorical columns (cast to str first so that the integer-coded
# MONTH and DAY_OF_WEEK are encoded as categories too) and swap them in for the originals.
df_flights_reg_dummies = pd.get_dummies(df_flights_reg_keep[columns_categorical].astype(str))
df_flights_reg_keep = pd.concat(
    [df_flights_reg_keep.drop(columns_categorical, axis=1), df_flights_reg_dummies],
    axis=1,
)
df_flights_reg_keep.shape

In [None]:
# prepare dataset and test variable
def all_test_regression(model, X_test, y_test):
    """Print the R^2 score and the root-mean-square deviation of a fitted regressor.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict(X)`` and ``score(X, y)``.
    X_test : array-like
        Held-out feature rows.
    y_test : array-like
        True target values for ``X_test``.

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    pred = np.asarray(model.predict(X_test))
    print("R^2: ", model.score(X_test, y_test))
    # The RMSD is computed directly with numpy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and removed in recent releases.
    # np.asarray drops any pandas index so the subtraction is positional.
    residuals = np.asarray(y_test) - pred
    print("RMSD: ", float(np.sqrt(np.mean(np.square(residuals)))))


# Build the regression design matrix from the REGRESSION frame. The classification
# frame (df_flights_cal_keep) lacks the delay-detail columns and carries IS_DELAY,
# which is derived from ARRIVAL_DELAY and would leak the target into the features.
X = df_flights_reg_keep.drop(['ARRIVAL_DELAY'], axis=1)
Y = df_flights_reg_keep['ARRIVAL_DELAY']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=0)

#### Linear Model

In [None]:
from sklearn import linear_model

# Ordinary least-squares baseline, fitted on the training split and scored on the test split.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
all_test_regression(lm, X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# The tree builder is randomised, so the seed is fixed (same random_state=0 as the
# train/test split) to make the reported scores reproducible across runs.
dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
all_test_regression(dt, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Bootstrapping and feature sub-sampling are random, so the seed is fixed (same
# random_state=0 as the train/test split) to make the reported scores reproducible.
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
all_test_regression(rf, X_test, y_test)