In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# through ordinary filesystem paths (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score,precision_score,recall_score
from sklearn.preprocessing import LabelEncoder

#warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides library warnings
# (for example solver-convergence or deprecation messages) - confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Loading Dataset
# Read the combined transport CSV from the mounted Drive folder.
transport_csv_path = "/content/drive/MyDrive/data/Transport_5_years/combined_transport_Sep.csv"
transport_df = pd.read_csv(transport_csv_path)

In [None]:
# Number of rows in the raw frame
transport_df.shape[0]

2594209

In [None]:
# Dropping columns which are irrelevant / Directly relate to delay
# Carrier, aircraft and airport identifiers that are not used as features.
identifier_columns = [
    'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'TAIL_NUM',
    'ORIGIN_CITY_MARKET_ID', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID',
    'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
]
# Cancellation code and per-cause delay columns; each label is listed exactly
# once (the list previously named 'NAS_DELAY' twice).
delay_cause_columns = [
    'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
    'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
]
data = transport_df.drop(columns=identifier_columns + delay_cause_columns)
# Use the flight number as the index for the cleaning steps that follow.
data = data.set_index('OP_CARRIER_FL_NUM')

In [None]:
# Count missing values per column and display the result
nulls = data.isnull().sum()
nulls

YEAR                       0
MONTH                      0
DAY_OF_MONTH               0
DAY_OF_WEEK                0
FL_DATE                    0
ORIGIN                     0
ORIGIN_CITY_NAME           0
ORIGIN_STATE_ABR           0
ORIGIN_STATE_NM            0
DEST_CITY_MARKET_ID        0
DEST                       0
DEST_CITY_NAME             0
DEST_STATE_ABR             0
DEST_STATE_NM              0
DEP_TIME               33245
DEP_DELAY              33249
DEP_DELAY_NEW          33249
DEP_DEL15              33249
ARR_TIME               34752
ARR_DELAY              39409
ARR_DELAY_NEW          39409
ARR_DEL15              39409
CANCELLED                  0
DIVERTED                   0
DISTANCE                   0
dtype: int64

In [None]:
# Dropping Rows with Null Values
# Keep only rows where every remaining column is populated, then show the row count.
data = data.dropna()
data.shape[0]

2554800

In [None]:
# Helper function to create Time-Blocks
def arr_time(x):
  """Map a numeric clock value to a coarse time-of-day label.

  Parameters
  ----------
  x : int or float
      Clock value compared against the thresholds 0, 600, 1200, 1600 and
      2000 (presumably an HHMM-encoded time such as 1430 - confirm
      against the dataset documentation).

  Returns
  -------
  str or None
      'MIDNIGHT' for 0 <= x < 600, 'MORNING' for 600 <= x < 1200,
      'AFTERNOON' for 1200 <= x < 1600, 'EVENING' for 1600 <= x < 2000,
      'NIGHT' for x >= 2000, and None for negative or NaN input.
  """
  # Guards run from the latest block to the earliest, so each test only
  # needs a lower bound.
  if x >= 2000:
    # No upper bound here: the former condition `x >= 2000 and x < 0` was
    # unsatisfiable, which made every late-evening time return None.
    return 'NIGHT'
  if x >= 1600:
    return 'EVENING'
  if x >= 1200:
    return 'AFTERNOON'
  if x >= 600:
    return 'MORNING'
  if x >= 0:
    return 'MIDNIGHT'
  # Negative values and NaN (all comparisons False) match no block.
  return None

# Cast each clock column to int, bucket it into a time-of-day label with
# arr_time, then discard the raw clock columns.
for clock_col in ('ARR_TIME', 'DEP_TIME'):
  data[clock_col] = data[clock_col].astype('int')
  data[clock_col + '_BLOCK'] = data[clock_col].apply(arr_time)
data = data.drop(columns=['ARR_TIME', 'DEP_TIME'])
# Move the current index back into an ordinary column.
data = data.reset_index()

In [None]:
# Label Encoding
# Fit a single encoder on the union of origin and destination codes so the
# same airport receives the same integer in both columns.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([data['DEST'], data['ORIGIN']]))

data["ORIGIN"] = label_encoder.transform(data['ORIGIN'])
data["DEST"] = label_encoder.transform(data['DEST'])

# Route identifier. A plain difference of the two codes is ambiguous
# (codes (1, 2) and (5, 6) both give -1) and depends on the arbitrary
# ordering of the labels, so combine them as ORIGIN * n_airports + DEST,
# which is unique for every ordered (origin, destination) pair.
n_airports = len(label_encoder.classes_)
data["PATH"] = data["ORIGIN"] * n_airports + data["DEST"]

data = data.drop(columns=['DEP_DELAY_NEW', 'ORIGIN', 'DEST'])
# reset_index without drop=True stores the old index in a new 'index' column.
data = data.reset_index()

# Target: 1 when either the departure or the arrival 15-minute delay flag is set.
delayed = (data['DEP_DEL15'] == 1) | (data['ARR_DEL15'] == 1)
data["IsDelay"] = np.where(delayed, 1, 0)
# The flags are folded into the target, so they must not remain as features.
data = data.drop(columns=['DEP_DEL15', 'ARR_DEL15'])

In [None]:
# Remove the date, the leftover 'index' column and the textual / market
# descriptors of the origin and destination.
descriptive_columns = [
    'FL_DATE', 'index',
    'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_NM',
    'DEST_CITY_MARKET_ID', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM',
]
data = data.drop(columns=descriptive_columns)

In [None]:
# Replace each time-block label column with integer codes. The shared
# encoder is re-fitted per column, so it ends up fitted on DEP_TIME_BLOCK.
for block_col in ("ARR_TIME_BLOCK", "DEP_TIME_BLOCK"):
  data[block_col] = label_encoder.fit_transform(data[block_col])

In [None]:
# Drop the year, the raw delay measurements and the cancellation flag
# before building the feature matrix.
remaining_unused_columns = [
    'YEAR',
    'DEP_DELAY',
    'ARR_DELAY',
    'ARR_DELAY_NEW',
    'CANCELLED',
]
data = data.drop(columns=remaining_unused_columns)

In [None]:
# Creating Train-Test Split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Target is the IsDelay flag; features are every other column.
Y = data["IsDelay"]
X = data.drop("IsDelay", axis=1)

# Default split proportions with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [None]:
# Normalization
# Learn the min/max range on the training features only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Fit with default hyper-parameters on the scaled training data.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Mean accuracy on the held-out split (displayed as the cell output).
model.score(X_test_scaled, y_test)

0.8225771097541882

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Best params determined from Grid Search
# (the untuned alternative tried before was n_estimators=100 with defaults)
rf_params = {
    'max_depth': 20,
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 200,
}
rf_model = RandomForestClassifier(random_state=0, **rf_params)
rf_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out split.
y_pred = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print("Accuracy of Random Forest model:", accuracy)
print("Classification Report:\n", rf_report)

Accuracy of Random Forest model: 0.8309879442617818
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.99      0.91    525380
           1       0.71      0.08      0.14    113320

    accuracy                           0.83    638700
   macro avg       0.77      0.54      0.52    638700
weighted avg       0.81      0.83      0.77    638700



In [None]:
# XG-Boost

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Gradient-boosted trees with the library's default hyper-parameters.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_scaled, y_train)

y_pred = xgb_classifier.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# Label the metric with the model actually evaluated here (the message
# previously said "Random Forest", copied from the preceding cell).
print("Accuracy of XGBoost model:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy of Random Forest model: 0.8287427587286677
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.99      0.90    525380
           1       0.65      0.08      0.14    113320

    accuracy                           0.83    638700
   macro avg       0.74      0.53      0.52    638700
weighted avg       0.80      0.83      0.77    638700



In [None]:
# Support Vector Classifier

from sklearn.svm import SVC, LinearSVC  # SVC import kept for any cell that still refers to it
from sklearn.metrics import accuracy_score, classification_report

# SVC(kernel='linear') has a fit time that grows at least quadratically with
# the number of samples, which is impractical for the roughly 1.9 million
# training rows used here. LinearSVC is the linear SVM that scikit-learn
# recommends for large datasets. dual=False selects the primal solver, the
# preferred setting when there are more samples than features.
# NOTE(review): LinearSVC minimises the squared hinge loss by default, so its
# decision boundary can differ slightly from SVC(kernel='linear').
svm_model = LinearSVC(C=1.0, dual=False, random_state=0)

svm_model.fit(X_train_scaled, y_train)

y_pred_svm = svm_model.predict(X_test_scaled)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy of SVM model:", accuracy_svm)
print("Classification Report for SVM:\n", classification_report(y_test, y_pred_svm))

In [None]:
# Grid Search for Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values: 3 x 3 x 3 x 3 = 81 combinations, each evaluated with
# 3-fold cross-validated accuracy on the training split.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

best_cv_score = grid_search.best_score_
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(best_cv_score))

# Score the best estimator found by the search on the held-out split.
rf_best = grid_search.best_estimator_
y_pred = rf_best.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with optimized Random Forest:", accuracy)