### Import required libraries


In [1]:
import os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

In [2]:
# Load the raw flight records; the relative path is resolved against the kernel's working directory.
flights_csv = "flights.csv"
df1 = pd.read_csv(flights_csv)

### Data description

In [None]:
# Lift the column display limit so wide frames render every column, then show the raw frame.
pd.options.display.max_columns = None
df1

In [None]:
# Render floats with two decimals in every subsequent DataFrame display.
pd.options.display.float_format = '{:.2f}'.format

df1.describe()

In [None]:
# Rows whose CANCELLED flag equals 1, shown with all columns visible.
pd.options.display.max_columns = None
is_cancelled = df1['CANCELLED'].eq(1)
cancelled_flights = df1.loc[is_cancelled]
cancelled_flights


#### Creating new dataframe without cancelled flights

In [None]:
# Keep only the rows whose CANCELLED flag equals 0.
not_cancelled = df1['CANCELLED'].eq(0)
df2 = df1.loc[not_cancelled]
df2

### Dropping Cancellation Reason column

In [7]:
# Remove the CANCELLATION_REASON column from the non-cancelled subset.
df2 = df2.drop('CANCELLATION_REASON', axis='columns')

### Analysing the reason for other null values

In [None]:
# Inspect the remaining rows that have no ARRIVAL_TIME recorded.
pd.options.display.max_columns = None
arrival_time_missing = df2['ARRIVAL_TIME'].isna()
rows_with_null_arr_time = df2.loc[arrival_time_missing]
rows_with_null_arr_time


### Dropping rows with a null ARRIVAL_TIME

In [None]:
# Discard every row without an ARRIVAL_TIME; df2 is rebound to the filtered frame.
df2 = df2.dropna(subset=['ARRIVAL_TIME'])

### Diverted Flights


In [None]:
# Rows whose DIVERTED flag equals 1, shown with all columns visible.
pd.options.display.max_columns = None
is_diverted = df2['DIVERTED'].eq(1)
diverted_flights = df2.loc[is_diverted]
diverted_flights


### Removing diverted flights from the dataset

In [11]:
# Keep every row whose DIVERTED flag is anything other than 1.
not_diverted = df2['DIVERTED'].ne(1)
df2 = df2.loc[not_diverted]


In [12]:
# Delay-cause columns whose missing values are replaced with 0.
delay_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']

# Build the filled columns and bind the result back to df2.
# The previous form, df2[column].fillna(0, inplace=True), mutated a temporary Series obtained by
# chained indexing: it emits warnings on pandas 2.x and leaves df2 unchanged under Copy-on-Write.
# Indexing df2[column] still raises KeyError if an expected column is absent.
df2 = df2.assign(**{column: df2[column].fillna(0) for column in delay_columns})

### Dropping categorical columns (new dataframe)

In [13]:
# New frame without the categorical identifier columns.
categorical_columns = ['AIRLINE', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
df3 = df2.drop(columns=categorical_columns)

In [14]:
# Remove YEAR, DIVERTED and CANCELLED; df3 is rebound to the reduced frame.
df3 = df3.drop(columns=['YEAR', 'DIVERTED', 'CANCELLED'])

### Preparation for model training

In [15]:
# Remove the irrelevant features from the dataset
irrelevant_features = ['WHEELS_ON', 'WHEELS_OFF', 'MONTH', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'DISTANCE']
df_clean = df3.drop(columns=irrelevant_features)

# Transform DEPARTURE_DELAY into a binary target: 1 when the delay exceeds 15
# (presumably minutes — confirm against the dataset description), 0 otherwise.
# The comparison is vectorised instead of a row-wise lambda; a missing delay compares
# False and therefore still maps to 0, exactly as before.
y_binary = (df_clean['DEPARTURE_DELAY'] > 15).astype('int64').rename('Delay')

# Split the updated dataset into features (X) and target variable (y).
# 'Delay' is dropped defensively: a later cell adds that column to df3, so re-running
# this cell would otherwise place the target itself among the features.
# NOTE(review): X keeps every other remaining column. If fields that are only known after
# departure (e.g. an arrival delay or actual departure time) are still present, they leak
# the target — confirm the feature list.
X = df_clean.drop(columns=['DEPARTURE_DELAY', 'Delay'], errors='ignore')
y = pd.DataFrame(y_binary)

In [16]:
# Attach the binary target to df3 as the 'Delay' column (aligned on the row index).
# assign() overwrites an existing 'Delay' column, so re-running the cell does not create a
# duplicate column as pd.concat would. It reads y_binary rather than y because y is later
# rebound to a plain NumPy array.
df3 = df3.assign(Delay=y_binary)


### Imbalance

In [None]:
# Horizontal bar chart of the class counts (sorted ascending), each bar labelled with its share.
class_counts = y.value_counts()
ax = class_counts.sort_values().plot(kind="barh", color=["r", "g"])
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.grid()

# Bar widths are the class counts; their sum is the denominator for the percentages.
total = sum(bar.get_width() for bar in ax.patches)
for bar in ax.patches:
    share_label = str(round((bar.get_width() / total) * 100, 2)) + '%'
    ax.text(bar.get_width() + .3, bar.get_y() + .20, share_label, fontsize=10, color='black')

plt.title("Delay", fontsize=20)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Class", fontsize=14)
plt.show()
print(class_counts)


### Point Biserial Correlation for Quantitative, Continuous Features

In [None]:
# NOTE: this cell is disabled (every statement is commented out). It computes the point-biserial
# correlation of each numeric df3 column against the "Delay" column.
# NOTE(review): if re-enabled, numeric_cols is taken from all numeric df3 columns, which would
# include "Delay" itself once that column has been added — exclude it before correlating.
# from scipy.stats import pointbiserialr

# # List of numeric column names
# numeric_cols = df3.select_dtypes(include=['number']).columns.tolist()
# target = df3["Delay"]
# # Calculate point-biserial correlation for each numeric column
# pbc = []
# for col in numeric_cols:
#     ans = pointbiserialr(df3[col], target)
#     pbc.append([col, ans[0], ans[1]])

# # Create DataFrame from the correlation results
# pbc_corr = pd.DataFrame(pbc, columns=["Feature", "CorrCoeff", "pValue"]).sort_values(by="CorrCoeff", ascending=False).reset_index(drop=True)
# pbc_corr


In [None]:
# NOTE: disabled cell. It needs `pbc_corr` from the (also disabled) correlation cell and rebinds
# `pbc_corr` with "Feature" as its index, so running it twice in a row would fail on set_index.
# plt.figure(figsize=(7, 5))
# pbc_corr = pbc_corr.set_index("Feature")
# heatmap = sns.heatmap(pbc_corr[["CorrCoeff"]].sort_values(by="CorrCoeff", ascending=False), vmin=-1, vmax=1, annot=True, cmap="BrBG")
# heatmap.set_title("PBC with Delay", fontdict={"fontsize":18}, pad=16);

#### Data Splitting:

In [20]:
# Flatten the single-column target into a 1-D array for scikit-learn.
# np.asarray accepts both the original DataFrame and an already-flattened array, so the cell
# can be re-run; the previous `y.values` raised AttributeError on the second run.
y = np.asarray(y).ravel()

In [21]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


### Imbalance in training data

In [None]:
# Class balance of the training labels: horizontal bars (sorted ascending) labelled with each class's share.
train_class_counts = pd.DataFrame(y_train).value_counts()
ax = train_class_counts.sort_values().plot(kind="barh", color=["r", "g"])
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.grid()

# Bar widths are the class counts; their sum is the denominator for the percentages.
total = sum(bar.get_width() for bar in ax.patches)
for bar in ax.patches:
    share_label = str(round((bar.get_width() / total) * 100, 2)) + '%'
    ax.text(bar.get_width() + .3, bar.get_y() + .20, share_label, fontsize=10, color='black')

plt.title("Delay", fontsize=20)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Class", fontsize=14)
plt.show()
print(train_class_counts)


### Random Undersampling

In [23]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the training split only (X_test / y_test are left untouched); seeded for reproducibility.
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X=X_train, y=y_train)

### Balanced training data

In [None]:
# Class balance of the resampled training labels: horizontal bars labelled with each class's share.
balanced_class_counts = pd.DataFrame(y_train).value_counts()
ax = balanced_class_counts.sort_values().plot(kind="barh", color=["r", "g"])
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.grid()

# Bar widths are the class counts; their sum is the denominator for the percentages.
total = sum(bar.get_width() for bar in ax.patches)
for bar in ax.patches:
    share_label = str(round((bar.get_width() / total) * 100, 2)) + '%'
    ax.text(bar.get_width() + .3, bar.get_y() + .20, share_label, fontsize=10, color='black')

plt.title("Delay", fontsize=20)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Class", fontsize=14)
plt.show()
print(balanced_class_counts)

In [None]:
# Number of rows per Delay class in the full (pre-split) frame.
delay_class_counts = df3['Delay'].value_counts()
delay_class_counts

In [None]:
# NOTE(review): hard-coded figures with no visible source — presumably class counts copied from
# an earlier cell's output; confirm, or derive them from the data so they cannot go stale.
795593 - 238798

In [None]:
# NOTE(review): hard-coded figures with no visible source — presumably class counts copied from
# an earlier cell's output; confirm, or derive them from the data so they cannot go stale.
209911 - 62854

In [None]:
# NOTE(review): hard-coded figures with no visible source — presumably class counts copied from
# an earlier cell's output; confirm, or derive them from the data so they cannot go stale.
556795 + 147057

### Support Vector Machine

In [25]:
# Linear-kernel SVM with probability estimates enabled (predict_proba is used for the ROC curve).
# SVMs are not scale-invariant, so the features are standardised first. Wrapping both steps in
# a pipeline fits the scaler on the training data only and applies the same scaling at
# predict / predict_proba time.
# The fitted SVC itself is available as svm_model[-1] (e.g. svm_model[-1].coef_).
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', random_state=42, probability=True),
)
svm_model.fit(X_train, y_train)

In [None]:
# Predict on the testing set
pred_2 = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, pred_2)
report = classification_report(y_test, pred_2)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Confusion matrix with class 1 listed first (rows = actual, columns = predicted).
# The earlier extra confusion_matrix(...) call discarded its result and has been removed.
cm = confusion_matrix(y_test, pred_2, labels=(1, 0))
cm

In [None]:
# Plotting the confusion matrix as an annotated heatmap (class 1 = 'Delayed' comes first, matching cm's label order).
class_labels = ['Delayed', 'Not Delayed']
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Persist the fitted model to disk (path is relative to the kernel's working directory).
model_2 = 'SVM_final.pkl'
joblib.dump(value=svm_model, filename=model_2)

### Feature selection based on SVM model

In [None]:
# NOTE: disabled cell, kept for reference.
# NOTE(review): it loads 'SVM.pkl', whereas the model in this notebook is saved as
# 'SVM_final.pkl' — confirm which artifact is intended. `loaded_model_2` is only bound when the
# file exists, so the coefficient lookup below would raise NameError otherwise.
# model_2 = 'SVM.pkl'
# if os.path.exists(model_2):
#     loaded_model_2 = joblib.load(model_2)

# # Step 2: Extract Coefficients
# coefficients = loaded_model_2.coef_

# # Step 3: Map Coefficients to Features
# feature_names = X.columns  # Assuming X is your feature matrix
# coefficients_map = dict(zip(feature_names, coefficients[0]))

# # Step 4: Analyze Coefficients
# # Print coefficients of each feature
# for feature, coefficient in coefficients_map.items():
#     print(f"Feature: {feature}, Coefficient: {coefficient}")

# # Step 5: Identify Important Features
# # Sort features based on absolute coefficient values
# important_features = sorted(coefficients_map, key=lambda x: abs(coefficients_map[x]), reverse=True)
# print("Important Features:", important_features)


In [None]:
# NOTE: disabled cell, kept for reference.
# NOTE(review): it loads "svm_model.pkl", a file name that is not written anywhere in the
# visible part of this notebook (the model is saved as 'SVM_final.pkl') — confirm the path.
# from sklearn.feature_selection import RFE

# # Initialize the SVM classifier
# # svm_model = SVC(kernel='linear')
# loaded_svm = joblib.load("svm_model.pkl")

# # Initialize RFE
# rfe = RFE(estimator=loaded_svm, n_features_to_select=16)  # Select the number of features you want to keep

# # Fit RFE
# rfe.fit(X_train, y_train)

# # Get the selected features
# selected_features = pd.DataFrame({'Feature': X_train.columns, 'Selected': rfe.support_, 'Ranking': rfe.ranking_})

# # Print the selected features
# print("Selected Features:")
# print(selected_features[selected_features['Selected'] == True])


### ROC Curve

In [None]:
# True labels of the held-out split.
y_true = y_test

# predict_proba returns one column per class; column 1 holds the positive-class probability.
class_probabilities = svm_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Area under the ROC curve and the curve's operating points.
roc_auc = roc_auc_score(y_true, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)

# ROC curve against the chance diagonal, drawn through the explicit Axes interface.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance line
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()
