In [15]:
# ------------------------------
# Data Manipulation & Analysis
# ------------------------------
import pandas as pd
import numpy as np

# ------------------------------
# Visualization
# ------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------
# Machine Learning
# ------------------------------
# Core
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score
)
from sklearn.base import clone

# Models
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Class Imbalance
from imblearn.over_sampling import SMOTE

# ------------------------------
# System Utilities
# ------------------------------
import datetime
import warnings
import gc
import psutil
from tqdm import tqdm

# ------------------------------
# Configuration
# ------------------------------
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

# STEP 1 - loading and merging

In [2]:
flights = pd.read_csv("archive/flights.csv")
airports = pd.read_csv('archive/airports.csv')
airlines = pd.read_csv('archive/airlines.csv')

  flights = pd.read_csv("archive/flights.csv")


In [3]:
flights[['AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']] = flights[['AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY']].fillna(0)

In [4]:
# -----------------------
# STEP 0: Join Airlines
# -----------------------
flights = flights.merge(
    airlines,
    left_on="AIRLINE", right_on="IATA_CODE",
    how="left"
).drop(columns=["IATA_CODE"])  # drop duplicate airline code

In [5]:
# -----------------------
# STEP 1: Join Airports (Origin)
# -----------------------
flights = flights.merge(
    airports.add_prefix("ORG_"),
    left_on="ORIGIN_AIRPORT", right_on="ORG_IATA_CODE",
    how="left"
).drop(columns=["ORG_IATA_CODE", "ORG_AIRPORT"])

In [6]:
# -----------------------
# STEP 2: Join Airports (Destination)
# -----------------------
flights = flights.merge(
    airports.add_prefix("DEST_"),
    left_on="DESTINATION_AIRPORT", right_on="DEST_IATA_CODE",
    how="left"
).drop(columns=["DEST_IATA_CODE", "DESTINATION_AIRPORT"])

In [7]:
flights = flights.dropna(subset = ['ORG_LATITUDE','ORG_LONGITUDE','DEST_LATITUDE','DEST_LONGITUDE'])

In [8]:
flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE_x', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON',
       'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
       'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY', 'AIRLINE_y', 'ORG_CITY', 'ORG_STATE', 'ORG_COUNTRY',
       'ORG_LATITUDE', 'ORG_LONGITUDE', 'DEST_AIRPORT', 'DEST_CITY',
       'DEST_STATE', 'DEST_COUNTRY', 'DEST_LATITUDE', 'DEST_LONGITUDE'],
      dtype='object')

# STEP 2 - preprocessing

In [9]:
import pandas as pd

# Compute mean arrival delay
df=flights.copy()
# mean_delay = df["ARRIVAL_DELAY"].mean()
mean_delay=15

# Create target column using mean as threshold
df["DELAYED"] = (df["ARRIVAL_DELAY"] >= mean_delay).astype(int)

# -----------------------------
# STEP 1: Create DATE column
# -----------------------------
df["DATE"] = pd.to_datetime(df[["YEAR", "MONTH", "DAY"]])

# -----------------------------
# STEP 2: Extract time-based features
# -----------------------------
# Departure and Arrival hours (e.g., 1345 → 13)
df["DEPARTURE_HOUR"] = (df["SCHEDULED_DEPARTURE"] // 100).astype(int)
df["ARRIVAL_HOUR"] = (df["SCHEDULED_ARRIVAL"] // 100).astype(int)

# -----------------------------
# STEP 3: Drop redundant columns
# -----------------------------
drop_cols = [
    # Date parts (replaced by DATE + derived hours)
    "YEAR", "MONTH", "DAY",
    "SCHEDULED_DEPARTURE", "SCHEDULED_ARRIVAL",
    
    # Flight identifiers (too high cardinality)
    "FLIGHT_NUMBER", "TAIL_NUMBER",
    
    # Duplicate airline column
    "AIRLINE_y",
    
    # Redundant airport info (covered by airport codes)
    "ORG_CITY", "ORG_STATE", "ORG_COUNTRY",
    "DEST_CITY", "DEST_STATE", "DEST_COUNTRY",
    
    # Drop coordinates (we keep DISTANCE instead)
    "ORG_LATITUDE", "ORG_LONGITUDE",
    "DEST_LATITUDE", "DEST_LONGITUDE",
    
    # 🚫 Post-flight / leakage features
    "DEPARTURE_TIME", "DEPARTURE_DELAY",
    "TAXI_OUT", "TAXI_IN", "WHEELS_OFF", "WHEELS_ON",
    "ELAPSED_TIME", "AIR_TIME",
    "ARRIVAL_TIME", "ARRIVAL_DELAY",  # target will be created separately
    "DIVERTED", "CANCELLED", "CANCELLATION_REASON",
    "AIR_SYSTEM_DELAY", "SECURITY_DELAY", "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"
]


df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Correct way to rename column
df = df.rename(columns={'AIRLINE_x': 'AIRLINE'})

df = df.dropna(subset=['SCHEDULED_TIME'])
#EXTRACT MORE FEATURES
import pandas as pd
import holidays

# Extract month
df["MONTH"] = df["DATE"].dt.month

# Extract weekend (1 = weekend, 0 = weekday)
df["IS_WEEKEND"] = df["DATE"].dt.dayofweek.isin([5, 6]).astype(int)

# Extract holidays (example: US holidays, you can change to 'India' etc.)
us_holidays = holidays.US(years=df["DATE"].dt.year.unique())
df["IS_HOLIDAY"] = df["DATE"].isin(us_holidays).astype(int)

# Drop DATE itself (after feature engineering)
df = df.drop(columns=["DATE"])

# -----------------------------
# STEP 4: Verify remaining features
# -----------------------------
print("Remaining columns:\n", df.columns.tolist())

Remaining columns:
 ['DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'SCHEDULED_TIME', 'DISTANCE', 'DEST_AIRPORT', 'DELAYED', 'DEPARTURE_HOUR', 'ARRIVAL_HOUR', 'MONTH', 'IS_WEEKEND', 'IS_HOLIDAY']


  df["IS_HOLIDAY"] = df["DATE"].isin(us_holidays).astype(int)


1. Encode Categorical Features
2. Scale Numerical Features
3. Handle Class Imbalance

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import numpy as np

# Identify categorical and numerical columns
categorical_features = ['AIRLINE', 'ORIGIN_AIRPORT', 'DEST_AIRPORT']
numerical_features = ['DAY_OF_WEEK', 'SCHEDULED_TIME', 'DISTANCE', 
                     'DEPARTURE_HOUR', 'ARRIVAL_HOUR', 'MONTH']

# 1. Encode Categorical Features
label_encoders = {}
for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# 2. Scale Numerical Features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 3. Handle Class Imbalance
# First, let's check class distribution
print("Class distribution before SMOTE:")
print(df['DELAYED'].value_counts(normalize=True))

# Separate features and target
X = df.drop('DELAYED', axis=1)
y = df['DELAYED']
# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['DELAYED'] = y_resampled

print("\nClass distribution after SMOTE:")
print(df_balanced['DELAYED'].value_counts(normalize=True))

# Show the final shape of the balanced dataset
print("\nFinal dataset shape:", df_balanced.shape)

Class distribution before SMOTE:
DELAYED
0    0.811799
1    0.188201
Name: proportion, dtype: float64


# STEP 3 - model implementation

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# ----------------------------
# 1. Split dataset
# ----------------------------
X = df_balanced.drop('DELAYED', axis=1)
y = df_balanced['DELAYED']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ----------------------------
# 2. Initialize models
# ----------------------------
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# ----------------------------
# 3. Train + Evaluate
# ----------------------------
for name, model in models.items():
    print(f"\n{'-'*20} {name} {'-'*20}")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Predict once (reuse later)
    y_pred = model.predict(X_test)

    # Accuracy + classification report
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Cross-validation score (parallelized, discard scores after printing)
    cv_result = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)
    print(f"\nCross-validation accuracy: {np.mean(cv_result):.4f} (+/- {np.std(cv_result) * 2:.4f})")
    del cv_result  # free memory
    
    # Feature importance (tree models only)
    if hasattr(model, 'feature_importances_'):
        importances = zip(X_train.columns, model.feature_importances_)
        top5 = sorted(importances, key=lambda x: x[1], reverse=True)[:5]
        print("\nTop 5 most important features:")
        for feature, score in top5:
            print(f"{feature}: {score:.4f}")
    
    # Confusion Matrix (plotted one by one to save memory)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
    plt.title(f'{name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    plt.close()
    
    # Free predictions + model after use
    del y_pred
    del model


-------------------- Random Forest --------------------


KeyboardInterrupt: 