# Initial Setup

In [1]:
# Load Virtual Environment

!& "c:\Users\tbran\Python\repos\Semester 3 Repos\capstone\.venv\Scripts\Activate.ps1"


& was unexpected at this time.


In [6]:
# Core
import os
import re
import glob
import time
import unicodedata

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Dates & holidays
import holidays

# Statsmodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Scikit-learn: preprocessing & pipelines
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
import category_encoders as ce  # if you need additional encoders

# Scikit-learn: models
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet,
    LogisticRegression
)
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Scikit-learn: model selection & metrics
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Distributions
from scipy.stats import loguniform

In [7]:
project_path = 'C:/Users/tbran/Python/repos/Semester 3 Repos/capstone/'
data_path = project_path + 'data/'
src_path = project_path + 'src/'
model_path = project_path + 'models/'

# Initialize results list
results = []

# Data Prep 

## Data Prep Functions

In [4]:
def optimize_dataframe(df, datetime_cols=None, fillna=False):
    """
    Cleans and optimizes a DataFrame:
    - Converts object datetime columns to datetime64
    - Converts object columns with repeated values to category
    - Downcasts numeric columns to smallest safe type
    - Optionally fills NaNs before downcasting
    
    Parameters:
        df (pd.DataFrame): The DataFrame to optimize
        datetime_cols (list): List of column names to convert to datetime
        fillna (bool): If True, fills NaNs before downcasting
    """
    
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage before optimization: {start_mem:.2f} MB")
    
    df = df.copy()
    
    # 1. Convert datetime columns
    if datetime_cols:
        for col in datetime_cols:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # 2. Convert object columns to category if appropriate
    obj_cols = df.select_dtypes(include=['object']).columns
    for col in obj_cols:
        num_unique = df[col].nunique()
        num_total = len(df[col])
        if num_unique / num_total < 0.5:  # heuristic: less than 50% unique
            df[col] = df[col].astype('category')
    
    # 3. Downcast numeric columns
    int_cols = df.select_dtypes(include=['int64', 'int32']).columns
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns
    
    for col in int_cols:
        if fillna and df[col].isnull().any():
            df[col] = df[col].fillna(0)
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in float_cols:
        if fillna and df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mean())
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    print(f"Reduced by {100 * (start_mem - end_mem) / start_mem:.1f}%")
    
    return df


def clean_column_names(df, remove_accents=True):
    """
    Cleans DataFrame column names:
    - Strips whitespace
    - Converts to lowercase
    - Replaces spaces & special chars with underscores
    - Removes duplicate underscores
    - Optionally removes accents
    
    Parameters:
        df (pd.DataFrame): DataFrame whose columns to clean
        remove_accents (bool): If True, strips accents from characters
    
    Returns:
        pd.DataFrame: DataFrame with cleaned column names
    """
    def _clean(col):
        col = col.strip().lower()
        if remove_accents:
            col = ''.join(
                c for c in unicodedata.normalize('NFKD', col)
                if not unicodedata.combining(c)
            )
        col = re.sub(r'[^0-9a-zA-Z]+', '_', col)  # replace non-alphanumeric with _
        col = re.sub(r'_+', '_', col)             # collapse multiple underscores
        col = col.strip('_')                      # remove leading/trailing underscores
        return col
    
    df = df.copy()
    df.columns = [_clean(c) for c in df.columns]
    return df


def build_dual_preprocessors(df, target, feature_cols,
                             high_card_threshold=20, scale_numeric=False):
    """
    Build regression + tree preprocessors using an explicit feature list.
    """
    X = df[feature_cols].copy()
    y = df[target]

    # Identify column types
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Split categorical into low/high cardinality
    low_card_cols = [c for c in cat_cols if X[c].nunique() <= high_card_threshold]
    high_card_cols = [c for c in cat_cols if X[c].nunique() > high_card_threshold]

    regression_preprocessor = ColumnTransformer(
        transformers=[
            ('low_card', OneHotEncoder(handle_unknown='ignore', sparse_output=False), low_card_cols),
            ('high_card', ce.TargetEncoder(), high_card_cols),
            ('num', StandardScaler() if scale_numeric else 'passthrough', num_cols)
        ]
    )

    tree_preprocessor = ColumnTransformer(
        transformers=[
            ('cat', 'passthrough', cat_cols),
            ('num', 'passthrough', num_cols)
        ]
    )

    return regression_preprocessor, tree_preprocessor, X, y


def build_preprocessing_pipeline(df, target, 
                                  high_card_threshold=20, 
                                  scale_numeric=False):
    """
    Builds a preprocessing pipeline for linear regression:
    - One-hot encodes low-cardinality categorical columns
    - Target encodes high-cardinality categorical columns
    - Optionally scales numeric columns
    
    Parameters:
        df (pd.DataFrame): Input DataFrame (including target column)
        target (str): Name of target column
        high_card_threshold (int): Unique value cutoff for high-cardinality
        scale_numeric (bool): Whether to scale numeric features
        
    Returns:
        pipeline (ColumnTransformer): Preprocessing transformer
        low_card_cols (list): Low-cardinality categorical columns
        high_card_cols (list): High-cardinality categorical columns
        num_cols (list): Numeric columns
    """
    
    # Separate features and target
    X = df.drop(columns=[target])
    
    # Identify column types
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    
    # Split categorical into low/high cardinality
    low_card_cols = [col for col in cat_cols if X[col].nunique() <= high_card_threshold]
    high_card_cols = [col for col in cat_cols if X[col].nunique() > high_card_threshold]
    
    # Transformers
    low_card_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    high_card_transformer = TargetEncoder()
    num_transformer = StandardScaler() if scale_numeric else 'passthrough'
    
    # Column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('low_card', low_card_transformer, low_card_cols),
            ('high_card', high_card_transformer, high_card_cols),
            ('num', num_transformer, num_cols)
        ]
    )
    
    return preprocessor, low_card_cols, high_card_cols, num_cols

def add_interaction_terms(df, features):
    """
    Adds pairwise interaction terms between given features.
    """
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    interaction_array = poly.fit_transform(df[features])
    interaction_df = pd.DataFrame(interaction_array, columns=poly.get_feature_names_out(features))
    return pd.concat([df.reset_index(drop=True), interaction_df], axis=1)

def preprocess_features(df, categorical_cols, numeric_cols):
    """
    Returns a ColumnTransformer that one-hot encodes categorical columns
    and passes numeric columns through unchanged.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
            ('num', 'passthrough', numeric_cols)
        ]
    )
    return preprocessor

def transform_with_names(preprocessor, X, y=None):
    """Fit/transform and return a DataFrame with feature names preserved."""
    Xt = preprocessor.fit_transform(X, y)
    cols = preprocessor.get_feature_names_out()
    return pd.DataFrame(Xt, columns=cols, index=X.index)


def create_features_mend(df):
    df = df.copy()

    # Build U.S. holiday calendar for relevant years
    us_holidays = holidays.US(years=range(2015, 2027))

    # Build a list of (holiday_date, holiday_name)
    holiday_items = list(us_holidays.items())

    # Create a set of all holiday dates ±3 days
    holiday_buffer = {}
    for h_date, h_name in holiday_items:
        for offset in range(-3, 4):  # -3, -2, -1, 0, +1, +2, +3
            holiday_buffer[h_date + pd.Timedelta(days=offset)] = h_name

    # Binary flag: within ±3 days of a holiday
    if "scheduleddepartdatetime" in df.columns:
        print("Adding holiday features...")
        df['is_holiday_period'] = df['scheduleddepartdatetime'].dt.date.isin(holiday_buffer.keys())

        # Categorical holiday name (or "None")
        def get_holiday_name(d):
            return holiday_buffer.get(d, "None")

        df['holiday_name'] = df['scheduleddepartdatetime'].dt.date.apply(get_holiday_name)
    else:
        print("Column 'scheduleddepartdatetime' not found, skipping holiday features.")

    # --- Existing engineered features ---
    if "scheduleddepartdatetime" in df.columns:
        print("Adding time-based features...")
        df["dayofweek"] = df["scheduleddepartdatetime"].dt.dayofweek
        df["month"] = df["scheduleddepartdatetime"].dt.month
    else:
        print("Skipping time-based features.")

    if {"origin","dest"}.issubset(df.columns):
        print("Adding route feature...")
        df["route"] = df["origin"].astype(str) + "_" + df["dest"].astype(str)
    else:
        print("Skipping route feature.")

    if {"marketshareorigin","marketsharedest"}.issubset(df.columns):
        print("Adding marketshare_diff...")
        df["marketshare_diff"] = df["marketshareorigin"] - df["marketsharedest"]
    else:
        print("Skipping marketshare_diff.")

    if {"hhiorigin","hhidest"}.issubset(df.columns):
        print("Adding hhi_diff...")
        df["hhi_diff"] = df["hhiorigin"] - df["hhidest"]
    else:
        print("Skipping hhi_diff.")

    if {"temperature","windspeed"}.issubset(df.columns):
        print("Adding temp_wind_interaction...")
        df["temp_wind_interaction"] = df["temperature"] * df["windspeed"]
    else:
        print("Skipping temp_wind_interaction.")

    if {"temperature","windgustspeed"}.issubset(df.columns):
        print("Adding temp_windgust_interaction...")
        df["temp_windgust_interaction"] = df["temperature"] * df["windgustspeed"]
    else:
        print("Skipping temp_windgust_interaction.")

    if {"windspeed","windgustspeed"}.issubset(df.columns):
        print("Adding wind_gust_diff...")
        df["wind_gust_diff"] = df["windspeed"] - df["windgustspeed"]
    else:
        print("Skipping wind_gust_diff.")

    if {"raindummy","windspeed"}.issubset(df.columns):
        print("Adding rain_wind_interaction...")
        df["rain_wind_interaction"] = df["raindummy"] * df["windspeed"]
    else:
        print("Skipping rain_wind_interaction.")

    if {"snowdummy","windspeed"}.issubset(df.columns):
        print("Adding snow_wind_interaction...")
        df["snow_wind_interaction"] = df["snowdummy"] * df["windspeed"]
    else:
        print("Skipping snow_wind_interaction.")

    if {"raindummy","windgustspeed"}.issubset(df.columns):
        print("Adding rain_wind_gust_interaction...")
        df["rain_wind_gust_interaction"] = df["raindummy"] * df["windgustspeed"]
    else:
        print("Skipping rain_wind_gust_interaction.")

    if {"snowdummy","windgustspeed"}.issubset(df.columns):
        print("Adding snow_wind_gust_interaction...")
        df["snow_wind_gust_interaction"] = df["snowdummy"] * df["windgustspeed"]
    else:
        print("Skipping snow_wind_gust_interaction.")

    if {"originmetropop","destmetropop"}.issubset(df.columns):
        print("Adding metropop_diff...")
        df["metropop_diff"] = df["originmetropop"] - df["destmetropop"]
    else:
        print("Skipping metropop_diff.")

    if {"originmetrogdppercapita","destmetrogdppercapita"}.issubset(df.columns):
        print("Adding metrogdp_diff...")
        df["metrogdp_diff"] = df["originmetrogdppercapita"] - df["destmetrogdppercapita"]
    else:
        print("Skipping metrogdp_diff.")

    return df

def engineer_flight_features_light(df, datetime_col="scheduleddepartdatetime",
                                   origin_col="origin", dest_col="dest",
                                   carrier_col="uniquecarrier", delay_col="depdelay",
                                   distance_col="distance"):
    df = df.copy()

    # --- Datetime parts ---
    dt = pd.to_datetime(df[datetime_col])
    df["year"] = dt.dt.year
    df["month"] = dt.dt.month
    df["day"] = dt.dt.day
    df["hour"] = dt.dt.hour
    df["date"] = dt.dt.floor("D")
    df["is_weekend"] = dt.dt.dayofweek >= 5

    # --- Route & distance ---
    df["route"] = df[origin_col].astype(str) + "_" + df[dest_col].astype(str)
    if distance_col in df.columns:
        df["distance_bin"] = pd.cut(df[distance_col],
                                    bins=[0,500,1500,3000,10000],
                                    labels=["short","medium","long","ultra"])

    # --- Congestion (lighter via transform) ---
    df["hourly_origin_flights"] = (
        df.groupby([origin_col,"date","hour"])[delay_col].transform("count")
    )
    df["daily_route_flights"] = (
        df.groupby([origin_col,dest_col,"date"])[delay_col].transform("count")
    )

    # --- Weather (light) ---
    if "temperature" in df.columns:
        df["is_extreme_temp"] = (df["temperature"] < 0) | (df["temperature"] > 35)
    if {"raindummy","snowdummy","windgustdummy"}.issubset(df.columns):
        df["stormy"] = (df["raindummy"]|df["snowdummy"]|df["windgustdummy"]).astype(int)

    # --- Market/demand ---
    if {"capacity","numflights"}.issubset(df.columns):
        df["capacity_utilization"] = df["numflights"] / df["capacity"].replace(0, np.nan)

    return df


def engineer_flight_features_heavy(
    df,
    datetime_col="scheduleddepartdatetime",
    origin_col="origin",
    dest_col="dest",
    carrier_col="uniquecarrier",
    delay_col="depdelay",
    distance_col="distance",
    window=7
):
    """
    Engineer advanced features for flight delay prediction.
    Optimized for speed, with print statements and timing checkpoints.
    """

    start_time = time.time()
    df = df.copy()
    print("Starting feature engineering...")

    # --- Precompute datetime parts once ---
    t0 = time.time()
    if datetime_col in df.columns:
        dt = pd.to_datetime(df[datetime_col])
        df["year"] = dt.dt.year
        df["month"] = dt.dt.month
        df["day"] = dt.dt.day
        df["hour"] = dt.dt.hour
        df["date"] = dt.dt.floor("D")
        df["quarter"] = dt.dt.quarter
        df["is_weekend"] = dt.dt.dayofweek >= 5
        df["part_of_day"] = pd.cut(
            df["hour"],
            bins=[0,5,11,16,21,24],
            labels=["late_night","morning","midday","evening","night"],
            right=False
        )
        df["days_since_year_start"] = (
            dt - pd.to_datetime(df["year"].astype(str) + "-01-01")
        ).dt.days
    print(f"Datetime features done in {time.time()-t0:.2f}s")

    # --- Route & distance features ---
    t0 = time.time()
    if {origin_col, dest_col}.issubset(df.columns):
        if f"largehubairport{origin_col}" in df.columns and f"largehubairport{dest_col}" in df.columns:
            df["hub_to_hub"] = (
                (df[f"largehubairport{origin_col}"] == 1) &
                (df[f"largehubairport{dest_col}"] == 1)
            ).astype(int)
        df["route"] = df[origin_col].astype(str) + "_" + df[dest_col].astype(str)

    if distance_col in df.columns:
        df["distance_bin"] = pd.cut(
            df[distance_col],
            bins=[0,500,1500,3000,10000],
            labels=["short","medium","long","ultra"]
        )
    print(f"Route & distance features done in {time.time()-t0:.2f}s")

    # --- Congestion features ---
    t0 = time.time()
    if {"date","hour",origin_col}.issubset(df.columns):
        hourly_counts = (
            df.groupby([origin_col,"date","hour"])
              .size()
              .rename("hourly_origin_flights")
              .reset_index()
        )
        df = df.merge(hourly_counts, on=[origin_col,"date","hour"], how="left")

    if {"date",origin_col,dest_col}.issubset(df.columns):
        daily_counts = (
            df.groupby([origin_col,dest_col,"date"])
              .size()
              .rename("daily_route_flights")
              .reset_index()
        )
        df = df.merge(daily_counts, on=[origin_col,dest_col,"date"], how="left")
    print(f"Congestion features done in {time.time()-t0:.2f}s")

    # --- Weather features ---
    t0 = time.time()
    if "temperature" in df.columns:
        df["is_extreme_temp"] = (df["temperature"] < 0) | (df["temperature"] > 35)
        if "month" in df.columns:
            monthly_means = df.groupby("month")["temperature"].transform("mean")
            df["temp_anomaly"] = df["temperature"] - monthly_means

    if {"raindummy","snowdummy","windgustdummy"}.issubset(df.columns):
        df["stormy"] = (
            (df["raindummy"]==1) | (df["snowdummy"]==1) | (df["windgustdummy"]==1)
        ).astype(int)
    print(f"Weather features done in {time.time()-t0:.2f}s")

    # --- Rolling averages ---
    t0 = time.time()
    if {origin_col, delay_col}.issubset(df.columns):
        df = df.sort_values([origin_col, datetime_col])
        df["rolling_origin_delay"] = (
            df.groupby(origin_col)[delay_col]
              .rolling(window, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )

    if {dest_col, delay_col}.issubset(df.columns):
        df = df.sort_values([dest_col, datetime_col])
        df["rolling_dest_delay"] = (
            df.groupby(dest_col)[delay_col]
              .rolling(window, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )

    if {carrier_col, delay_col}.issubset(df.columns):
        df = df.sort_values([carrier_col, datetime_col])
        df["rolling_carrier_delay"] = (
            df.groupby(carrier_col)[delay_col]
              .rolling(window, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )

    if {"route", delay_col}.issubset(df.columns):
        df = df.sort_values(["route", datetime_col])
        df["rolling_route_delay"] = (
            df.groupby("route")[delay_col]
              .rolling(window, min_periods=1)
              .mean()
              .reset_index(level=0, drop=True)
        )
    print(f"Rolling averages done in {time.time()-t0:.2f}s")

    # --- Market/demand features ---
    t0 = time.time()
    if {"capacity","numflights"}.issubset(df.columns):
        df["capacity_utilization"] = (
            df["numflights"] / df["capacity"].replace(0, np.nan)
        )

    if {origin_col, dest_col, carrier_col}.issubset(df.columns):
        df["route_carrier_count"] = (
            df.groupby([origin_col,dest_col])[carrier_col].transform("nunique")
        )
    print(f"Market/demand features done in {time.time()-t0:.2f}s")

    # --- Interaction features ---
    t0 = time.time()
    if {"is_holiday_period","monopolyroute"}.issubset(df.columns):
        df["holiday_monopoly"] = (
            df["is_holiday_period"].astype(int) * df["monopolyroute"].astype(int)
        )

    if {"is_extreme_temp","hourly_origin_flights"}.issubset(df.columns):
        df["extreme_temp_congestion"] = (
            df["is_extreme_temp"].astype(int) * df["hourly_origin_flights"]
        )
    print(f"Interaction features done in {time.time()-t0:.2f}s")

    print(f"Total feature engineering time: {time.time()-start_time:.2f}s")
    return df



## Data Prep Steps

In [8]:
file_name = 'MendeleyDelayData.csv'
df_mend = pd.read_csv(data_path + file_name)

df_mend = optimize_dataframe(
    df_mend,
    datetime_cols=['scheduleddepartdatetime'],
    fillna=True
)
df_mend = clean_column_names(df_mend)

df_mend_id_cols = ['originairportid', 'destairportid', ]
df_mend_cat_cols = ['origin', 'dest', 'uniquecarrier', 'tailnum', 'origincityname', 'originstate', ]
df_mend_date_cols = ['scheduleddepartdatetime', ]
df_mend_target_cols = ['depdelay','arrdelay',]
df_mend_feature_cols = [col for col in df_mend.columns if col not in df_mend_id_cols + df_mend_cat_cols + df_mend_date_cols + df_mend_target_cols]

# Remove outliers from dataframe
df_mend_clean = df_mend[df_mend['depdelay'] >= -30]

# Create engineered features
df_mend_clean = create_features_mend(df_mend_clean)


Memory usage before optimization: 1008.24 MB
Memory usage after optimization: 150.66 MB
Reduced by 85.1%
Adding holiday features...
Adding time-based features...
Adding route feature...
Adding marketshare_diff...
Adding hhi_diff...
Adding temp_wind_interaction...
Adding temp_windgust_interaction...
Adding wind_gust_diff...
Adding rain_wind_interaction...
Adding snow_wind_interaction...
Adding rain_wind_gust_interaction...
Adding snow_wind_gust_interaction...
Adding metropop_diff...
Adding metrogdp_diff...


In [9]:
df_mend_clean = engineer_flight_features_light(df_mend_clean)


  df.groupby([origin_col,"date","hour"])[delay_col].transform("count")
  df.groupby([origin_col,dest_col,"date"])[delay_col].transform("count")


In [10]:

# drop leakage columns, ID columns, and date columns
df_mend_clean = df_mend_clean.drop(columns=['arrdelay'] + df_mend_id_cols + df_mend_date_cols).copy()

reg_prep_mend, tree_prep_mend, X_mend, y_mend_numeric = build_dual_preprocessors(df_mend_clean, target='depdelay', feature_cols=df_mend_feature_cols, high_card_threshold=30, scale_numeric=True)

# Create binary target for classification (15 min delay threshold)
y_mend_binary_15 = (y_mend_numeric >= 15).astype(int)

X_reg_mend = transform_with_names(reg_prep_mend, X_mend, y_mend_numeric)
X_tree_mend = transform_with_names(tree_prep_mend, X_mend, y_mend_numeric)

# Week 1 Notebook – Linear Regression 1
Each week, you will apply the concepts of that week to your Integrated Capstone Project’s dataset. In preparation for Milestone One, create a Jupyter Notebook (similar to in Module B, semester two) that illustrates these lessons. There are no specific questions to answer in your Jupyter Notebook files in this course; your general goal is to analyze your data, using the methods you have learned about in this course and in this program, and draw interesting conclusions. 

For Week 1, include concepts such as linear regression with polynomial terms, interaction terms, multicollinearity, variance inflation factor and regression, and categorical and continuous features. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday. 

## Week 1 Helper Functions

In [11]:

def regression_summary(X, y):
    """
    Fits an OLS regression model using statsmodels and prints the summary.
    """
    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()
    return model.summary()

def fit_polynomial_regression(X, y, degree=2):
    """
    Fits a polynomial regression model and returns the fitted model and transformed features.
    """
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    model = LinearRegression()
    model.fit(X_poly, y)
    return model, poly


def calculate_vif(df, features=None, vif_thresh=10.0):
    """
    Calculate Variance Inflation Factor (VIF) safely:
    - Removes constant columns
    - Removes perfectly collinear columns
    - Returns sorted VIF table
    
    Parameters:
        df (pd.DataFrame): DataFrame with numeric features
        features (list): Optional list of features to check; defaults to all numeric
        vif_thresh (float): Threshold for flagging high VIF
    
    Returns:
        pd.DataFrame: VIF table
    """
    # Select numeric columns if features not provided
    if features is None:
        features = df.select_dtypes(include=[np.number]).columns.tolist()
    
    X = df[features].copy()
    
    # 1. Drop constant columns
    constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
    if constant_cols:
        print(f"Dropping constant columns: {constant_cols}")
        X.drop(columns=constant_cols, inplace=True)
    
    # 2. Drop perfectly collinear columns
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    perfect_corr_cols = [col for col in upper.columns if any(upper[col] == 1.0)]
    if perfect_corr_cols:
        print(f"Dropping perfectly collinear columns: {perfect_corr_cols}")
        X.drop(columns=perfect_corr_cols, inplace=True)
    
    # 3. Calculate VIF
    X_const = X.assign(const=1)
    vif_data = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(X_const.values, i) for i in range(len(X.columns))]
    })
    
    # 4. Sort by VIF
    vif_data.sort_values(by="VIF", ascending=False, inplace=True)
    
    # 5. Flag high VIF
    vif_data["High_VIF"] = vif_data["VIF"] > vif_thresh
    
    return vif_data

## Mendeley Delay Dataset

### Linear Regression


In [12]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Build pipeline
linreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LinearRegression())
])

# Fit
linreg_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = linreg_pipe.predict(X_train)
y_pred_test = linreg_pipe.predict(X_test)

# Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

# Store results
results.append({
    'model': 'Linear Regression',
    'train_r2': train_r2,
    'test_r2': test_r2, 
    'test_rmse': test_rmse
})

Train R²: 0.041
Test R²: 0.041
Test RMSE: 34.423


### Polynomial Regression

In [13]:

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Polynomial regression pipeline
poly_reg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),                # your ColumnTransformer
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),  # expand features
    ("model", LinearRegression())
])

# Fit
poly_reg_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = poly_reg_pipe.predict(X_train)
y_pred_test = poly_reg_pipe.predict(X_test)

# Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

# Store results
results.append({
    'model': 'Polynomial Regression',
    'train_r2': train_r2,
    'test_r2': test_r2,
    'test_rmse': test_rmse
})

Train R²: 0.061
Test R²: 0.060
Test RMSE: 34.098


### VIF: Variable Inflation Factor

In [14]:
vif_table = calculate_vif(df_mend_clean, features=X_mend.columns.tolist(), vif_thresh=10.0)
print(vif_table)

  vif = 1. / (1. - r_squared_i)


                    feature           VIF  High_VIF
4       nonhubairportorigin           inf      True
5     smallhubairportorigin           inf      True
14   mediumhubairlineorigin           inf      True
15    largehubairlineorigin           inf      True
7     largehubairportorigin           inf      True
6    mediumhubairportorigin           inf      True
8         nonhubairportdest           inf      True
9       smallhubairportdest           inf      True
11      largehubairportdest           inf      True
10     mediumhubairportdest           inf      True
12      nonhubairlineorigin           inf      True
13    smallhubairlineorigin           inf      True
18     mediumhubairlinedest           inf      True
16        nonhubairlinedest  9.007199e+15      True
17      smallhubairlinedest  9.007199e+15      True
19      largehubairlinedest  3.002400e+15      True
35               temp_20_30  3.112421e+02      True
34               temp_10_20  2.888751e+02      True
33          

# Week 2 Notebook - Linear Regression 2

For Week 2, include concepts such as linear regression with lasso, ridge, and elastic net regression. This homework will be submitted for peer review and feedback in Week 3 in the assignment titled 3.4 Peer Review: Week 2 Jupyter Notebook. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday.

## Mendeley Delay Data

### Lasso Regression

In [20]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Build pipeline with preprocessing + Lasso
lasso_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", Lasso(alpha=0.1, max_iter=10000, random_state=42))
])

# Fit
lasso_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = lasso_pipe.predict(X_train)
y_pred_test = lasso_pipe.predict(X_test)

# Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

#store results
results.append({
    'model': 'Lasso Regression',
    'train_r2': train_r2,
    'test_r2': test_r2,
    'test_rmse': test_rmse
})  

Train R²: 0.041
Test R²: 0.041
Test RMSE: 34.432


### Lasso Grid Search

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Pipeline: preprocessing + Lasso
lasso_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", Lasso(max_iter=10000, random_state=42))
])

# Grid of hyperparameters to search
param_grid = {
    "model__alpha": [0.001, 0.01, 0.1, 1, 10]
}

# Grid search with 5-fold CV
grid = GridSearchCV(
    lasso_pipe,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

# Fit grid search
grid.fit(X_train, y_train)

# Best parameters
print("Best alpha:", grid.best_params_["model__alpha"])

# Evaluate on test set
y_pred_test = grid.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

# Store results
results.append({
    'model': 'Lasso Regression (Tuned)',
    'best_alpha': grid.best_params_["model__alpha"],
    'train_r2': grid.best_score_,
    'test_r2': test_r2,
    'test_rmse': test_rmse
})

### Ridge Regression

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Build pipeline with preprocessing + Ridge
ridge_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", Ridge(alpha=1.0, max_iter=10000, random_state=42))
])

# Fit
ridge_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = ridge_pipe.predict(X_train)
y_pred_test = ridge_pipe.predict(X_test)

# Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

#store results
results.append({
    'model': 'Ridge Regression',
    'train_r2': train_r2,
    'test_r2': test_r2, 
    'test_rmse': test_rmse
})

R^2 score (default alpha): 0.04376642307060696


### Ridge Grid Search

In [None]:
# Pipeline: preprocessing + Ridge
ridge_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", Ridge(max_iter=10000, random_state=42))
])

# Grid of hyperparameters
ridge_param_grid = {
    "model__alpha": [0.01, 0.1, 1, 10, 100]
}

ridge_grid = GridSearchCV(
    ridge_pipe,
    ridge_param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

ridge_grid.fit(X_train, y_train)

print("Best Ridge alpha:", ridge_grid.best_params_["model__alpha"])

y_pred_test = ridge_grid.predict(X_test)
print("Ridge Test R²:", r2_score(y_test, y_pred_test))
print("Ridge Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

#store results
results.append({
    'model': 'Ridge Regression (Tuned)',
    'best_alpha': ridge_grid.best_params_["model__alpha"],
    'train_r2': ridge_grid.best_score_,
    'test_r2': r2_score(y_test, y_pred_test),
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test))
})

Best alpha: 1000
Best CV R^2: 0.043972332094467756
Test R^2: 0.04377132510993642


### Elastic Net

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# Build pipeline with preprocessing + Elastic Net
elasticnet_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000, random_state=42))
])
# Fit
elasticnet_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = elasticnet_pipe.predict(X_train)
y_pred_test = elasticnet_pipe.predict(X_test)

# Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Train R²: {train_r2:.3f}")
print(f"Test R²: {test_r2:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")

#store results
results.append({
    'model': 'Elastic Net Regression',
    'train_r2': train_r2,
    'test_r2': test_r2, 
    'test_rmse': test_rmse
})

R^2 score (default alpha, l1_ratio): 0.033545260335402505


### Elastic Net Grid Search

In [None]:
# Pipeline: preprocessing + ElasticNet
elastic_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", ElasticNet(max_iter=10000, random_state=42))
])

# Grid of hyperparameters
elastic_param_grid = {
    "model__alpha": [0.001, 0.01, 0.1, 1, 10],
    "model__l1_ratio": [0.2, 0.5, 0.8]  # balance between L1 and L2
}

elastic_grid = GridSearchCV(
    elastic_pipe,
    elastic_param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

elastic_grid.fit(X_train, y_train)

print("Best ElasticNet params:", elastic_grid.best_params_)

y_pred_test = elastic_grid.predict(X_test)
print("ElasticNet Test R²:", r2_score(y_test, y_pred_test))
print("ElasticNet Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

#store results
results.append({
    'model': 'Elastic Net Regression (Tuned)',
    'best_params': elastic_grid.best_params_,
    'train_r2': elastic_grid.best_score_,
    'test_r2': r2_score(y_test, y_pred_test),
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test))
})

KeyboardInterrupt: 

# Week 3 Notebook - Linear Regression 3

For Week 3, include concepts such as linear regression with forward and backward selection, PCR, and PLSR. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday. 

## Mendeley Delay Data

### Forward Selection

#### Using SFS

In [None]:
# Base pipeline: preprocessing + linear regression
linreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LinearRegression())
])

# Forward selection wrapper
sfs = SFS(
    linreg_pipe,
    k_features="best",                # keep adding until performance stops improving
    forward=True,
    floating=False,
    scoring="neg_mean_squared_error", # optimize RMSE
    cv=5,
    n_jobs=-1
)

# Fit on raw features
sfs = sfs.fit(X_mend, y_mend_numeric)

# Extract CV scores (negative MSE → convert to RMSE)
cv_scores = list(sfs.get_metric_dict().values())
num_features = [d["feature_idx"] for d in cv_scores]
rmse_scores = [np.sqrt(-d["avg_score"]) for d in cv_scores]

# Plot improvement curve
plt.figure(figsize=(8,5))
plt.plot(range(1, len(rmse_scores)+1), rmse_scores, marker="o")
plt.xlabel("Number of Features Selected")
plt.ylabel("CV RMSE")
plt.title("Forward Selection Improvement Curve")
plt.grid(True)
plt.show()

# Best subset
print("Best number of features:", sfs.k_features_)
print("Selected features:", sfs.k_feature_names_)
print("Best CV RMSE:", min(rmse_scores))

#store results
results.append({
    'model': 'Forward Selection Linear Regression',
    'num_features': sfs.k_features_,
    'selected_features': sfs.k_feature_names_,
    'best_cv_rmse': min(rmse_scores)
})

Selected features: ['low_card__uniquecarrier_9E', 'low_card__uniquecarrier_B6', 'low_card__uniquecarrier_CO', 'low_card__uniquecarrier_DL', 'low_card__uniquecarrier_FL', 'low_card__uniquecarrier_MQ', 'low_card__uniquecarrier_UA', 'low_card__uniquecarrier_XE', 'high_card__dest', 'high_card__tailnum', 'high_card__origincityname', 'high_card__originstate', 'num__smallhubairportorigin', 'num__nonhubairportdest', 'num__largehubairportdest', 'num__year', 'num__month', 'num__dayofmonth', 'num__dayofweek', 'num__scheduledhour', 'num__loadfactor', 'num__numflights', 'num__temperature', 'num__temp_ninfty_n10', 'num__temp_n10_0', 'num__windspeedsquare', 'num__windgustdummy', 'num__raindummy', 'num__raintracedummy', 'num__snowdummy', 'num__snowtracedummy', 'num__destmetrogdppercapita']
Model Accuracy (R²): -0.0439
Model RMSE: 1372.6907


#### Using Stepwise Function

This was abandoned due to performance issues. Relying on SFS instead.

In [None]:
# def forward_selection(X, y, threshold_in=0.01, verbose=True):
#     """Forward selection based on p-values from statsmodels OLS"""
#     included = []
#     while True:
#         changed = False
#         excluded = list(set(X.columns) - set(included))
#         new_pval = pd.Series(index=excluded, dtype=float)
#         for new_column in excluded:
#             model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
#             new_pval[new_column] = model.pvalues[new_column]
#         best_pval = new_pval.min() if not new_pval.empty else None
#         if best_pval is not None and best_pval < threshold_in:
#             best_feature = new_pval.idxmin()
#             included.append(best_feature)
#             changed = True
#             if verbose:
#                 print(f'Add {best_feature:30} with p-value {best_pval:.6}')
#         if not changed:
#             break
#     return included
# # Transform train/test into numeric DataFrames
# preprocessor.fit(X_train, y_train)

# X_train_trans = pd.DataFrame(
#     preprocessor.transform(X_train),
#     columns=preprocessor.get_feature_names_out(),
#     index=X_train.index
# )
# X_test_trans = pd.DataFrame(
#     preprocessor.transform(X_test),
#     columns=preprocessor.get_feature_names_out(),
#     index=X_test.index
# )

# # Run forward selection
# forward_features = forward_selection(X_train_trans, y_train)
# print("Forward-selected features:", forward_features)

# # Fit final model
# final_model = sm.OLS(y_train, sm.add_constant(X_train_trans[forward_features])).fit()

# # Predict on test set
# y_pred = final_model.predict(sm.add_constant(X_test_trans[forward_features]))

# # Evaluate accuracy
# r2 = r2_score(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred)

# print(f"Forward Selection Model Accuracy (R²): {r2:.4f}")
# print(f"Forward Selection Model RMSE: {rmse:.4f}")

Add high_card__tailnum             with p-value 0.0
Add num__scheduledhour             with p-value 1.07019e-156
Add num__raindummy                 with p-value 7.40738e-87
Add high_card__dest                with p-value 6.44347e-59
Add high_card__origincityname      with p-value 6.17954e-47
Add num__snowdummy                 with p-value 6.48087e-32
Add num__snowtracedummy            with p-value 3.82627e-16
Add num__numflights                with p-value 7.88944e-15
Add num__windgustdummy             with p-value 1.34108e-14
Add num__year                      with p-value 1.63149e-13
Add num__largehubairportdest       with p-value 5.15059e-08
Add num__raintracedummy            with p-value 3.68001e-07
Add high_card__originstate         with p-value 4.61725e-06
Add num__temp_n10_0                with p-value 5.76293e-05
Add num__windspeedsquare           with p-value 0.000150945
Add num__destmetrogdppercapita     with p-value 0.000336916
Add num__dayofmonth                with p-value

### Backward Selection

#### Using SFS

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Base pipeline: preprocessing + linear regression
linreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LinearRegression())
])

# Backward selection wrapper
sbs = SFS(
    linreg_pipe,
    k_features="best",                # keep removing until optimal subset
    forward=False,                    # backward elimination
    floating=False,
    scoring="neg_mean_squared_error", # optimize RMSE
    cv=5,
    n_jobs=-1
)

# Fit on raw features
sbs = sbs.fit(X_mend, y_mend_numeric)

# Extract CV scores (negative MSE → RMSE)
cv_scores = list(sbs.get_metric_dict().values())
rmse_scores = [np.sqrt(-d["avg_score"]) for d in cv_scores]

# Plot improvement curve
plt.figure(figsize=(8,5))
plt.plot(range(len(rmse_scores), 0, -1), rmse_scores, marker="o")
plt.xlabel("Number of Features Remaining")
plt.ylabel("CV RMSE")
plt.title("Backward Selection Improvement Curve")
plt.grid(True)
plt.show()

# Best subset
print("Best number of features:", sbs.k_features_)
print("Selected features:", sbs.k_feature_names_)
print("Best CV RMSE:", min(rmse_scores))

#store results
results.append({
    'model': 'Backward Selection Linear Regression',
    'num_features': sbs.k_features_,
    'selected_features': sbs.k_feature_names_,
    'best_cv_rmse': min(rmse_scores)
})

Backward-selected features: ['low_card__uniquecarrier_AS', 'low_card__uniquecarrier_B6', 'low_card__uniquecarrier_CO', 'low_card__uniquecarrier_OO', 'low_card__uniquecarrier_US', 'low_card__uniquecarrier_WN', 'low_card__uniquecarrier_XE', 'high_card__dest', 'high_card__tailnum', 'high_card__origincityname', 'high_card__originstate', 'num__smallhubairportorigin', 'num__mediumhubairportdest', 'num__largehubairportdest', 'num__year', 'num__month', 'num__dayofmonth', 'num__dayofweek', 'num__scheduledhour', 'num__loadfactor', 'num__numflights', 'num__temperature', 'num__temp_0_10', 'num__temp_10_20', 'num__temp_20_30', 'num__windspeedsquare', 'num__windgustdummy', 'num__raindummy', 'num__raintracedummy', 'num__snowdummy', 'num__snowtracedummy', 'num__destmetrogdppercapita']
Backward Selection Model Accuracy (R²): -0.0439
Backward Selection Model RMSE: 1372.7010


#### Using Stepwise

Abandoned due to performance issues. Relying on SFS

In [None]:
# def backward_elimination(X, y, threshold_out=0.05, verbose=True):
#     """Backward elimination based on p-values from statsmodels OLS.
#        Assumes X is a numeric DataFrame (already preprocessed)."""
#     included = list(X.columns)
#     while True:
#         changed = False
#         model = sm.OLS(y, sm.add_constant(X[included])).fit()
#         # exclude intercept
#         pvalues = model.pvalues.iloc[1:]
#         worst_pval = pvalues.max() if not pvalues.empty else None
#         if worst_pval is not None and worst_pval > threshold_out:
#             worst_feature = pvalues.idxmax()
#             included.remove(worst_feature)
#             changed = True
#             if verbose:
#                 print(f'Drop {worst_feature:30} with p-value {worst_pval:.6}')
#         if not changed:
#             break
#     return included

# # --- Transform train/test into numeric DataFrames ---
# preprocessor.fit(X_train, y_train)

# X_train_trans = pd.DataFrame(
#     preprocessor.transform(X_train),
#     columns=preprocessor.get_feature_names_out(),
#     index=X_train.index
# )
# X_test_trans = pd.DataFrame(
#     preprocessor.transform(X_test),
#     columns=preprocessor.get_feature_names_out(),
#     index=X_test.index
# )

# # --- Run backward elimination on transformed data ---
# backward_features = backward_elimination(X_train_trans, y_train)
# print("Backward-selected features:", backward_features)

# # --- Fit final model on selected features ---
# final_model = sm.OLS(y_train, sm.add_constant(X_train_trans[backward_features])).fit()

# # Predict on test set
# y_pred = final_model.predict(sm.add_constant(X_test_trans[backward_features]))

# # --- Evaluate accuracy ---
# r2 = r2_score(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred)

# print(f"Backward Elimination Model Accuracy (R²): {r2:.4f}")
# print(f"Backward Elimination Model RMSE: {rmse:.4f}")

Drop num__distance                  with p-value 0.994829
Drop num__largehubairlinedest       with p-value 0.966156
Drop num__nonhubairlinedest         with p-value 0.999495
Drop num__originmetrogdppercapita   with p-value 0.938646
Drop num__hhidest                   with p-value 0.934772
Drop num__windgustspeed             with p-value 0.929315
Drop num__mediumhubairportorigin    with p-value 0.920231
Drop high_card__origin              with p-value 0.87493
Drop low_card__uniquecarrier_WN     with p-value 0.852824
Drop low_card__uniquecarrier_F9     with p-value 0.897424
Drop low_card__uniquecarrier_US     with p-value 0.85234
Drop num__mediumhubairlinedest      with p-value 0.833208
Drop num__nonhubairportorigin       with p-value 0.828218
Drop low_card__uniquecarrier_YV     with p-value 0.812314
Drop num__hhiorigin                 with p-value 0.666242
Drop num__marketshareorigin         with p-value 0.838139
Drop low_card__uniquecarrier_AS     with p-value 0.678525
Drop num__capaci

### PCR
Principal Component Regression

In [None]:

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# PCR pipeline: preprocessing → PCA → Linear Regression
pcr_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),   # your ColumnTransformer
    ("pca", PCA()),                    # dimensionality reduction
    ("model", LinearRegression())
])

# Grid search over number of components
param_grid = {
    "pca__n_components": [5, 10, 20, 40, 60]  # tune based on dataset size
}

pcr_grid = GridSearchCV(
    pcr_pipe,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

# Fit
pcr_grid.fit(X_train, y_train)

# Best number of components
print("Best n_components:", pcr_grid.best_params_["pca__n_components"])

# Evaluate on test set
y_pred_test = pcr_grid.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"PCR Test R²: {test_r2:.3f}")
print(f"PCR Test RMSE: {test_rmse:.3f}")

# Store results for comparison
results.append({
    "Model": "PCR",
    "Best Params": pcr_grid.best_params_,
    "Test R²": test_r2,
    "Test RMSE": test_rmse
})


PCR R2: -0.07829785712740689
PCR RMSE: 1417.9651431890868


### PLSR
Partial Least Squares Regression

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_numeric, test_size=0.2, random_state=42
)

# PLSR pipeline: preprocessing → PLSRegression
pls_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),   # your ColumnTransformer
    ("model", PLSRegression())
])

# Grid search over number of components
param_grid = {
    "model__n_components": [2, 5, 10, 20, 40]  # tune based on dataset size
}

pls_grid = GridSearchCV(
    pls_pipe,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

# Fit
pls_grid.fit(X_train, y_train)

# Best number of components
print("Best n_components:", pls_grid.best_params_["model__n_components"])

# Evaluate on test set
y_pred_test = pls_grid.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"PLSR Test R²: {test_r2:.3f}")
print(f"PLSR Test RMSE: {test_rmse:.3f}")

# Store results for comparison
results.append({
    "Model": "PLSR",
    "Best Params": pls_grid.best_params_,
    "Test R²": test_r2,
    "Test RMSE": test_rmse
})

PLSR R2: -0.04399321509843657
PLSR RMSE: 1372.854428811666


# Week 4 Notebook - Logistic Regression and Feature Scaling

For Week 4, include concepts such as logistic regression and feature scaling. This homework should be submitted for peer review in the assignment titled 4.3 Peer Review: Week 4 Jupyter Notebook. Complete and submit your Jupyter Notebook homework by 11:59pm ET on Sunday. 

## Mendeley Delay Data

### Log Regression: Basic

In [None]:

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Build pipeline: preprocessing + logistic regression
logreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LogisticRegression(max_iter=1000, solver="liblinear"))
])

# Fit
logreg_pipe.fit(X_train, y_train)

# Predictions
y_pred_train = logreg_pipe.predict(X_train)
y_pred_test = logreg_pipe.predict(X_test)
y_pred_proba = logreg_pipe.predict_proba(X_test)[:, 1]

# Metrics
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Train Accuracy: {train_acc:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")
print(f"Test AUC: {test_auc:.3f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

# Store results
results.append({
    "Model": "Logistic Regression",
    "Train Accuracy": train_acc,
    "Test Accuracy": test_acc,
    "Test AUC": test_auc
})


Logistic Regression Results
Accuracy: 0.6042

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.63      0.66      6088
           1       0.49      0.57      0.53      3912

    accuracy                           0.60     10000
   macro avg       0.59      0.60      0.59     10000
weighted avg       0.62      0.60      0.61     10000



### Log Regression: Random Search

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Pipeline: preprocessing + logistic regression
logreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LogisticRegression(max_iter=5000, solver="saga", penalty="l1"))  
    # saga supports both L1 and L2
])

# Parameter distributions for random search
param_distributions = {
    "model__C": loguniform(1e-3, 1e3),   # inverse regularization strength
    "model__penalty": ["l1", "l2"],      # try both penalties
    "model__solver": ["saga"]            # saga works with both l1 and l2
}

# Randomized search
random_search = RandomizedSearchCV(
    logreg_pipe,
    param_distributions=param_distributions,
    n_iter=20,                # number of random samples
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42
)

# Fit
random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)

# Evaluate on test set
y_pred_test = random_search.predict(X_test)
y_pred_proba = random_search.predict_proba(X_test)[:, 1]

test_acc = accuracy_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Test Accuracy: {test_acc:.3f}")
print(f"Test AUC: {test_auc:.3f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

# Store results
results.append({
    "Model": "Logistic Regression (Random Search)",
    "Best Params": random_search.best_params_,
    "Test Accuracy": test_acc,
    "Test AUC": test_auc
})

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'classifier__C': np.float64(0.008632008168602538), 'classifier__class_weight': 'balanced', 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best CV Score: 0.5688355837791884

Random Search Logistic Regression Results
Accuracy: 0.6279

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.64      0.68      6088
           1       0.52      0.61      0.56      3912

    accuracy                           0.63     10000
   macro avg       0.62      0.62      0.62     10000
weighted avg       0.64      0.63      0.63     10000



### Log Regession: Grid Search

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Pipeline: preprocessing + logistic regression
logreg_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", LogisticRegression(max_iter=5000, solver="saga"))  
    # saga supports both L1 and L2
])

# Grid of hyperparameters
param_grid = {
    "model__C": [0.001, 0.01, 0.1, 1, 10, 100],   # regularization strength
    "model__penalty": ["l1", "l2"]                # L1 (Lasso) or L2 (Ridge)
}

# Grid search
grid = GridSearchCV(
    logreg_pipe,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

# Fit
grid.fit(X_train, y_train)

# Best parameters
print("Best Params:", grid.best_params_)

# Evaluate on test set
y_pred_test = grid.predict(X_test)
y_pred_proba = grid.predict_proba(X_test)[:, 1]

test_acc = accuracy_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Test Accuracy: {test_acc:.3f}")
print(f"Test AUC: {test_auc:.3f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

# Store results
results.append({
    "Model": "Logistic Regression (Grid Search)",
    "Best Params": grid.best_params_,
    "Test Accuracy": test_acc,
    "Test AUC": test_auc
})


# Week 5 - Support Vector Machines

For Week 5, include concepts such as support vector machines, the kernel trick, and regularization for support vector machines. 

## Mendeley Delay Data

### SVM Basic

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Pipeline: preprocessing + SVM (RBF kernel by default)
svm_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", SVC(kernel="rbf", probability=True, random_state=42))
])

# Fit
svm_pipe.fit(X_train, y_train)

# Predictions
y_pred_test = svm_pipe.predict(X_test)
y_pred_proba = svm_pipe.predict_proba(X_test)[:, 1]

# Metrics
print("Basic SVM Accuracy:", accuracy_score(y_test, y_pred_test))
print("Basic SVM AUC:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

#store results
results.append({
    "Model": "SVM",
    "Test Accuracy": accuracy_score(y_test, y_pred_test),
    "Test AUC": roc_auc_score(y_test, y_pred_proba)
})

### SVM Random Search

In [None]:

# Pipeline: preprocessing + SVM
svm_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", SVC(probability=True, random_state=42))
])

# Parameter distributions
param_distributions = {
    "model__kernel": ["linear", "rbf"],
    "model__C": loguniform(1e-3, 1e3),   # regularization strength
    "model__gamma": ["scale", "auto", 0.01, 0.1, 1]  # only relevant for RBF
}

# Randomized search
svm_random = RandomizedSearchCV(
    svm_pipe,
    param_distributions=param_distributions,
    n_iter=20,                # number of random samples
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42
)

# Fit
svm_random.fit(X_train, y_train)

# Best parameters
print("Best Params:", svm_random.best_params_)

# Evaluate on test set
y_pred_test = svm_random.predict(X_test)
y_pred_proba = svm_random.predict_proba(X_test)[:, 1]

print("Random Search SVM Accuracy:", accuracy_score(y_test, y_pred_test))
print("Random Search SVM AUC:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

#store results
results.append({
    "Model": "SVM (Random Search)",
    "Best Params": svm_random.best_params_,
    "Test Accuracy": accuracy_score(y_test, y_pred_test),
    "Test AUC": roc_auc_score(y_test, y_pred_proba)
})

### Kernel Trick: Linear vs. RBF

In [None]:


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Pipeline: preprocessing + SVM
svm_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", SVC(kernel="rbf", probability=True, random_state=42))
])

# Grid of hyperparameters (RBF only)
param_grid = {
    "model__C": [0.01, 0.1, 1, 10, 100],
    "model__gamma": [0.001, 0.01, 0.1, 1, "scale"]
}

# Grid search
svm_grid = GridSearchCV(
    svm_pipe,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    return_train_score=True
)

svm_grid.fit(X_train, y_train)

# Collect results into DataFrame
cv_results = pd.DataFrame(svm_grid.cv_results_)

# Pivot table for heatmap (mean test AUC)
heatmap_data = cv_results.pivot(
    index="param_model__C",
    columns="param_model__gamma",
    values="mean_test_score"
)

# Plot heatmap
plt.figure(figsize=(8,6))
sns.heatmap(heatmap_data, annot=True, fmt=".3f", cmap="viridis")
plt.title("SVM RBF Kernel: AUC across C and gamma")
plt.ylabel("C (Regularization)")
plt.xlabel("Gamma (Kernel Width)")
plt.show()

# Best params + score
print("Best Params:", svm_grid.best_params_)
print("Best CV AUC:", svm_grid.best_score_)

# Week 6 - Decision Trees and Random Forests 

For Week 6, include concepts such as decision trees and random forests.

## Mendeley Delay Data

### Decision Tree Classifier

In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_mend, y_mend_binary_15, test_size=0.2, random_state=42, stratify=y_mend_binary_15
)

# Pipeline: preprocessing + Decision Tree
dt_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", DecisionTreeClassifier(random_state=42))
])

# Grid of hyperparameters
dt_param_grid = {
    "model__max_depth": [3, 5, 10, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 5]
}

# Grid search
dt_grid = GridSearchCV(
    dt_pipe,
    dt_param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

dt_grid.fit(X_train, y_train)

# Evaluate
y_pred_test = dt_grid.predict(X_test)
y_pred_proba = dt_grid.predict_proba(X_test)[:, 1]

print("Decision Tree Best Params:", dt_grid.best_params_)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_test))
print("Decision Tree AUC:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

# Store results
results.append({
    "Model": "Decision Tree",
    "Best Params": dt_grid.best_params_,
    "Test Accuracy": accuracy_score(y_test, y_pred_test),
    "Test AUC": roc_auc_score(y_test, y_pred_proba)
})

### Random Forest Classifier

#### RF w/ Grid Search

In [None]:

# Pipeline: preprocessing + Random Forest
rf_pipe = Pipeline(steps=[
    ("preprocessor", reg_prep_mend),
    ("model", RandomForestClassifier(random_state=42))
])

# Grid of hyperparameters
rf_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [5, 10, None],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2]
}

# Grid search
rf_grid = GridSearchCV(
    rf_pipe,
    rf_param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

rf_grid.fit(X_train, y_train)

# Evaluate
y_pred_test = rf_grid.predict(X_test)
y_pred_proba = rf_grid.predict_proba(X_test)[:, 1]

print("Random Forest Best Params:", rf_grid.best_params_)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_test))
print("Random Forest AUC:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

# Store results
results.append({
    "Model": "Random Forest",
    "Best Params": rf_grid.best_params_,
    "Test Accuracy": accuracy_score(y_test, y_pred_test),
    "Test AUC": roc_auc_score(y_test, y_pred_proba)
})

# Optional: view results table
results_df = pd.DataFrame(results)
print(results_df)

#### RF Feature Importance

In [None]:
# Get feature names after preprocessing
# reg_prep_mend is your ColumnTransformer
feature_names = reg_prep_mend.get_feature_names_out()

# Extract feature importances from the best Random Forest
best_rf = rf_grid.best_estimator_.named_steps["model"]
importances = best_rf.feature_importances_

# Put into DataFrame for sorting
feat_imp_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# Plot top 20 features
plt.figure(figsize=(10,6))
sns.barplot(x="Importance", y="Feature", data=feat_imp_df.head(20), palette="viridis")
plt.title("Random Forest Feature Importances (Top 20)")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Print top features
print(feat_imp_df.head(20))

# Week 7 - Milestone 1

## Review Results

In [None]:
# Convert list of dicts into DataFrame
results_df = pd.DataFrame(results)

# Sort by test_r2 (descending) or test_rmse (ascending)
results_df_sorted = results_df.sort_values(by="test_r2", ascending=False)

print(results_df_sorted)

In [None]:
plt.figure(figsize=(8,5))
sns.barplot(data=results_df, x="model", y="test_r2", palette="viridis")
plt.title("Model Comparison (Test R²)")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(8,5))
sns.barplot(data=results_df, x="model", y="test_rmse", palette="magma")
plt.title("Model Comparison (Test RMSE)")
plt.xticks(rotation=45)
plt.show()

# Scrapyard

Previously completed work no longer needed

## Week 1: USDOT On Time Dataset

In [None]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)


In [None]:
# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

Memory usage before optimization: 3779.75 MB
Memory usage after optimization: 535.69 MB
Reduced by 85.8%


In [None]:
# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

In [None]:
# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

usdot_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

usdot_model.fit(X_train, y_train)
print("R^2 score:", usdot_model.score(X_test, y_test))

Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']
R^2 score: 0.06941058350132379


In [None]:
vif_table = calculate_vif(usdot_df)
print(vif_table)

Dropping constant columns: ['year', 'flights']


KeyboardInterrupt: 

In [None]:
# Remove large datasets
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and len(obj) > 10000:
        del globals()[name]
        print(f"Dropped DataFrame: {name}")


Dropped DataFrame: df
Dropped DataFrame: df_lin
Dropped DataFrame: X
Dropped DataFrame: X_train
Dropped DataFrame: X_test
Dropped DataFrame: combined_df
Dropped DataFrame: usdot_df
Dropped DataFrame: usdot_df_lin


## Week 2: USDOT On Time Dataset

### Data Prep

In [None]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB
Memory usage after optimization: 535.69 MB
Reduced by 85.8%
Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


### Lasso Regression

In [None]:
# Default Lasso 

lasso = Lasso(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso)
])

model.fit(X_train, y_train)
print("R^2 score:", model.score(X_test, y_test))

R^2 score: 0.04595741706259315


In [None]:
# Lasso with tuned alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))

KeyboardInterrupt: 

### Ridge Regression

In [None]:
#Ridge with default alpha

ridge = Ridge(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge)
])

# Option 1: Fit with default alpha
model.fit(X_train, y_train)
print("R^2 score (default alpha):", model.score(X_test, y_test))


R^2 score (default alpha): 0.06941067445046167


In [None]:
# Ridge with tune alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


### Elastic Net

In [None]:
# Replace regressor with ElasticNet
elastic = ElasticNet(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic)
])

# Option 1: Fit with default parameters
model.fit(X_train, y_train)
print("R^2 score (default alpha, l1_ratio):", model.score(X_test, y_test))



R^2 score (default alpha, l1_ratio): 0.04155075820659526


In [None]:
# Option 2: Tune alpha and l1_ratio with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10],
    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


In [None]:
# Remove large datasets
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and len(obj) > 10000:
        del globals()[name]
        print(f"Dropped DataFrame: {name}")


Dropped DataFrame: combined_df
Dropped DataFrame: usdot_df
Dropped DataFrame: usdot_df_lin
Dropped DataFrame: X
Dropped DataFrame: X_train
Dropped DataFrame: X_test


## Week 3: USDOT On Time Dataset

### Data Prep

In [None]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))
SAMPLE_SIZE = 50000

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)
combined_df.columns = combined_df.columns.str.lower()

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

if SAMPLE_SIZE >= 0:
    print("Sampling the dataset to", SAMPLE_SIZE)
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    usdot_df['depdelay_bin'] = pd.cut(usdot_df['dep_delay'], bins=bins, labels=labels)
    usdot_df = (
        usdot_df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(usdot_df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    usdot_df = usdot_df.drop(columns=['depdelay_bin'])

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB


  df[col] = pd.to_datetime(df[col], errors='coerce')


Memory usage after optimization: 552.31 MB
Reduced by 85.4%
Sampling the dataset to 50000


  usdot_df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


  .apply(lambda x: x.sample(


### Forward Selection

In [None]:
# Fit preprocessor with y (important for supervised encoders)
preprocessor.fit(X_train, y_train)

X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

lin_reg = LinearRegression()

sfs_forward = SFS(
    lin_reg,
    k_features='best',
    forward=True,
    floating=False,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

sfs_forward = sfs_forward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_forward.k_feature_idx_]
print("Selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_forward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_forward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)


r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Model Accuracy (R²): {r2:.4f}")
print(f"Model RMSE: {rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  76 out of 107 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:    0.2s finished

[2025-09-28 14:42:18] Features: 1/107 -- score: 0.14339819239562326[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 106 out of 106 | elapsed:    0.2s finished

[2025-09-28 14:42:18] Features: 2/107 -- score: 0.15745076680487888[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  74 out of 105 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    0.3s finished

[2025-09-28 14:42:19] Features: 3/107 -- score: 0.1637377783872315[Para

Selected features: ['low_card__op_unique_carrier_AA', 'low_card__op_unique_carrier_AS', 'low_card__op_unique_carrier_DL', 'low_card__op_unique_carrier_F9', 'low_card__op_unique_carrier_HA', 'low_card__op_unique_carrier_NK', 'low_card__op_unique_carrier_OO', 'low_card__op_unique_carrier_UA', 'low_card__op_unique_carrier_WN', 'low_card__op_unique_carrier_YX', 'low_card__op_carrier_AA', 'low_card__op_carrier_AS', 'low_card__op_carrier_DL', 'low_card__op_carrier_F9', 'low_card__op_carrier_HA', 'low_card__op_carrier_NK', 'low_card__op_carrier_OO', 'low_card__op_carrier_UA', 'low_card__op_carrier_WN', 'low_card__op_carrier_YX', 'low_card__dep_time_blk_0001-0559', 'low_card__dep_time_blk_0600-0659', 'low_card__dep_time_blk_0700-0759', 'low_card__dep_time_blk_0800-0859', 'low_card__dep_time_blk_0900-0959', 'low_card__dep_time_blk_1400-1459', 'low_card__dep_time_blk_1700-1759', 'low_card__dep_time_blk_1800-1859', 'low_card__dep_time_blk_1900-1959', 'low_card__dep_time_blk_2000-2059', 'low_card_


[2025-09-28 14:50:28] Features: 107/107 -- score: 0.18481102822461867

### Backward Selection

In [None]:
# --- Backward selection ---
sfs_backward = SFS(
    lin_reg,
    k_features='best',
    forward=False,
    floating=False,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

# Important: fit preprocessor with y
preprocessor.fit(X_train, y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

sfs_backward = sfs_backward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_backward.k_feature_idx_]
print("Backward-selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_backward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_backward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)

# Metrics
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Backward Selection Model Accuracy (R²): {r2:.4f}")
print(f"Backward Selection Model RMSE: {rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:   55.2s finished

[2025-09-28 14:52:27] Features: 106/1 -- score: 0.18499935318473468[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 106 out of 106 | elapsed:   53.9s finished

[2025-09-28 14:53:21] Features: 105/1 -- score: 0.18516577826394912[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:   51.9s finished

[2025-09-28 14:54:13] Features: 104/1 -- score: 0.18531519681955885[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done

Backward-selected features: ['low_card__op_unique_carrier_AA', 'low_card__op_unique_carrier_AS', 'low_card__op_unique_carrier_B6', 'low_card__op_unique_carrier_F9', 'low_card__op_unique_carrier_G4', 'low_card__op_unique_carrier_NK', 'low_card__op_unique_carrier_YX', 'low_card__op_carrier_DL', 'low_card__op_carrier_HA', 'low_card__op_carrier_MQ', 'low_card__op_carrier_OH', 'low_card__op_carrier_UA', 'low_card__dep_time_blk_0001-0559', 'low_card__dep_time_blk_0600-0659', 'low_card__dep_time_blk_0700-0759', 'low_card__dep_time_blk_0800-0859', 'low_card__dep_time_blk_0900-0959', 'low_card__dep_time_blk_1000-1059', 'low_card__dep_time_blk_1100-1159', 'low_card__dep_time_blk_1200-1259', 'low_card__dep_time_blk_1300-1359', 'low_card__dep_time_blk_1400-1459', 'low_card__dep_time_blk_1500-1559', 'low_card__dep_time_blk_1600-1659', 'low_card__dep_time_blk_1700-1759', 'low_card__dep_time_blk_1900-1959', 'low_card__dep_time_blk_2200-2259', 'low_card__arr_time_blk_0001-0559', 'low_card__arr_time_bl


[2025-09-28 15:15:18] Features: 1/1 -- score: 0.14339819239562326

### PCR
Principal Component Regression

In [None]:
# Build pipeline: preprocessing → PCA → Linear Regression
pcr = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=20)),   # choose number of components
    ('regressor', LinearRegression())
])

pcr.fit(X_train, y_train)

y_pred_pcr = pcr.predict(X_test)

print("PCR R2:", r2_score(y_test, y_pred_pcr))
print("PCR RMSE:", mean_squared_error(y_test, y_pred_pcr))

PCR R2: -0.08136605278516584
PCR RMSE: 3419.3802488901792


### PLSR
Partial Least Squares Regression

## Week 4: USDOT On Time Dataset

### Data Prep

In [None]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))
SAMPLE_SIZE = 50000

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)
combined_df.columns = combined_df.columns.str.lower()

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

if SAMPLE_SIZE >= 0:
    print("Sampling the dataset to", SAMPLE_SIZE)
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    usdot_df['depdelay_bin'] = pd.cut(usdot_df['dep_delay'], bins=bins, labels=labels)
    usdot_df = (
        usdot_df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(usdot_df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    usdot_df = usdot_df.drop(columns=['depdelay_bin'])

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)


# --- Create binary target: 1 if depdelay > 0, else 0 ---
usdot_df_lin['depdelay_binary'] = (usdot_df_lin['dep_delay'] > 0).astype(int)

# Update target variable
X = usdot_df_lin.drop(columns=['dep_delay', 'depdelay_binary'])
y = usdot_df_lin['depdelay_binary']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB


  df[col] = pd.to_datetime(df[col], errors='coerce')


Memory usage after optimization: 552.31 MB
Reduced by 85.4%
Sampling the dataset to 50000


  usdot_df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


  .apply(lambda x: x.sample(


### Log Regression: Basic

In [None]:

# --- Logistic Regression Pipeline ---
log_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        max_iter=1000,       # increase iterations for convergence
        solver='lbfgs',      # robust solver
        n_jobs=-1,            # parallelize
        class_weight='balanced'  # handle class imbalance
    ))
])

# --- Fit model ---
log_reg_pipe.fit(X_train, y_train)

# --- Predict ---
y_pred = log_reg_pipe.predict(X_test)

# --- Evaluate ---
print("\nLogistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Results
Accuracy: 0.6157

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.63      0.67      6230
           1       0.49      0.59      0.54      3770

    accuracy                           0.62     10000
   macro avg       0.60      0.61      0.60     10000
weighted avg       0.63      0.62      0.62     10000



### Log Regression: Random Search

In [None]:
# --- Define parameter distributions ---
param_dist = {
    # Regularization strength (inverse of penalty)
    'classifier__C': loguniform(1e-3, 1e3),
    
    # Penalty type (note: lbfgs only supports l2, saga supports l1/l2/elasticnet)
    'classifier__penalty': ['l2'],
    
    # Try different solvers (must be compatible with penalty)
    'classifier__solver': ['lbfgs', 'saga'],
    
    # Optionally explore class weights
    'classifier__class_weight': ['balanced', None]
}

# --- Randomized Search ---
random_search = RandomizedSearchCV(
    estimator=log_reg_pipe,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations to try
    cv=5,                   # 5-fold cross-validation
    scoring='f1',           # optimize for F1 (better for imbalance than accuracy)
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# --- Fit random search ---
random_search.fit(X_train, y_train)

# --- Best parameters and score ---
print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

# --- Evaluate on test set ---
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nRandom Search Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'classifier__C': np.float64(0.008632008168602538), 'classifier__class_weight': 'balanced', 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best CV Score: 0.5898257289075901

Random Search Logistic Regression Results
Accuracy: 0.6348

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.62      0.68      6230
           1       0.51      0.66      0.58      3770

    accuracy                           0.63     10000
   macro avg       0.63      0.64      0.63     10000
weighted avg       0.66      0.63      0.64     10000



In [None]:

# Preprocess first
X_train_trans = preprocessor.fit_transform(X_train, y_train)
X_test_trans  = preprocessor.transform(X_test)

# Fit PLSR with, say, 10 components
plsr = PLSRegression(n_components=10)
plsr.fit(X_train_trans, y_train)

y_pred_plsr = plsr.predict(X_test_trans)

print("PLSR R2:", r2_score(y_test, y_pred_plsr))
print("PLSR RMSE:", mean_squared_error(y_test, y_pred_plsr))

PLSR R2: -0.06250654912747011
PLSR RMSE: 3359.7447405025227
