# Initial Setup

In [9]:
# Load Virtual Environment

!& "c:\Users\tbran\Python\repos\Semester 3 Repos\capstone\.venv\Scripts\Activate.ps1"


& was unexpected at this time.


In [65]:
# Core
import pandas as pd
import numpy as np

# File handling / utilities
import glob
import os
import re
import unicodedata

# Statsmodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Scikit-learn: preprocessing & pipelines
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline


# Scikit-learn: models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA

# Scikit-learn: model selection & metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, accuracy_score
from scipy.stats import loguniform

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


In [3]:
project_path = 'C:/Users/tbran/Python/repos/Semester 3 Repos/capstone/'
data_path = project_path + 'data/'
src_path = project_path + 'src/'
model_path = project_path + 'models/'

# Helper Functions

## Week 1 Functions

In [None]:
def optimize_dataframe(df, datetime_cols=None, fillna=False):
    """
    Cleans and optimizes a DataFrame:
    - Converts object datetime columns to datetime64
    - Converts object columns with repeated values to category
    - Downcasts numeric columns to smallest safe type
    - Optionally fills NaNs before downcasting
    
    Parameters:
        df (pd.DataFrame): The DataFrame to optimize
        datetime_cols (list): List of column names to convert to datetime
        fillna (bool): If True, fills NaNs before downcasting
    """
    
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage before optimization: {start_mem:.2f} MB")
    
    df = df.copy()
    
    # 1. Convert datetime columns
    if datetime_cols:
        for col in datetime_cols:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # 2. Convert object columns to category if appropriate
    obj_cols = df.select_dtypes(include=['object']).columns
    for col in obj_cols:
        num_unique = df[col].nunique()
        num_total = len(df[col])
        if num_unique / num_total < 0.5:  # heuristic: less than 50% unique
            df[col] = df[col].astype('category')
    
    # 3. Downcast numeric columns
    int_cols = df.select_dtypes(include=['int64', 'int32']).columns
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns
    
    for col in int_cols:
        if fillna and df[col].isnull().any():
            df[col] = df[col].fillna(0)
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in float_cols:
        if fillna and df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mean())
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    print(f"Reduced by {100 * (start_mem - end_mem) / start_mem:.1f}%")
    
    return df


def clean_column_names(df, remove_accents=True):
    """
    Cleans DataFrame column names:
    - Strips whitespace
    - Converts to lowercase
    - Replaces spaces & special chars with underscores
    - Removes duplicate underscores
    - Optionally removes accents
    
    Parameters:
        df (pd.DataFrame): DataFrame whose columns to clean
        remove_accents (bool): If True, strips accents from characters
    
    Returns:
        pd.DataFrame: DataFrame with cleaned column names
    """
    def _clean(col):
        col = col.strip().lower()
        if remove_accents:
            col = ''.join(
                c for c in unicodedata.normalize('NFKD', col)
                if not unicodedata.combining(c)
            )
        col = re.sub(r'[^0-9a-zA-Z]+', '_', col)  # replace non-alphanumeric with _
        col = re.sub(r'_+', '_', col)             # collapse multiple underscores
        col = col.strip('_')                      # remove leading/trailing underscores
        return col
    
    df = df.copy()
    df.columns = [_clean(c) for c in df.columns]
    return df



def build_preprocessing_pipeline(df, target, 
                                  high_card_threshold=20, 
                                  scale_numeric=False):
    """
    Builds a preprocessing pipeline for linear regression:
    - One-hot encodes low-cardinality categorical columns
    - Target encodes high-cardinality categorical columns
    - Optionally scales numeric columns
    
    Parameters:
        df (pd.DataFrame): Input DataFrame (including target column)
        target (str): Name of target column
        high_card_threshold (int): Unique value cutoff for high-cardinality
        scale_numeric (bool): Whether to scale numeric features
        
    Returns:
        pipeline (ColumnTransformer): Preprocessing transformer
        low_card_cols (list): Low-cardinality categorical columns
        high_card_cols (list): High-cardinality categorical columns
        num_cols (list): Numeric columns
    """
    
    # Separate features and target
    X = df.drop(columns=[target])
    
    # Identify column types
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    
    # Split categorical into low/high cardinality
    low_card_cols = [col for col in cat_cols if X[col].nunique() <= high_card_threshold]
    high_card_cols = [col for col in cat_cols if X[col].nunique() > high_card_threshold]
    
    # Transformers
    low_card_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    high_card_transformer = TargetEncoder()
    num_transformer = StandardScaler() if scale_numeric else 'passthrough'
    
    # Column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('low_card', low_card_transformer, low_card_cols),
            ('high_card', high_card_transformer, high_card_cols),
            ('num', num_transformer, num_cols)
        ]
    )
    
    return preprocessor, low_card_cols, high_card_cols, num_cols



def calculate_vif(df, features=None, vif_thresh=10.0):
    """
    Calculate Variance Inflation Factor (VIF) safely:
    - Removes constant columns
    - Removes perfectly collinear columns
    - Returns sorted VIF table
    
    Parameters:
        df (pd.DataFrame): DataFrame with numeric features
        features (list): Optional list of features to check; defaults to all numeric
        vif_thresh (float): Threshold for flagging high VIF
    
    Returns:
        pd.DataFrame: VIF table
    """
    # Select numeric columns if features not provided
    if features is None:
        features = df.select_dtypes(include=[np.number]).columns.tolist()
    
    X = df[features].copy()
    
    # 1. Drop constant columns
    constant_cols = [col for col in X.columns if X[col].nunique() <= 1]
    if constant_cols:
        print(f"Dropping constant columns: {constant_cols}")
        X.drop(columns=constant_cols, inplace=True)
    
    # 2. Drop perfectly collinear columns
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    perfect_corr_cols = [col for col in upper.columns if any(upper[col] == 1.0)]
    if perfect_corr_cols:
        print(f"Dropping perfectly collinear columns: {perfect_corr_cols}")
        X.drop(columns=perfect_corr_cols, inplace=True)
    
    # 3. Calculate VIF
    X_const = X.assign(const=1)
    vif_data = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(X_const.values, i) for i in range(len(X.columns))]
    })
    
    # 4. Sort by VIF
    vif_data.sort_values(by="VIF", ascending=False, inplace=True)
    
    # 5. Flag high VIF
    vif_data["High_VIF"] = vif_data["VIF"] > vif_thresh
    
    return vif_data

def fit_polynomial_regression(X, y, degree=2):
    """
    Fits a polynomial regression model and returns the fitted model and transformed features.
    """
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    model = LinearRegression()
    model.fit(X_poly, y)
    return model, poly

def add_interaction_terms(df, features):
    """
    Adds pairwise interaction terms between given features.
    """
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    interaction_array = poly.fit_transform(df[features])
    interaction_df = pd.DataFrame(interaction_array, columns=poly.get_feature_names_out(features))
    return pd.concat([df.reset_index(drop=True), interaction_df], axis=1)

def preprocess_features(df, categorical_cols, numeric_cols):
    """
    Returns a ColumnTransformer that one-hot encodes categorical columns
    and passes numeric columns through unchanged.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
            ('num', 'passthrough', numeric_cols)
        ]
    )
    return preprocessor

def regression_summary(X, y):
    """
    Fits an OLS regression model using statsmodels and prints the summary.
    """
    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()
    return model.summary()

# Week 1 Notebook – Linear Regression 1
Each week, you will apply the concepts of that week to your Integrated Capstone Project’s dataset. In preparation for Milestone One, create a Jupyter Notebook (similar to in Module B, semester two) that illustrates these lessons. There are no specific questions to answer in your Jupyter Notebook files in this course; your general goal is to analyze your data, using the methods you have learned about in this course and in this program, and draw interesting conclusions. 

For Week 1, include concepts such as linear regression with polynomial terms, interaction terms, multicollinearity, variance inflation factor and regression, and categorical and continuous features. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday. 

## Mendeley Delay Dataset

In [5]:
file_name = 'MendeleyDelayData.csv'
df = pd.read_csv(data_path + file_name)

In [6]:
df = optimize_dataframe(
    df,
    datetime_cols=['scheduleddepartdatetime'],
    fillna=True
)
df = clean_column_names(df)

Memory usage before optimization: 1008.24 MB
Memory usage after optimization: 150.66 MB
Reduced by 85.1%


In [7]:
# Get column categories

id_cols = ['originairportid', 'destairportid', ]
cat_cols = ['origin', 'dest', 'uniquecarrier', 'tailnum', 'origincityname', 'originstate', ]
date_cols = ['scheduleddepartdatetime', ]
target_cols = ['depdelay','arrdelay',]
feature_cols = [col for col in df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

In [8]:
# drop leakage columns for linear regression
df_lin = df.drop(columns=['arrdelay'] + id_cols + date_cols).copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    df_lin, 
    target='depdelay', 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = df_lin.drop(columns=['depdelay'])
y = df_lin['depdelay']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)
print("R^2 score:", model.score(X_test, y_test))

Low-cardinality categorical: ['uniquecarrier']
High-cardinality categorical: ['origin', 'dest', 'tailnum', 'origincityname', 'originstate']
Numeric columns: ['marketshareorigin', 'marketsharedest', 'hhiorigin', 'hhidest', 'nonhubairportorigin', 'smallhubairportorigin', 'mediumhubairportorigin', 'largehubairportorigin', 'nonhubairportdest', 'smallhubairportdest', 'mediumhubairportdest', 'largehubairportdest', 'nonhubairlineorigin', 'smallhubairlineorigin', 'mediumhubairlineorigin', 'largehubairlineorigin', 'nonhubairlinedest', 'smallhubairlinedest', 'mediumhubairlinedest', 'largehubairlinedest', 'year', 'month', 'dayofmonth', 'dayofweek', 'scheduledhour', 'capacity', 'loadfactor', 'numflights', 'distance', 'monopolyroute', 'temperature', 'temp_ninfty_n10', 'temp_n10_0', 'temp_0_10', 'temp_10_20', 'temp_20_30', 'temp_30_40', 'temp_40_infty', 'windspeed', 'windspeedsquare', 'windgustdummy', 'windgustspeed', 'raindummy', 'raintracedummy', 'snowdummy', 'snowtracedummy', 'originmetropop', 'o

In [9]:
vif_table = calculate_vif(df)
print(vif_table)

  vif = 1. / (1. - r_squared_i)


                    feature           VIF  High_VIF
13      largehubairportdest           inf      True
12     mediumhubairportdest           inf      True
10        nonhubairportdest           inf      True
11      smallhubairportdest           inf      True
7     smallhubairportorigin           inf      True
6       nonhubairportorigin           inf      True
18        nonhubairlinedest           inf      True
19      smallhubairlinedest           inf      True
16   mediumhubairlineorigin           inf      True
17    largehubairlineorigin           inf      True
20     mediumhubairlinedest           inf      True
21      largehubairlinedest           inf      True
15    smallhubairlineorigin           inf      True
14      nonhubairlineorigin           inf      True
8    mediumhubairportorigin  9.007199e+15      True
9     largehubairportorigin  9.007199e+15      True
39               temp_20_30  3.112541e+02      True
38               temp_10_20  2.888918e+02      True
37          

## USDOT On Time Dataset

In [10]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)


In [11]:
# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

Memory usage before optimization: 3779.75 MB
Memory usage after optimization: 535.69 MB
Reduced by 85.8%


In [12]:
# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

In [13]:
# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

usdot_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

usdot_model.fit(X_train, y_train)
print("R^2 score:", usdot_model.score(X_test, y_test))

Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']
R^2 score: 0.06941058350132379


In [14]:
vif_table = calculate_vif(usdot_df)
print(vif_table)

Dropping constant columns: ['year', 'flights']
                    feature           VIF  High_VIF
11          dest_airport_id  5.861101e+09      True
12      dest_airport_seq_id  5.861097e+09      True
6         origin_airport_id  5.727668e+09      True
7     origin_airport_seq_id  5.727665e+09      True
35      actual_elapsed_time  5.031438e+03      True
36                 air_time  4.704893e+03      True
29            arr_delay_new  7.782086e+02      True
18                dep_delay  4.491302e+02      True
19            dep_delay_new  3.941995e+02      True
39            carrier_delay  2.318702e+02      True
28                arr_delay  1.658624e+02      True
43      late_aircraft_delay  1.573233e+02      True
30                arr_del15  1.516176e+02      True
34         crs_elapsed_time  1.389059e+02      True
22                 taxi_out  1.003038e+02      True
37                 distance  8.611603e+01      True
41                nas_delay  5.612771e+01      True
40            wea

In [15]:
# Remove large datasets
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and len(obj) > 10000:
        del globals()[name]
        print(f"Dropped DataFrame: {name}")


Dropped DataFrame: df
Dropped DataFrame: df_lin
Dropped DataFrame: X
Dropped DataFrame: X_train
Dropped DataFrame: X_test
Dropped DataFrame: combined_df
Dropped DataFrame: usdot_df
Dropped DataFrame: usdot_df_lin


# Week 2 Notebook - Linear Regression 2

For Week 2, include concepts such as linear regression with lasso, ridge, and elastic net regression. This homework will be submitted for peer review and feedback in Week 3 in the assignment titled 3.4 Peer Review: Week 2 Jupyter Notebook. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday.

## Mendeley Delay Data

### Data Prep

In [16]:
file_name = 'MendeleyDelayData.csv'
df = pd.read_csv(data_path + file_name)

df = optimize_dataframe(
    df,
    datetime_cols=['scheduleddepartdatetime'],
    fillna=True
)
df = clean_column_names(df)

# Get column categories

id_cols = ['originairportid', 'destairportid', ]
cat_cols = ['origin', 'dest', 'uniquecarrier', 'tailnum', 'origincityname', 'originstate', ]
date_cols = ['scheduleddepartdatetime', ]
target_cols = ['depdelay','arrdelay',]
feature_cols = [col for col in df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
df_lin = df.drop(columns=['arrdelay'] + id_cols + date_cols).copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    df_lin, 
    target='depdelay', 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = df_lin.drop(columns=['depdelay'])
y = df_lin['depdelay']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Memory usage before optimization: 1008.24 MB
Memory usage after optimization: 150.66 MB
Reduced by 85.1%
Low-cardinality categorical: ['uniquecarrier']
High-cardinality categorical: ['origin', 'dest', 'tailnum', 'origincityname', 'originstate']
Numeric columns: ['marketshareorigin', 'marketsharedest', 'hhiorigin', 'hhidest', 'nonhubairportorigin', 'smallhubairportorigin', 'mediumhubairportorigin', 'largehubairportorigin', 'nonhubairportdest', 'smallhubairportdest', 'mediumhubairportdest', 'largehubairportdest', 'nonhubairlineorigin', 'smallhubairlineorigin', 'mediumhubairlineorigin', 'largehubairlineorigin', 'nonhubairlinedest', 'smallhubairlinedest', 'mediumhubairlinedest', 'largehubairlinedest', 'year', 'month', 'dayofmonth', 'dayofweek', 'scheduledhour', 'capacity', 'loadfactor', 'numflights', 'distance', 'monopolyroute', 'temperature', 'temp_ninfty_n10', 'temp_n10_0', 'temp_0_10', 'temp_10_20', 'temp_20_30', 'temp_30_40', 'temp_40_infty', 'windspeed', 'windspeedsquare', 'windgustdu

### Lasso Regression

In [17]:
# Default Lasso 

lasso = Lasso(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso)
])

model.fit(X_train, y_train)
print("R^2 score:", model.score(X_test, y_test))

R^2 score: 0.0337074407107556


In [18]:
# Lasso with tuned alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))

Best alpha: 0.01
Best CV R^2: 0.04403471123674032
Test R^2: 0.04379222831327789


### Ridge Regression

In [19]:
#Ridge with default alpha

ridge = Ridge(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge)
])

# Option 1: Fit with default alpha
model.fit(X_train, y_train)
print("R^2 score (default alpha):", model.score(X_test, y_test))


R^2 score (default alpha): 0.04376642307060696


In [20]:
# Ridge with tune alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


Best alpha: 1000
Best CV R^2: 0.043972332094467756
Test R^2: 0.04377132510993642


### Elastic Net

In [7]:
# Replace regressor with ElasticNet
elastic = ElasticNet(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic)
])

# Option 1: Fit with default parameters
model.fit(X_train, y_train)
print("R^2 score (default alpha, l1_ratio):", model.score(X_test, y_test))



R^2 score (default alpha, l1_ratio): 0.033545260335402505


In [8]:
# Option 2: Tune alpha and l1_ratio with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.1, 10],
    'regressor__l1_ratio': [0.1, 0.5, 0.9]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


KeyboardInterrupt: 

In [24]:
# Remove large datasets
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and len(obj) > 10000:
        del globals()[name]
        print(f"Dropped DataFrame: {name}")


Dropped DataFrame: df
Dropped DataFrame: df_lin
Dropped DataFrame: X
Dropped DataFrame: X_train
Dropped DataFrame: X_test


## USDOT On Time Dataset

### Data Prep

In [25]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB
Memory usage after optimization: 535.69 MB
Reduced by 85.8%
Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


### Lasso Regression

In [26]:
# Default Lasso 

lasso = Lasso(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso)
])

model.fit(X_train, y_train)
print("R^2 score:", model.score(X_test, y_test))

R^2 score: 0.04595741706259315


In [27]:
# Lasso with tuned alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))

KeyboardInterrupt: 

### Ridge Regression

In [28]:
#Ridge with default alpha

ridge = Ridge(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge)
])

# Option 1: Fit with default alpha
model.fit(X_train, y_train)
print("R^2 score (default alpha):", model.score(X_test, y_test))


R^2 score (default alpha): 0.06941067445046167


In [None]:
# Ridge with tune alpha with cross-validation
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_['regressor__alpha'])
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


### Elastic Net

In [29]:
# Replace regressor with ElasticNet
elastic = ElasticNet(max_iter=10000, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic)
])

# Option 1: Fit with default parameters
model.fit(X_train, y_train)
print("R^2 score (default alpha, l1_ratio):", model.score(X_test, y_test))



R^2 score (default alpha, l1_ratio): 0.04155075820659526


In [None]:
# Option 2: Tune alpha and l1_ratio with cross-validation
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10],
    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV R^2:", grid.best_score_)
print("Test R^2:", grid.score(X_test, y_test))


In [30]:
# Remove large datasets
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame) and len(obj) > 10000:
        del globals()[name]
        print(f"Dropped DataFrame: {name}")


Dropped DataFrame: combined_df
Dropped DataFrame: usdot_df
Dropped DataFrame: usdot_df_lin
Dropped DataFrame: X
Dropped DataFrame: X_train
Dropped DataFrame: X_test


# Week 3 Notebook - Linear Regression 3

For Week 3, include concepts such as linear regression with forward and backward selection, PCR, and PLSR. Complete your Jupyter Notebook homework by 11:59 pm ET on Sunday. 

## Mendeley Delay Data

### Data Prep
Added a sampling step because this is taking way too long

In [16]:
file_name = 'MendeleyDelayData.csv'
SAMPLE_SIZE = 50000
df = pd.read_csv(data_path + file_name)

df = optimize_dataframe(
    df,
    datetime_cols=['scheduleddepartdatetime'],
    fillna=True
)
df = clean_column_names(df)

if SAMPLE_SIZE:
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    df['depdelay_bin'] = pd.cut(df['depdelay'], bins=bins, labels=labels)


    df = (
        df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    df = df.drop(columns=['depdelay_bin'])


# Get column categories
id_cols = ['originairportid', 'destairportid', ]
cat_cols = ['origin', 'dest', 'uniquecarrier', 'tailnum', 'origincityname', 'originstate', ]
date_cols = ['scheduleddepartdatetime', ]
target_cols = ['depdelay','arrdelay',]
feature_cols = [col for col in df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
df_lin = df.drop(columns=['arrdelay'] + id_cols + date_cols).copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    df_lin, 
    target='depdelay', 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = df_lin.drop(columns=['depdelay'])
y = df_lin['depdelay']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Memory usage before optimization: 1008.24 MB
Memory usage after optimization: 150.66 MB
Reduced by 85.1%


  df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['uniquecarrier']
High-cardinality categorical: ['origin', 'dest', 'tailnum', 'origincityname', 'originstate']
Numeric columns: ['marketshareorigin', 'marketsharedest', 'hhiorigin', 'hhidest', 'nonhubairportorigin', 'smallhubairportorigin', 'mediumhubairportorigin', 'largehubairportorigin', 'nonhubairportdest', 'smallhubairportdest', 'mediumhubairportdest', 'largehubairportdest', 'nonhubairlineorigin', 'smallhubairlineorigin', 'mediumhubairlineorigin', 'largehubairlineorigin', 'nonhubairlinedest', 'smallhubairlinedest', 'mediumhubairlinedest', 'largehubairlinedest', 'year', 'month', 'dayofmonth', 'dayofweek', 'scheduledhour', 'capacity', 'loadfactor', 'numflights', 'distance', 'monopolyroute', 'temperature', 'temp_ninfty_n10', 'temp_n10_0', 'temp_0_10', 'temp_10_20', 'temp_20_30', 'temp_30_40', 'temp_40_infty', 'windspeed', 'windspeedsquare', 'windgustdummy', 'windgustspeed', 'raindummy', 'raintracedummy', 'snowdummy', 'snowtracedummy', 'originmetropop', 'o

  .apply(lambda x: x.sample(


### Forward Selection

#### Using SFS

In [None]:
# Fit preprocessor with y (important for supervised encoders)
preprocessor.fit(X_train, y_train)

X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

lin_reg = LinearRegression()

sfs_forward = SFS(
    lin_reg,
    k_features='best',
    forward=True,
    floating=False,
    scoring='r2',
    cv=5,
    n_jobs=-1
)

sfs_forward = sfs_forward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_forward.k_feature_idx_]
print("Selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_forward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_forward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)


r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Model Accuracy (R²): {r2:.4f}")
print(f"Model RMSE: {rmse:.4f}")

Selected features: ['low_card__uniquecarrier_9E', 'low_card__uniquecarrier_B6', 'low_card__uniquecarrier_CO', 'low_card__uniquecarrier_DL', 'low_card__uniquecarrier_FL', 'low_card__uniquecarrier_MQ', 'low_card__uniquecarrier_UA', 'low_card__uniquecarrier_XE', 'high_card__dest', 'high_card__tailnum', 'high_card__origincityname', 'high_card__originstate', 'num__smallhubairportorigin', 'num__nonhubairportdest', 'num__largehubairportdest', 'num__year', 'num__month', 'num__dayofmonth', 'num__dayofweek', 'num__scheduledhour', 'num__loadfactor', 'num__numflights', 'num__temperature', 'num__temp_ninfty_n10', 'num__temp_n10_0', 'num__windspeedsquare', 'num__windgustdummy', 'num__raindummy', 'num__raintracedummy', 'num__snowdummy', 'num__snowtracedummy', 'num__destmetrogdppercapita']
Model Accuracy (R²): -0.0439
Model RMSE: 1372.6907


#### Using Stepwise Function

In [26]:
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Forward selection based on p-values from statsmodels OLS"""
    included = []
    while True:
        changed = False
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min() if not new_pval.empty else None
        if best_pval is not None and best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add {best_feature:30} with p-value {best_pval:.6}')
        if not changed:
            break
    return included
# Transform train/test into numeric DataFrames
preprocessor.fit(X_train, y_train)

X_train_trans = pd.DataFrame(
    preprocessor.transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index
)
X_test_trans = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index
)

# Run forward selection
forward_features = forward_selection(X_train_trans, y_train)
print("Forward-selected features:", forward_features)

# Fit final model
final_model = sm.OLS(y_train, sm.add_constant(X_train_trans[forward_features])).fit()

# Predict on test set
y_pred = final_model.predict(sm.add_constant(X_test_trans[forward_features]))

# Evaluate accuracy
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Forward Selection Model Accuracy (R²): {r2:.4f}")
print(f"Forward Selection Model RMSE: {rmse:.4f}")

Add high_card__tailnum             with p-value 0.0
Add num__scheduledhour             with p-value 1.07019e-156
Add num__raindummy                 with p-value 7.40738e-87
Add high_card__dest                with p-value 6.44347e-59
Add high_card__origincityname      with p-value 6.17954e-47
Add num__snowdummy                 with p-value 6.48087e-32
Add num__snowtracedummy            with p-value 3.82627e-16
Add num__numflights                with p-value 7.88944e-15
Add num__windgustdummy             with p-value 1.34108e-14
Add num__year                      with p-value 1.63149e-13
Add num__largehubairportdest       with p-value 5.15059e-08
Add num__raintracedummy            with p-value 3.68001e-07
Add high_card__originstate         with p-value 4.61725e-06
Add num__temp_n10_0                with p-value 5.76293e-05
Add num__windspeedsquare           with p-value 0.000150945
Add num__destmetrogdppercapita     with p-value 0.000336916
Add num__dayofmonth                with p-value

### Backward Selection

#### Using SFS

In [28]:
# --- Backward selection ---
sfs_backward = SFS(
    lin_reg,
    k_features='best',
    forward=False,
    floating=False,
    scoring='r2',
    cv=5,
    n_jobs=-1
)

# Important: fit preprocessor with y
preprocessor.fit(X_train, y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

sfs_backward = sfs_backward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_backward.k_feature_idx_]
print("Backward-selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_backward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_backward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)

# Metrics
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Backward Selection Model Accuracy (R²): {r2:.4f}")
print(f"Backward Selection Model RMSE: {rmse:.4f}")

Backward-selected features: ['low_card__uniquecarrier_AS', 'low_card__uniquecarrier_B6', 'low_card__uniquecarrier_CO', 'low_card__uniquecarrier_OO', 'low_card__uniquecarrier_US', 'low_card__uniquecarrier_WN', 'low_card__uniquecarrier_XE', 'high_card__dest', 'high_card__tailnum', 'high_card__origincityname', 'high_card__originstate', 'num__smallhubairportorigin', 'num__mediumhubairportdest', 'num__largehubairportdest', 'num__year', 'num__month', 'num__dayofmonth', 'num__dayofweek', 'num__scheduledhour', 'num__loadfactor', 'num__numflights', 'num__temperature', 'num__temp_0_10', 'num__temp_10_20', 'num__temp_20_30', 'num__windspeedsquare', 'num__windgustdummy', 'num__raindummy', 'num__raintracedummy', 'num__snowdummy', 'num__snowtracedummy', 'num__destmetrogdppercapita']
Backward Selection Model Accuracy (R²): -0.0439
Backward Selection Model RMSE: 1372.7010


#### Using Stepwise

In [32]:
def backward_elimination(X, y, threshold_out=0.05, verbose=True):
    """Backward elimination based on p-values from statsmodels OLS.
       Assumes X is a numeric DataFrame (already preprocessed)."""
    included = list(X.columns)
    while True:
        changed = False
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        # exclude intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() if not pvalues.empty else None
        if worst_pval is not None and worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Drop {worst_feature:30} with p-value {worst_pval:.6}')
        if not changed:
            break
    return included

# --- Transform train/test into numeric DataFrames ---
preprocessor.fit(X_train, y_train)

X_train_trans = pd.DataFrame(
    preprocessor.transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index
)
X_test_trans = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index
)

# --- Run backward elimination on transformed data ---
backward_features = backward_elimination(X_train_trans, y_train)
print("Backward-selected features:", backward_features)

# --- Fit final model on selected features ---
final_model = sm.OLS(y_train, sm.add_constant(X_train_trans[backward_features])).fit()

# Predict on test set
y_pred = final_model.predict(sm.add_constant(X_test_trans[backward_features]))

# --- Evaluate accuracy ---
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Backward Elimination Model Accuracy (R²): {r2:.4f}")
print(f"Backward Elimination Model RMSE: {rmse:.4f}")

Drop num__distance                  with p-value 0.994829
Drop num__largehubairlinedest       with p-value 0.966156
Drop num__nonhubairlinedest         with p-value 0.999495
Drop num__originmetrogdppercapita   with p-value 0.938646
Drop num__hhidest                   with p-value 0.934772
Drop num__windgustspeed             with p-value 0.929315
Drop num__mediumhubairportorigin    with p-value 0.920231
Drop high_card__origin              with p-value 0.87493
Drop low_card__uniquecarrier_WN     with p-value 0.852824
Drop low_card__uniquecarrier_F9     with p-value 0.897424
Drop low_card__uniquecarrier_US     with p-value 0.85234
Drop num__mediumhubairlinedest      with p-value 0.833208
Drop num__nonhubairportorigin       with p-value 0.828218
Drop low_card__uniquecarrier_YV     with p-value 0.812314
Drop num__hhiorigin                 with p-value 0.666242
Drop num__marketshareorigin         with p-value 0.838139
Drop low_card__uniquecarrier_AS     with p-value 0.678525
Drop num__capaci

### PCR
Principal Component Regression

In [33]:
# Build pipeline: preprocessing → PCA → Linear Regression
pcr = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=20)),   # choose number of components
    ('regressor', LinearRegression())
])

pcr.fit(X_train, y_train)

y_pred_pcr = pcr.predict(X_test)

print("PCR R2:", r2_score(y_test, y_pred_pcr))
print("PCR RMSE:", mean_squared_error(y_test, y_pred_pcr))

PCR R2: -0.07829785712740689
PCR RMSE: 1417.9651431890868


### PLSR
Partial Least Squares Regression

In [34]:

# Preprocess first
X_train_trans = preprocessor.fit_transform(X_train, y_train)
X_test_trans  = preprocessor.transform(X_test)

# Fit PLSR with, say, 10 components
plsr = PLSRegression(n_components=10)
plsr.fit(X_train_trans, y_train)

y_pred_plsr = plsr.predict(X_test_trans)

print("PLSR R2:", r2_score(y_test, y_pred_plsr))
print("PLSR RMSE:", mean_squared_error(y_test, y_pred_plsr))

PLSR R2: -0.04399321509843657
PLSR RMSE: 1372.854428811666


## USDOT On Time Dataset

### Data Prep

In [49]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))
SAMPLE_SIZE = 50000

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)
combined_df.columns = combined_df.columns.str.lower()

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

if SAMPLE_SIZE >= 0:
    print("Sampling the dataset to", SAMPLE_SIZE)
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    usdot_df['depdelay_bin'] = pd.cut(usdot_df['dep_delay'], bins=bins, labels=labels)
    usdot_df = (
        usdot_df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(usdot_df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    usdot_df = usdot_df.drop(columns=['depdelay_bin'])

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

X = usdot_df_lin.drop(columns=TARGET_COLUMN)
y = usdot_df_lin[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB


  df[col] = pd.to_datetime(df[col], errors='coerce')


Memory usage after optimization: 552.31 MB
Reduced by 85.4%
Sampling the dataset to 50000


  usdot_df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


  .apply(lambda x: x.sample(


### Forward Selection

In [51]:
# Fit preprocessor with y (important for supervised encoders)
preprocessor.fit(X_train, y_train)

X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

lin_reg = LinearRegression()

sfs_forward = SFS(
    lin_reg,
    k_features='best',
    forward=True,
    floating=False,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

sfs_forward = sfs_forward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_forward.k_feature_idx_]
print("Selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_forward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_forward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)


r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Model Accuracy (R²): {r2:.4f}")
print(f"Model RMSE: {rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  76 out of 107 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:    0.2s finished

[2025-09-28 14:42:18] Features: 1/107 -- score: 0.14339819239562326[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 106 out of 106 | elapsed:    0.2s finished

[2025-09-28 14:42:18] Features: 2/107 -- score: 0.15745076680487888[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  74 out of 105 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    0.3s finished

[2025-09-28 14:42:19] Features: 3/107 -- score: 0.1637377783872315[Para

Selected features: ['low_card__op_unique_carrier_AA', 'low_card__op_unique_carrier_AS', 'low_card__op_unique_carrier_DL', 'low_card__op_unique_carrier_F9', 'low_card__op_unique_carrier_HA', 'low_card__op_unique_carrier_NK', 'low_card__op_unique_carrier_OO', 'low_card__op_unique_carrier_UA', 'low_card__op_unique_carrier_WN', 'low_card__op_unique_carrier_YX', 'low_card__op_carrier_AA', 'low_card__op_carrier_AS', 'low_card__op_carrier_DL', 'low_card__op_carrier_F9', 'low_card__op_carrier_HA', 'low_card__op_carrier_NK', 'low_card__op_carrier_OO', 'low_card__op_carrier_UA', 'low_card__op_carrier_WN', 'low_card__op_carrier_YX', 'low_card__dep_time_blk_0001-0559', 'low_card__dep_time_blk_0600-0659', 'low_card__dep_time_blk_0700-0759', 'low_card__dep_time_blk_0800-0859', 'low_card__dep_time_blk_0900-0959', 'low_card__dep_time_blk_1400-1459', 'low_card__dep_time_blk_1700-1759', 'low_card__dep_time_blk_1800-1859', 'low_card__dep_time_blk_1900-1959', 'low_card__dep_time_blk_2000-2059', 'low_card_


[2025-09-28 14:50:28] Features: 107/107 -- score: 0.18481102822461867

### Backward Selection

In [52]:
# --- Backward selection ---
sfs_backward = SFS(
    lin_reg,
    k_features='best',
    forward=False,
    floating=False,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2
)

# Important: fit preprocessor with y
preprocessor.fit(X_train, y_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans  = preprocessor.transform(X_test)

sfs_backward = sfs_backward.fit(X_train_trans, y_train)

# Map indices back to feature names
feature_names = preprocessor.get_feature_names_out()
selected_features = [feature_names[i] for i in sfs_backward.k_feature_idx_]
print("Backward-selected features:", selected_features)

# --- Evaluate model accuracy ---
# Restrict to selected features
X_train_sel = X_train_trans[:, sfs_backward.k_feature_idx_]
X_test_sel  = X_test_trans[:, sfs_backward.k_feature_idx_]

# Fit final model
lin_reg.fit(X_train_sel, y_train)
y_pred = lin_reg.predict(X_test_sel)

# Metrics
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Backward Selection Model Accuracy (R²): {r2:.4f}")
print(f"Backward Selection Model RMSE: {rmse:.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:   55.2s finished

[2025-09-28 14:52:27] Features: 106/1 -- score: 0.18499935318473468[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 106 out of 106 | elapsed:   53.9s finished

[2025-09-28 14:53:21] Features: 105/1 -- score: 0.18516577826394912[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:   51.9s finished

[2025-09-28 14:54:13] Features: 104/1 -- score: 0.18531519681955885[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done

Backward-selected features: ['low_card__op_unique_carrier_AA', 'low_card__op_unique_carrier_AS', 'low_card__op_unique_carrier_B6', 'low_card__op_unique_carrier_F9', 'low_card__op_unique_carrier_G4', 'low_card__op_unique_carrier_NK', 'low_card__op_unique_carrier_YX', 'low_card__op_carrier_DL', 'low_card__op_carrier_HA', 'low_card__op_carrier_MQ', 'low_card__op_carrier_OH', 'low_card__op_carrier_UA', 'low_card__dep_time_blk_0001-0559', 'low_card__dep_time_blk_0600-0659', 'low_card__dep_time_blk_0700-0759', 'low_card__dep_time_blk_0800-0859', 'low_card__dep_time_blk_0900-0959', 'low_card__dep_time_blk_1000-1059', 'low_card__dep_time_blk_1100-1159', 'low_card__dep_time_blk_1200-1259', 'low_card__dep_time_blk_1300-1359', 'low_card__dep_time_blk_1400-1459', 'low_card__dep_time_blk_1500-1559', 'low_card__dep_time_blk_1600-1659', 'low_card__dep_time_blk_1700-1759', 'low_card__dep_time_blk_1900-1959', 'low_card__dep_time_blk_2200-2259', 'low_card__arr_time_blk_0001-0559', 'low_card__arr_time_bl


[2025-09-28 15:15:18] Features: 1/1 -- score: 0.14339819239562326

### PCR
Principal Component Regression

In [53]:
# Build pipeline: preprocessing → PCA → Linear Regression
pcr = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=20)),   # choose number of components
    ('regressor', LinearRegression())
])

pcr.fit(X_train, y_train)

y_pred_pcr = pcr.predict(X_test)

print("PCR R2:", r2_score(y_test, y_pred_pcr))
print("PCR RMSE:", mean_squared_error(y_test, y_pred_pcr))

PCR R2: -0.08136605278516584
PCR RMSE: 3419.3802488901792


### PLSR
Partial Least Squares Regression

In [55]:

# Preprocess first
X_train_trans = preprocessor.fit_transform(X_train, y_train)
X_test_trans  = preprocessor.transform(X_test)

# Fit PLSR with, say, 10 components
plsr = PLSRegression(n_components=10)
plsr.fit(X_train_trans, y_train)

y_pred_plsr = plsr.predict(X_test_trans)

print("PLSR R2:", r2_score(y_test, y_pred_plsr))
print("PLSR RMSE:", mean_squared_error(y_test, y_pred_plsr))

PLSR R2: -0.06250654912747011
PLSR RMSE: 3359.7447405025227


# Week 4 Notebook - Logistic Regression and Feature Scaling

For Week 4, include concepts such as logistic regression and feature scaling. This homework should be submitted for peer review in the assignment titled 4.3 Peer Review: Week 4 Jupyter Notebook. Complete and submit your Jupyter Notebook homework by 11:59pm ET on Sunday. 

## Mendeley Delay Data

### Data Prep

In [69]:
file_name = 'MendeleyDelayData.csv'
SAMPLE_SIZE = 50000
df = pd.read_csv(data_path + file_name)

df = optimize_dataframe(
    df,
    datetime_cols=['scheduleddepartdatetime'],
    fillna=True
)
df = clean_column_names(df)

if SAMPLE_SIZE:
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    df['depdelay_bin'] = pd.cut(df['depdelay'], bins=bins, labels=labels)


    df = (
        df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    df = df.drop(columns=['depdelay_bin'])


# Get column categories
id_cols = ['originairportid', 'destairportid', ]
cat_cols = ['origin', 'dest', 'uniquecarrier', 'tailnum', 'origincityname', 'originstate', ]
date_cols = ['scheduleddepartdatetime', ]
target_cols = ['depdelay','arrdelay',]
feature_cols = [col for col in df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
df_lin = df.drop(columns=['arrdelay'] + id_cols + date_cols).copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    df_lin, 
    target='depdelay', 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)

# --- Create binary target: 1 if depdelay > 0, else 0 ---
df_lin['depdelay_binary'] = (df_lin['depdelay'] > 0).astype(int)

# Update target variable
X = df_lin.drop(columns=['depdelay', 'depdelay_binary'])
y = df_lin['depdelay_binary']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Memory usage before optimization: 1008.24 MB
Memory usage after optimization: 150.66 MB
Reduced by 85.1%


  df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['uniquecarrier']
High-cardinality categorical: ['origin', 'dest', 'tailnum', 'origincityname', 'originstate']
Numeric columns: ['marketshareorigin', 'marketsharedest', 'hhiorigin', 'hhidest', 'nonhubairportorigin', 'smallhubairportorigin', 'mediumhubairportorigin', 'largehubairportorigin', 'nonhubairportdest', 'smallhubairportdest', 'mediumhubairportdest', 'largehubairportdest', 'nonhubairlineorigin', 'smallhubairlineorigin', 'mediumhubairlineorigin', 'largehubairlineorigin', 'nonhubairlinedest', 'smallhubairlinedest', 'mediumhubairlinedest', 'largehubairlinedest', 'year', 'month', 'dayofmonth', 'dayofweek', 'scheduledhour', 'capacity', 'loadfactor', 'numflights', 'distance', 'monopolyroute', 'temperature', 'temp_ninfty_n10', 'temp_n10_0', 'temp_0_10', 'temp_10_20', 'temp_20_30', 'temp_30_40', 'temp_40_infty', 'windspeed', 'windspeedsquare', 'windgustdummy', 'windgustspeed', 'raindummy', 'raintracedummy', 'snowdummy', 'snowtracedummy', 'originmetropop', 'o

  .apply(lambda x: x.sample(


### Log Regression: Basic

In [70]:

# --- Logistic Regression Pipeline ---
log_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        max_iter=1000,       # increase iterations for convergence
        solver='lbfgs',      # robust solver
        n_jobs=-1,            # parallelize
        class_weight='balanced'  # handle class imbalance
    ))
])

# --- Fit model ---
log_reg_pipe.fit(X_train, y_train)

# --- Predict ---
y_pred = log_reg_pipe.predict(X_test)

# --- Evaluate ---
print("\nLogistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Results
Accuracy: 0.6042

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.63      0.66      6088
           1       0.49      0.57      0.53      3912

    accuracy                           0.60     10000
   macro avg       0.59      0.60      0.59     10000
weighted avg       0.62      0.60      0.61     10000



### Log Regression: Random Search

In [71]:
# --- Define parameter distributions ---
param_dist = {
    # Regularization strength (inverse of penalty)
    'classifier__C': loguniform(1e-3, 1e3),
    
    # Penalty type (note: lbfgs only supports l2, saga supports l1/l2/elasticnet)
    'classifier__penalty': ['l2'],
    
    # Try different solvers (must be compatible with penalty)
    'classifier__solver': ['lbfgs', 'saga'],
    
    # Optionally explore class weights
    'classifier__class_weight': ['balanced', None]
}

# --- Randomized Search ---
random_search = RandomizedSearchCV(
    estimator=log_reg_pipe,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations to try
    cv=5,                   # 5-fold cross-validation
    scoring='f1',           # optimize for F1 (better for imbalance than accuracy)
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# --- Fit random search ---
random_search.fit(X_train, y_train)

# --- Best parameters and score ---
print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

# --- Evaluate on test set ---
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nRandom Search Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'classifier__C': np.float64(0.008632008168602538), 'classifier__class_weight': 'balanced', 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best CV Score: 0.5688355837791884

Random Search Logistic Regression Results
Accuracy: 0.6279

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.64      0.68      6088
           1       0.52      0.61      0.56      3912

    accuracy                           0.63     10000
   macro avg       0.62      0.62      0.62     10000
weighted avg       0.64      0.63      0.63     10000



## USDOT On Time Dataset

### Data Prep

In [63]:
# Use glob to find all matching CSV files
all_files = glob.glob(os.path.join(data_path, "T_ONTIME_REPORTING_2025*.csv"))
SAMPLE_SIZE = 50000

# Read and combine them
dfs = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dfs, ignore_index=True)

print("Files combined:", len(all_files))
print("Final shape:", combined_df.shape)

# Drop diverted columns
combined_df = combined_df.drop(combined_df.filter(regex=r"^DIV\d+").columns, axis=1)
combined_df.columns = combined_df.columns.str.lower()

usdot_df = optimize_dataframe(
    combined_df,
    datetime_cols=['fl_date'],
    fillna=True
)
usdot_df = clean_column_names(usdot_df)

if SAMPLE_SIZE >= 0:
    print("Sampling the dataset to", SAMPLE_SIZE)
    bins = [-np.inf, -1, 0, 15, 60, 180, np.inf]
    labels = ['early', 'on_time', 'small_delay', 'moderate_delay', 'long_delay', 'extreme_delay']

    usdot_df['depdelay_bin'] = pd.cut(usdot_df['dep_delay'], bins=bins, labels=labels)
    usdot_df = (
        usdot_df.groupby('depdelay_bin', group_keys=False)
        .apply(lambda x: x.sample(
            n=min(int(SAMPLE_SIZE * len(x) / len(usdot_df)), len(x)), 
            random_state=42
        ))
        .reset_index(drop=True)
    )

    usdot_df = usdot_df.drop(columns=['depdelay_bin'])

# Get column categories

id_cols = ['op_carrier_airline_id', 'origin_airport_id', 'origin_airport_seq_id', 'origin_city_market_id', 'origin_state_fips', 'origin_wac', 'dest_airport_id', 'dest_airport_seq_id', 'dest_city_market_id', 'dest_state_fips', 'dest_wac', 'crs_dep_time', 'crs_arr_time']
cat_cols = ['op_unique_carrier', 'op_carrier', 'tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm', 'dest_state_fips', 'dest_wac', 'dep_time_blk', 'arr_time_blk', 'cancellation_code,']
date_cols = ['fl_date', ]
target_cols = ['dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group', 'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
feature_cols = [col for col in usdot_df.columns if col not in id_cols + cat_cols + date_cols + target_cols]

# drop leakage columns for linear regression
TARGET_COLUMN = 'dep_delay'
leakage_cols = [x for x in target_cols if x != TARGET_COLUMN]
usdot_df_lin = usdot_df.drop(leakage_cols + id_cols + date_cols, axis=1, errors="ignore").copy()

preprocessor, low_card, high_card, num_cols = build_preprocessing_pipeline(
    usdot_df_lin, 
    target=TARGET_COLUMN, 
    high_card_threshold=20, 
    scale_numeric=True
)

print("Low-cardinality categorical:", low_card)
print("High-cardinality categorical:", high_card)
print("Numeric columns:", num_cols)


# --- Create binary target: 1 if depdelay > 0, else 0 ---
usdot_df_lin['depdelay_binary'] = (usdot_df_lin['dep_delay'] > 0).astype(int)

# Update target variable
X = usdot_df_lin.drop(columns=['dep_delay', 'depdelay_binary'])
y = usdot_df_lin['depdelay_binary']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]
  dfs = [pd.read_csv(f) for f in all_files]


Files combined: 5
Final shape: (2906929, 109)
Memory usage before optimization: 3779.75 MB


  df[col] = pd.to_datetime(df[col], errors='coerce')


Memory usage after optimization: 552.31 MB
Reduced by 85.4%
Sampling the dataset to 50000


  usdot_df.groupby('depdelay_bin', group_keys=False)


Low-cardinality categorical: ['op_unique_carrier', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'cancellation_code']
High-cardinality categorical: ['tail_num', 'origin', 'origin_city_name', 'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr', 'dest_state_nm']
Numeric columns: ['year', 'quarter', 'month', 'day_of_month', 'day_of_week', 'op_carrier_fl_num', 'dep_time', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'distance_group', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'div_airport_landings', 'div_reached_dest', 'div_actual_elapsed_time', 'div_arr_delay', 'div_distance']


  .apply(lambda x: x.sample(


### Log Regression: Basic

In [64]:

# --- Logistic Regression Pipeline ---
log_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        max_iter=1000,       # increase iterations for convergence
        solver='lbfgs',      # robust solver
        n_jobs=-1,            # parallelize
        class_weight='balanced'  # handle class imbalance
    ))
])

# --- Fit model ---
log_reg_pipe.fit(X_train, y_train)

# --- Predict ---
y_pred = log_reg_pipe.predict(X_test)

# --- Evaluate ---
print("\nLogistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Results
Accuracy: 0.6157

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.63      0.67      6230
           1       0.49      0.59      0.54      3770

    accuracy                           0.62     10000
   macro avg       0.60      0.61      0.60     10000
weighted avg       0.63      0.62      0.62     10000



### Log Regression: Random Search

In [66]:
# --- Define parameter distributions ---
param_dist = {
    # Regularization strength (inverse of penalty)
    'classifier__C': loguniform(1e-3, 1e3),
    
    # Penalty type (note: lbfgs only supports l2, saga supports l1/l2/elasticnet)
    'classifier__penalty': ['l2'],
    
    # Try different solvers (must be compatible with penalty)
    'classifier__solver': ['lbfgs', 'saga'],
    
    # Optionally explore class weights
    'classifier__class_weight': ['balanced', None]
}

# --- Randomized Search ---
random_search = RandomizedSearchCV(
    estimator=log_reg_pipe,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations to try
    cv=5,                   # 5-fold cross-validation
    scoring='f1',           # optimize for F1 (better for imbalance than accuracy)
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# --- Fit random search ---
random_search.fit(X_train, y_train)

# --- Best parameters and score ---
print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

# --- Evaluate on test set ---
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nRandom Search Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'classifier__C': np.float64(0.008632008168602538), 'classifier__class_weight': 'balanced', 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best CV Score: 0.5898257289075901

Random Search Logistic Regression Results
Accuracy: 0.6348

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.62      0.68      6230
           1       0.51      0.66      0.58      3770

    accuracy                           0.63     10000
   macro avg       0.63      0.64      0.63     10000
weighted avg       0.66      0.63      0.64     10000



# Week 5 - Support Vector Machines

For Week 5, include concepts such as support vector machines, the kernel trick, and regularization for support vector machines. 

## Mendeley Delay Data

## USDOT On Time Dataset