In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [11]:
df = pd.read_csv('dataset/processed/jfk_encoded_2024.csv')

df.head()

Unnamed: 0,day_of_month,departure_delay,air_time,distance,temperature_celsius,precipitation_mm,cloud_cover_percentage,wind_speed_mps,airline_carrier_target_enc,destination_airport_id_target_enc,...,weather_heavy_snow,weather_light_drizzle,weather_mainly_clear,weather_moderate_drizzle,weather_moderate_rain,weather_moderate_snow,weather_overcast,weather_partly_cloudy,weather_slight_rain,weather_slight_snow
0,1,-3.0,0.913204,0.838999,-1.787289,-0.1953,0.237618,-0.360681,8.391779,9.218631,...,False,False,False,False,False,False,False,True,False,False
1,1,56.0,0.859902,0.838999,-1.776315,-0.1953,-0.432984,-0.304389,8.550816,9.039462,...,False,False,True,False,False,False,False,False,False,False
2,1,-4.0,0.877669,0.838999,-1.776315,-0.1953,-0.432984,-0.304389,8.550816,9.039462,...,False,False,True,False,False,False,False,False,False,False
3,1,-8.0,0.895436,0.838999,-1.611712,-0.1953,1.02384,-0.332535,8.414581,8.634657,...,False,False,False,False,False,False,True,False,False,False
4,1,4.0,0.877669,0.838999,-1.161796,-0.1953,0.353239,0.624422,8.643352,9.0598,...,False,False,False,False,False,False,False,True,False,False


In [12]:
X = df.drop(columns=['departure_delay'])
y = df['departure_delay']

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'Support Vector Regression': SVR(),
    'LightGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'CatBoost': cb.CatBoostRegressor(verbose=0)
}

In [15]:
results = {}

In [None]:
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_scaled, y)
    
    y_pred = model.predict(X_scaled)

    rmse = np.sqrt(mean_squared_error(y, y_pred))
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    
    results[name] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

Training Linear Regression...
Training Random Forest...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Training Gradient Boosting...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)


Training AdaBoost...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Training Support Vector Regression...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)


Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1198
[LightGBM] [Info] Number of data points in the train set: 27840, number of used features: 30
[LightGBM] [Info] Start training from score 14.420654


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)


Training XGBoost...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)


Training CatBoost...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  rmsle = np.sqrt(np.mean(np.log1p(y) - np.log1p(y_pred))**2)


In [17]:
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by='R2', ascending=False)

In [18]:
print("Model performance metrics:")
print(results_df)

Model performance metrics:
                                 RMSE        MAE        R2  RMSLE
Random Forest               17.867735   7.314582  0.890577    NaN
XGBoost                     27.174596  13.910430  0.746897    inf
CatBoost                    35.487195  16.655385  0.568367    inf
LightGBM                    40.332433  17.417931  0.442454    inf
Gradient Boosting           46.310917  20.169188  0.264914    inf
Linear Regression           52.306609  24.587681  0.062255    inf
Support Vector Regression   54.776694  18.518146 -0.028403    inf
AdaBoost                   100.147148  80.064152 -2.437547    inf


In [19]:
results_df.to_csv('dataset/model_performance.csv')