In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
import os
project_folder_path = '/content/drive/MyDrive/Kool/sissejuhatus_andmeteadusesse/projekt/Project_C4'
df_train = pd.read_csv(os.path.join(project_folder_path, "df_train.csv"))
df_test = pd.read_csv(os.path.join(project_folder_path, "df_test.csv"))

season_map = {
    "Winter": 0,
    "Spring": 1,
    "Summer": 2,
    "Fall":   3
}

df_train['season'] = df_train['season'].map(season_map)
df_test['season']  = df_test['season'].map(season_map)

cat_features = [
    'day_of_week',
    'is_weekend',
    'season',
]

for c in cat_features:
    df_train[c] = df_train[c].astype(int)
    df_test[c] = df_test[c].astype(int)

df = pd.concat([df_train, df_test], ignore_index=True)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

split_date = "2022-01-01"

In [None]:
!pip install lightgbm -q

In [None]:
import numpy as np

# Basic time components
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['day_of_year'] = df['date'].dt.dayofyear

# Fourier - weekly
df['sin_week'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['cos_week'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Fourier - yearly
df['sin_year'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['cos_year'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

# Trend
df['trend'] = np.arange(len(df))

# Weather interactions
df['temp_x_weekend'] = df['tavg'] * df['is_weekend']
df['rain_x_season'] = df['prcp'] * df['season']
df['wind_x_season'] = df['wspd'] * df['season']
df['temp_x_season'] = df['tavg'] * df['season']
df['month_x_temp'] = df['month'] * df['tavg']

# Weather polynomial
df['tavg2'] = df['tavg']**2
df['prcp2'] = df['prcp']**2
df['wspd2'] = df['wspd']**2

# Lags
df['lag_1'] = df['trips'].shift(1)
df['lag_7'] = df['trips'].shift(7)
df['lag_14'] = df['trips'].shift(14)
df['lag_30'] = df['trips'].shift(30)

# Rolling Means
df['rolling_7'] = df['trips'].rolling(7).mean()
df['rolling_14'] = df['trips'].rolling(14).mean()
df['rolling_30'] = df['trips'].rolling(30).mean()

# Rolling STD
df['rolling_std_7'] = df['trips'].rolling(7).std()
df['rolling_std_14'] = df['trips'].rolling(14).std()
df['rolling_std_30'] = df['trips'].rolling(30).std()

df = df.dropna().reset_index(drop=True)

df_train_new = df[df['date'] < split_date].reset_index(drop=True)
df_test_new  = df[df['date'] >= split_date].reset_index(drop=True)

df_train = df_train_new.copy()
df_test = df_test_new.copy()

features = [
    'median_duration',
    'tavg','tavg2',
    'prcp','prcp2',
    'wspd','wspd2',
    'day_of_week','is_weekend','season',
    'trend','sin_year','cos_year','sin_week','cos_week',
    'temp_x_weekend','rain_x_season','wind_x_season','temp_x_season','month_x_temp',
    'lag_1','lag_7','lag_14','lag_30',
    'rolling_7','rolling_14','rolling_30',
    'rolling_std_7','rolling_std_14','rolling_std_30'
]

X_train = df_train[features]
X_test = df_test[features]

y_train = df_train['trips']
y_test = df_test['trips']


In [None]:
import lightgbm
print(lightgbm.__version__)

4.6.0


In [None]:
import pandas as pd

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import early_stopping

In [None]:
lgbm = LGBMRegressor(
    objective='regression',
    metric='rmse',
    n_estimators=3000,        # many trees + early stopping
    learning_rate=0.03,
    num_leaves=64,
    max_depth=-1,             # no hard limit, controlled by num_leaves
    subsample=0.9,            # row sampling (bagging)
    subsample_freq=1,
    colsample_bytree=0.9,     # feature sampling
    reg_lambda=1.0,           # L2 regularization
    random_state=42,
    n_jobs=-1
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=150)]
)

# Use the best_iteration_ found by early stopping
y_pred = lgbm.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE:  {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²:   {r2:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6328
[LightGBM] [Info] Number of data points in the train set: 3097, number of used features: 30
[LightGBM] [Info] Start training from score 44541.552793
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[213]	valid_0's rmse: 23825.6
MAE:  18,866.06
RMSE: 23,825.62
R²:   0.4937


In [None]:
import matplotlib.pyplot as plt

importances = pd.Series(lgbm.feature_importances_, index=features).sort_values(ascending=False)

print("\nTop 25 Feature Importances:")
print(importances.head(25))


Top 25 Feature Importances:
median_duration    1741
rolling_std_7       992
prcp                917
trend               884
lag_1               793
rolling_7           768
lag_14              578
day_of_week         572
tavg                543
cos_year            483
rolling_std_14      479
rolling_std_30      468
sin_year            462
rain_x_season       428
lag_7               389
month_x_temp        368
wspd                365
lag_30              344
rolling_30          290
rolling_14          286
wind_x_season       285
sin_week            254
temp_x_weekend      180
temp_x_season       156
cos_week            139
dtype: int32
