In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset
# Import Dataset 
X_train = pd.read_csv('../data/processed/X_train.csv')
X_val = pd.read_csv('../data/processed/X_val.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
y_val = pd.read_csv('../data/processed/y_val.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

In [3]:
from joblib import load

In [4]:
# Load preprocessor
preprocessor= load('../models/preprocessor/preprocessor.pkl')

In [20]:
from lightgbm import LGBMRegressor

# Create a pipeline with the preprocessor and the LGBMRegressor model
lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Step 1: preprocess features
    ('model', LGBMRegressor(
        n_estimators=500,            # Increase n_estimators for a more robust ensemble
        learning_rate=0.05,          # Lower learning rate for better generalization
        num_leaves=50,               # Increase num_leaves for more complex trees
        max_depth=15,                # Limit tree depth to avoid overfitting
        subsample=0.8,               # Use 80% of data for each tree (regularization)
        colsample_bytree=0.8,        # Use 80% of features for each tree
        reg_alpha=0.1,               # L1 regularization to prevent overfitting
        reg_lambda=0.1,              # L2 regularization for more control
        random_state=42              # For reproducibility
    ))
])

## Hyperparameter Tuning

In [None]:

'''
from sklearn.model_selection import GridSearchCV


# Define the parameter grid for tuning
param_grid = {
    'model__n_estimators': [50, 100, 200], 
    'model__learning_rate': [0.1, 0.5],  
    'model__max_depth': [3, 10, 15],  
    'model__num_leaves': [10, 20, 50],  
    'model__subsample': [0.7, 0.8, 1.0],  
    'model__colsample_bytree': [0.7, 0.8, 1.0],  
    'model__min_child_samples': [10, 20, 30],  
    'model__reg_alpha': [0, 0.1, 0.5],  
    'model__reg_lambda': [0, 0.1, 0.5]  
}

# Create the pipeline with LGBMRegressor (make sure 'model' is LGBMRegressor)
lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Step 1: preprocess features
    ('model', LGBMRegressor(random_state=42))  # Step 2: model
])

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(lightgbm_pipeline, param_grid, cv=5, n_jobs=1, verbose=1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
print(f"Best parameters: {grid_search.best_params_}")
'''

In [None]:
# Fit the pipeline on the training data
lightgbm_pipeline.fit(X_train, y_train)

## Evaluating Model

In [21]:
# Fit the pipeline on the training data
lightgbm_pipeline.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 6278775, number of used features: 33
[LightGBM] [Info] Start training from score 338.880605


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [22]:
preds_train = lightgbm_pipeline.predict(X_train)
preds_val = lightgbm_pipeline.predict(X_val)


In [23]:
preds_test = lightgbm_pipeline.predict(X_test)

In [24]:
from my_krml_ratana.models.performance import print_regressor_scores

In [25]:
print_regressor_scores(y_preds=preds_train, y_actuals=y_train, set_name='Training')
print_regressor_scores(y_preds=preds_val, y_actuals=y_val, set_name='Validating')

RMSE Training: 97.45127027935506
MAE Training: 67.09268812330298
RMSE Validating: 97.51157380104233
MAE Validating: 67.15511212344217


In [26]:
print_regressor_scores(y_preds=preds_test, y_actuals=y_test, set_name='Testing')

RMSE Testing: 97.81851385626722
MAE Testing: 67.19320618689217
