# The Machine Learning Workflow

In [2]:
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

### Load Data

In [3]:
# Read the simulation table; the first CSV column is used as the DataFrame index.
data = pd.read_csv("../data/RtmSimulation_kickstart.csv", index_col=0)

#### Define target and features

In [4]:
# Split the column names by dtype: object-typed columns are treated as
# categorical; every remaining column except 'id' and 'lai' is numerical.
categorical_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']
_non_numerical = set(categorical_cols) | {'id', 'lai'}
numerical_cols = [name for name in data.columns if name not in _non_numerical]

In [5]:
# 'lai' is the regression target; every other column of `data` is a predictor.
y = data['lai']
X = data.drop(columns=['lai'])

#### Train-Test Split

In [6]:
# Hold out a test partition (no explicit size is passed, so the library default
# applies); the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Feature Engineering

In [7]:
# Handling missing values
# Missing numerical values are imputed with the per-column median computed on
# the training split only, so no statistic of the test split is used.
#
# Work on explicit copies: the frames returned by train_test_split are slices
# of X, and assigning columns into them can raise pandas'
# SettingWithCopyWarning (and makes it unclear whether X is modified too).
X_train = X_train.copy()
X_test = X_test.copy()

# Compute the training medians once and reuse them for both splits.
train_medians = X_train[numerical_cols].median()
X_train[numerical_cols] = X_train[numerical_cols].fillna(train_medians)
X_test[numerical_cols] = X_test[numerical_cols].fillna(train_medians)

In [8]:
# Ordinal-encode the categorical columns. The encoder learns its categories
# from the training split only and is then applied unchanged to both splits.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[categorical_cols])
X_train[categorical_cols] = ordinal_encoder.transform(X_train[categorical_cols])
X_test[categorical_cols] = ordinal_encoder.transform(X_test[categorical_cols])

# Standardize the numerical columns, again with statistics taken from the
# training split only.
scl = StandardScaler()
scl.fit(X_train[numerical_cols])
X_train[numerical_cols] = scl.transform(X_train[numerical_cols])
X_test[numerical_cols] = scl.transform(X_test[numerical_cols])

## Modelling

In [9]:
# Baseline model: LightGBM regressor with library-default hyperparameters and
# a fixed seed. Note that the name `gbm_model` is reassigned further down.
gbm_model = LGBMRegressor(random_state=42)

In [10]:
# Fit the baseline model on the preprocessed training split; the
# ordinal-encoded columns are declared to LightGBM as categorical features.
gbm_model.fit(
    X_train,
    y_train,
    categorical_feature=set(categorical_cols),
)



In [11]:
# Score the baseline model on the held-out test split.
y_pred_test = gbm_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print("Test MSE:", mse, "Test r2:", test_r2)

(0.4177748963817336, 0.9020821674992869)

In [12]:
# Score the baseline model on the data it was trained on, for comparison with
# the test score.
# NOTE(review): the recorded train score is much higher than the recorded test
# score, which may indicate overfitting — confirm on a fresh run.
y_pred_train = gbm_model.predict(X_train)
mse = mean_squared_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)

print("Train MSE:", mse, "Train r2:", train_r2)

0.999942064691171

## Hyperparameter Optimization

In [66]:
def param_bounds(trial: optuna.Trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: The Optuna trial used to draw each hyperparameter value.

    Returns:
        dict: Keyword arguments for the LightGBM regressor. All entries are
        sampled through ``trial`` except ``random_state``, which is fixed.
    """
    return {
        # Number of boosting rounds, chosen from a fixed grid of values
        "n_estimators": trial.suggest_categorical("n_estimators", [50, 100, 200, 300, 400, 500, 700, 1000]),
        # Sample an integer between 10 and 100
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        # Sample an integer between 10 and 100
        "max_depth": trial.suggest_int("max_depth", 10, 100),
        # Sample a categorical value from the list provided
        "objective": trial.suggest_categorical(
            "objective", ["regression", "regression_l1", "huber"]
        ),
        # Not tuned: a fixed scalar seed (a seed must be a single value, not a
        # list) so that every trial is reproducible.
        "random_state": 42,
        # Sample from a uniform distribution between 0.3 and 1.0
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        # Sample from a uniform distribution between 0 and 100
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100),
        # Sample from a uniform distribution between 0 and 100
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
    }

In [ ]:
def objective(trial: optuna.Trial):
    """Optuna objective: mean cross-validated score of one sampled configuration.

    A LightGBM regressor is built from the hyperparameters drawn by
    ``param_bounds`` and evaluated with shuffled, seeded 3-fold
    cross-validation on the training split. No ``scoring`` is passed, so the
    estimator's default score is used.

    Args:
        trial: The Optuna trial that supplies the hyperparameter values.

    Returns:
        float: Mean of the per-fold cross-validation scores.
    """
    model = LGBMRegressor(**param_bounds(trial))

    # Same folds for every trial, so trials are compared on identical splits
    cv_strategy = KFold(n_splits=3, shuffle=True, random_state=42)

    # Perform cross-validation
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv_strategy,
        fit_params={'categorical_feature': set(categorical_cols)},
    )

    return np.mean(cv_scores)

    
# Seeded TPE sampler (configured with 10 start-up trials) so the search is
# reproducible.
sampler = optuna.samplers.TPESampler(n_startup_trials=10, seed=42)
# Create a study; the objective returns a score where larger is better.
study = optuna.create_study(direction="maximize", sampler=sampler)
# Start the optimization run
study.optimize(objective, n_trials=50, show_progress_bar=True)

fig = optuna.visualization.plot_param_importances(study)
fig.show()

bo_search_trials = study.trials_dataframe()
best_params = study.best_params
best_score = study.best_value
# The study maximizes its value, so the best trials are those with the
# largest "value": sort descending to list the top five.
print(bo_search_trials.sort_values("value", ascending=False).head())
print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

## Cross validation

In [ ]:
# Model with a hard-coded hyperparameter set (presumably the best parameters
# found by an earlier optimization run — TODO confirm). This creates a new,
# unfitted estimator under the name `gbm_model`.
gbm_model = LGBMRegressor(
    random_state=42,
    n_estimators=1000,
    num_leaves=27,
    max_depth=49,
    objective='regression',
    colsample_bytree=0.6566468750879891,
    reg_alpha=0.32912524526992454,
    reg_lambda=5.1141448141261545,
)

In [17]:
# Shuffled, seeded 5-fold split shared by both scoring runs below
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Extra arguments forwarded to every fit call inside cross-validation
cv_fit_kwargs = {'categorical_feature': set(categorical_cols)}

# R² per fold (no scoring argument is given, so the estimator's default score is used)
r2_scores = cross_val_score(
    gbm_model, X_train, y_train, cv=cv_strategy, fit_params=cv_fit_kwargs
)
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# MSE per fold; the scorer reports the negated MSE, so flip the sign back
mse_scores = cross_val_score(
    gbm_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=cv_strategy,
    fit_params=cv_fit_kwargs,
)
positive_mse = -mse_scores
mse_mean = positive_mse.mean()
mse_std = positive_mse.std()

print(f"Mean cross-validation R²: {r2_mean:.3f} +/- {r2_std:.3f}")
print(f"Mean cross-validation MSE: {mse_mean:.3f} +/- {mse_std:.3f}")

(0.9158518675633955, 0.021089492018234252)

In [ ]:
# Predict and evaluate
# cross_val_score only fits clones of the estimator, so `gbm_model` itself is
# still unfitted at this point. Fit it on the full training split first;
# otherwise predict() fails on a fresh kernel.
gbm_model.fit(X_train, y_train, categorical_feature=set(categorical_cols))

y_pred = gbm_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("Test MSE:", mse, "Test r2:", r2_score(y_test, y_pred))

## Explainability

In [ ]:
# Pair every training column with the importance reported by the fitted model.
importances = gbm_model.feature_importances_.ravel()
feat_df = pd.DataFrame({"feature": X_train.columns, "importance": importances})

# Order by absolute importance using a temporary helper column.
feat_df = (
    feat_df.assign(_abs_imp=np.abs(feat_df.importance))
    .sort_values("_abs_imp", ascending=False)
    .drop(columns="_abs_imp")
)

# Keep the 15 most important features and draw them as a bar chart.
feat_df = feat_df.sort_values(by="importance", ascending=False).head(15)
feat_df.plot.bar(x="feature", y="importance", color="blue")