In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
cd /content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final

/content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load data
df_energy = pd.read_csv('data/energy.csv')
X = df_energy.iloc[:, :8]
y = df_energy["Y1"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

SGB

In [8]:
model_sgb = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.03, max_depth=6, random_state=0
)
model_sgb.fit(X_train_scaled, y_train)
y_pred_sgb = model_sgb.predict(X_test_scaled)
rmse_sgb = np.sqrt(mean_squared_error(y_test, y_pred_sgb))
print(f"SGB RMSE: {rmse_sgb:.4f}")

SGB RMSE: 2.3072


SLGB

In [9]:
n_models = 10
preds_test_sglb = []

for seed in range(n_models):
    model = GradientBoostingRegressor(
        n_estimators=50, learning_rate=0.03, max_depth=6,
        subsample=0.9, random_state=seed
    )
    model.fit(X_train_scaled, y_train)
    preds_test_sglb.append(model.predict(X_test_scaled))

y_pred_mean_sglb = np.mean(preds_test_sglb, axis=0)
rmse_sglb = np.sqrt(mean_squared_error(y_test, y_pred_mean_sglb))
print(f"SGLB RMSE: {rmse_sglb:.4f}")

SGLB RMSE: 2.3105


KGB

In [10]:
# Hàm Sample-then-Optimize (KGB)
def sample_then_optimize(X_train, y_train, X_test, n_estimators=50, lr=0.03, max_depth=6, seed=42):
    np.random.seed(seed)

    # Step 1: Sample finit
    finit = np.random.normal(loc=0.0, scale=1.0, size=len(y_train))

    # Step 2: Create new labels
    y_new = y_train - finit

    # Step 3: Train GBDT on residual
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=lr,
        max_depth=max_depth,
        random_state=seed
    )
    model.fit(X_train, y_new)

    # Step 4: Predict and combine
    f_r = model.predict(X_test)
    f_final = finit[:len(f_r)] + f_r
    return f_final, finit[:len(f_r)], f_r

In [12]:
# Run KGB
f_final, finit, f_r = sample_then_optimize(X_train_scaled, y_train, X_test_scaled)

# Evaluate
rmse_kgb = np.sqrt(mean_squared_error(y_test, f_final))

# Output
print(f"SGB RMSE: {rmse_sgb:.4f}")

SGB RMSE: 2.3072


All result

In [14]:
print("Energy Dataset – RMSE Comparison")
print(f"SGB  RMSE: {rmse_sgb:.4f}")
print(f"SGLB RMSE: {rmse_sglb:.4f}")
print(f"KGB  RMSE: {rmse_kgb:.4f}")

Energy Dataset – RMSE Comparison
SGB  RMSE: 2.3072
SGLB RMSE: 2.3105
KGB  RMSE: 2.4937
