In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
cd /content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final

/content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final


In [15]:
!dir data

concrete.txt  naval.csv       OOD-naval.txt  YearPredictionMSD.txt
energy.csv    OOD-energy.txt  protein.txt


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [19]:
# Load dữ liệu NAVAL
df = pd.read_csv("data/naval.csv", header=None, sep='\s+')

In [20]:
# Chọn X, y (biến đầu ra là cột cuối cùng)
X = df.iloc[:, :-1].values  # 17 features
y = df.iloc[:, -1].values   # target (18th column)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
### ----- SGB -----
model_sgb = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.03, max_depth=6, random_state=0
)
model_sgb.fit(X_train_scaled, y_train)
y_pred_sgb = model_sgb.predict(X_test_scaled)
rmse_sgb = np.sqrt(mean_squared_error(y_test, y_pred_sgb))


In [23]:
### ----- SGLB (Ensemble GBDTs) -----
n_models = 10
preds_test_sglb = []

for seed in range(n_models):
    model = GradientBoostingRegressor(
        n_estimators=50, learning_rate=0.03, max_depth=6,
        subsample=0.9, random_state=seed
    )
    model.fit(X_train_scaled, y_train)
    preds_test_sglb.append(model.predict(X_test_scaled))

y_pred_mean_sglb = np.mean(preds_test_sglb, axis=0)
rmse_sglb = np.sqrt(mean_squared_error(y_test, y_pred_mean_sglb))

In [26]:
### ----- KGB -----
def sample_then_optimize(X_train, y_train, X_test, n_estimators=50, lr=0.03, max_depth=6, seed=42):
    np.random.seed(seed)

    # Step 1: Sample finit
    finit = np.random.normal(loc=0.0, scale=1.0, size=len(y_train))

    # Step 2: Create new labels
    y_new = y_train - finit

    # Step 3: Train GBDT on residual
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=lr,
        max_depth=max_depth,
        random_state=seed
    )
    model.fit(X_train, y_new)

    # Step 4: Predict and combine
    f_r = model.predict(X_test)
    f_final = finit[:len(f_r)] + f_r
    return f_final

f_final = sample_then_optimize(X_train_scaled, y_train, X_test_scaled)
rmse_kgb = np.sqrt(mean_squared_error(y_test, f_final))

In [27]:
### ----- Kết quả -----
print("📊 Naval Dataset – RMSE Comparison")
print(f"SGB  RMSE: {rmse_sgb:.4f}")
print(f"SGLB RMSE: {rmse_sglb:.4f}")
print(f"KGB  RMSE: {rmse_kgb:.4f}")

📊 Naval Dataset – RMSE Comparison
SGB  RMSE: 0.0044
SGLB RMSE: 0.0044
KGB  RMSE: 0.9871
