In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final

/content/drive/MyDrive/SDH_UIT/Stage_3/AdvancedBAAnalytics/Final


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [5]:
# -------- 1. Load dữ liệu Protein --------
df = pd.read_csv("data/protein.txt", header=None, sep='\s+')

# Tách feature và label (giả sử cột cuối là nhãn)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# -------- 2. Train-test split --------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------- 3. Chuẩn hóa --------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# -------- 4. SGB --------
model_sgb = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.03, max_depth=6, random_state=0
)
model_sgb.fit(X_train_scaled, y_train)
y_pred_sgb = model_sgb.predict(X_test_scaled)
rmse_sgb = np.sqrt(mean_squared_error(y_test, y_pred_sgb))

In [7]:
# -------- 5. SGLB --------
n_models = 10
preds_test_sglb = []

for seed in range(n_models):
    model = GradientBoostingRegressor(
        n_estimators=50, learning_rate=0.03, max_depth=6,
        subsample=0.9, random_state=seed
    )
    model.fit(X_train_scaled, y_train)
    preds_test_sglb.append(model.predict(X_test_scaled))

y_pred_mean_sglb = np.mean(preds_test_sglb, axis=0)
rmse_sglb = np.sqrt(mean_squared_error(y_test, y_pred_mean_sglb))

In [9]:
# -------- 6. KGB --------
def sample_then_optimize(X_train, y_train, X_test, n_estimators=50, lr=0.03, max_depth=6, seed=42):
    np.random.seed(seed)

    # Step 1: Sample initial function
    finit = np.random.normal(loc=0.0, scale=1.0, size=len(y_train))

    # Step 2: Compute residual
    y_new = y_train - finit

    # Step 3: Fit GBDT on residual
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=lr,
        max_depth=max_depth,
        random_state=seed
    )
    model.fit(X_train, y_new)

    # Step 4: Combine predictions
    f_r = model.predict(X_test)
    f_final = finit[:len(f_r)] + f_r
    return f_final

f_final = sample_then_optimize(X_train_scaled, y_train, X_test_scaled)
rmse_kgb = np.sqrt(mean_squared_error(y_test, f_final))


In [10]:
# -------- 7. In kết quả --------
print("🧬 Protein Dataset – RMSE Comparison")
print(f"SGB  RMSE: {rmse_sgb:.4f}")
print(f"SGLB RMSE: {rmse_sglb:.4f}")
print(f"KGB  RMSE: {rmse_kgb:.4f}")

🧬 Protein Dataset – RMSE Comparison
SGB  RMSE: 4.7914
SGLB RMSE: 4.7861
KGB  RMSE: 4.8967
