In [26]:
# src/train.py
import datetime
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Compare three regressors on the appliance-energy dataset using repeated
# k-fold cross-validation, print per-fold RMSE, and save the scores to CSV.

# 1. Load and prepare the data
df = pd.read_csv("energydata_complete.csv")
df["date"] = pd.to_datetime(df["date"])
df["Appliances"] = pd.to_numeric(df["Appliances"])
df = df.set_index("date")

# Select the target by name and use every other column as a feature, so the
# split does not silently depend on "Appliances" being the first CSV column
# (a positional slice would leak the target into X if the order differed).
y = df["Appliances"]
X = df.drop(columns=["Appliances"])

# 2. Instantiate the models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=32, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=32, random_state=42),
}

print()

# 3. 10-fold CV repeated 3 times -> 30 scores per model
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# 4. Evaluate with RMSE
results = {}
for name, model in models.items():
    # The "neg_*" scorer yields negated RMSE values; flip the sign to get
    # positive RMSE.
    neg_rmse_scores = cross_val_score(
        model,
        X,
        y,
        cv=rkf,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
    )
    rmse_scores = -neg_rmse_scores

    # Stored column = every per-fold score followed by the mean as the final
    # row, so the saved CSV has one more row than there are folds.
    results[name] = np.append(rmse_scores, rmse_scores.mean())

    print(f"{name}:")
    print(f"  → RMSE ({len(rmse_scores)} lần): {np.round(rmse_scores, 2)}")
    print(f"  → Mean RMSE = {rmse_scores.mean():.2f} ± {rmse_scores.std():.2f}\n")

# 5. Save the scores; the file name encodes the tree counts of both ensembles.
# The lookups are hoisted out of the f-string because reusing the enclosing
# quote character inside a replacement field is a SyntaxError before
# Python 3.12.
rf_trees = models["RandomForest"].n_estimators
gb_trees = models["GradientBoosting"].n_estimators

# NOTE(review): this rebinds `df` from the raw dataset to the results table;
# the name is kept in case later cells rely on it — consider renaming.
df = pd.DataFrame(results)
df.to_csv(f"results_{rf_trees}_{gb_trees}.csv", index=False)


LinearRegression:
  → RMSE (30 lần): [ 85.35  96.62  95.47 104.19  94.71  90.27  96.8   88.79  94.61  90.17
  91.52  87.63  89.65  95.03  95.74  90.22  94.66  99.2   91.14 102.86
  97.47  96.53 102.68  99.34  93.14  86.97  88.15  86.37  90.96  95.49]
  → Mean RMSE = 93.72 ± 4.91

RandomForest:
  → RMSE (30 lần): [62.03 72.12 71.34 75.67 73.34 62.54 71.16 64.87 70.42 70.29 64.62 67.41
 63.25 72.85 71.9  64.08 71.39 74.63 70.27 74.6  69.21 71.16 76.81 75.35
 67.6  63.06 67.27 61.79 66.84 71.8 ]
  → Mean RMSE = 69.32 ± 4.38

GradientBoosting:
  → RMSE (30 lần): [ 84.95  95.93  95.27 101.99  95.44  88.7   96.32  87.43  95.06  89.42
  89.99  87.29  88.4   93.79  98.07  88.45  94.86  98.36  89.69 101.25
  97.3   95.42 100.99  98.47  92.4   86.45  88.62  86.06  89.98  93.93]
  → Mean RMSE = 93.01 ± 4.84

