In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# -------------------- Load Dataset --------------------
# Machine-specific absolute path, kept under one name so it is easy to
# repoint when this cell is run from another machine or directory.
DATA_PATH = "/Users/zhongyucheng/Desktop/Spring2025/ACMS60890/final_project/Python codes/UPDATED_CODES/updated_white_box/energydata_cleaned.csv"
df = pd.read_csv(DATA_PATH)

# -------------------- Define Features --------------------
target = 'Appliances_log_smooth'
# Every column except the 'date' column and the target itself.
# NOTE(review): if the cleaned CSV still carries other columns derived from
# the appliance load (e.g. an unsmoothed or untransformed version of the
# target), they end up in features_full and leak the target into the
# full-feature model -- confirm against the CSV header.
features_full = [col for col in df.columns if col not in ['date', target]]

# Top-5 features from correlation analysis
features_top5 = ['T2', 'T1', 'T3', 'T8', 'T4']

# -------------------- Train/Test Split --------------------
X_full = df[features_full]
X_top5 = df[features_top5]
y = df[target]

# Split exactly once and derive the top-5 matrices from the full-feature
# partitions by column selection. Both models are then guaranteed to train
# and test on the same rows (and against the same y_train / y_test), instead
# of relying on two independent train_test_split calls happening to shuffle
# identically because they share random_state and row count.
# NOTE(review): the split is a random shuffle; if the rows are time-ordered
# and the target is a smoothed series, neighbouring rows may land on both
# sides of the split -- confirm a chronological split is not needed.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42
)
X_train_top5 = X_train_full[features_top5]
X_test_top5 = X_test_full[features_top5]

# -------------------- Train Models --------------------
# Full-feature white-box
model_full = LinearRegression()
model_full.fit(X_train_full, y_train)
y_pred_full = model_full.predict(X_test_full)

# Top-5-feature white-box
model_top5 = LinearRegression()
model_top5.fit(X_train_top5, y_train)
y_pred_top5 = model_top5.predict(X_test_top5)

# -------------------- Evaluation Function --------------------
def evaluate_model(y_true, y_pred, label):
    """Print and return RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
        label: Heading printed above the metric lines.

    Returns:
        A dict with the keys ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises TypeError. np.sqrt yields a NumPy scalar, so convert
    # back to a plain float to keep the returned dict printing as bare numbers.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{label} Evaluation:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R²:   {r2:.4f}")
    return {"RMSE": rmse, "MAE": mae, "R2": r2}

# -------------------- Evaluate Both Models --------------------
# Score each model's predictions against the same held-out targets; the
# helper prints a per-model report and returns the metrics as a dict.
results_full = evaluate_model(
    y_true=y_test, y_pred=y_pred_full, label="Full-Feature White-Box Model"
)
results_top5 = evaluate_model(
    y_true=y_test, y_pred=y_pred_top5, label="Top-5-Feature White-Box Model"
)

# -------------------- Output --------------------
# Recap of the raw metric dictionaries, one line per model.
print("\nSummary:")
for heading, metrics in (
    ("Full-Feature Model:", results_full),
    ("Top-5-Feature Model:", results_top5),
):
    print(heading, metrics)


Full-Feature White-Box Model Evaluation:
  RMSE: 0.6105
  MAE:  0.4727
  R²:   0.6267

Top-5-Feature White-Box Model Evaluation:
  RMSE: 0.9131
  MAE:  0.7366
  R²:   0.1648

Summary:
Full-Feature Model: {'RMSE': 0.610460846091488, 'MAE': 0.47271231526774987, 'R2': 0.6266998643704618}
Top-5-Feature Model: {'RMSE': 0.9131164142172654, 'MAE': 0.7366122742822002, 'R2': 0.1647916670759506}
