# In [None]:
# wine_quality_main.py
# Overview: complete workflow from a linear baseline to XGBoost (can be run step by step in a Jupyter Notebook)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# xgboost is optional: `has_xgb` records whether it can be used, and the
# XGBoost section of the script is skipped when it cannot.
try:
    import xgboost as xgb
except ImportError:
    # Package is simply not installed.
    has_xgb = False
    print("未检测到 xgboost，若需使用请安装：pip install xgboost")
except Exception as exc:
    # Package is present but unusable (for example its native library failed
    # to load). Stay best-effort, but show the real cause instead of
    # suggesting an install that would not help.
    has_xgb = False
    print(f"xgboost 导入失败，已跳过 XGBoost 部分：{exc!r}")
else:
    has_xgb = True

# ---------- Configuration ----------
DATA_PATH = "data/winequality-red.csv"  # change this to your own path
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Create both output folders up front so later save calls cannot fail on a
# missing directory.
for _out_dir in ("results/figures", "results/metrics"):
    os.makedirs(_out_dir, exist_ok=True)

# ---------- 1. Load the data ----------
# The UCI wine data set is usually semicolon-separated.
df = pd.read_csv(DATA_PATH, delimiter=';')
print("数据形状:", df.shape)
print(df.head())

# ---------- 2. Exploratory plots (distribution & correlation) ----------
# Summary statistics for every column.
print(df.describe())

# Target distribution: one unit-wide bin per integer quality score.
q_min, q_max = int(df['quality'].min()), int(df['quality'].max())
plt.figure(figsize=(6,4))
# align='left' centres each bar on its score; the default would draw the bar
# for score q between q and q + 1.
plt.hist(df['quality'], bins=range(q_min, q_max + 2), align='left', rwidth=0.8)
plt.xticks(range(q_min, q_max + 1))
plt.xlabel("quality")
plt.ylabel("count")
plt.title("Quality distribution")
plt.savefig("results/figures/quality_distribution.png", bbox_inches='tight')
plt.close()

# Correlation-matrix heat map.
# Only numeric columns take part, so an unexpected text column cannot break
# the correlation computation.
corr = df.select_dtypes(include="number").corr()
fig = plt.figure(figsize=(10,8))
# Draw into the figure created just above. A hard-coded fignum=1 would target
# whichever figure happens to be number 1 when other figures are still open
# (e.g. in an interactive notebook session).
plt.matshow(corr, fignum=fig.number)
# Label both axes with the column names so the matrix can actually be read.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=45, ha='left')
plt.yticks(tick_positions, corr.columns)
plt.colorbar()
plt.title("Correlation matrix (visual)", y=1.2)
plt.savefig("results/figures/correlation_matrix.png", bbox_inches='tight')
plt.close()

# ---------- 3. Handle missing values ----------
# Report how many values are missing per column.
print("缺失值统计：\n", df.isnull().sum())
# Example strategy: fill a small number of gaps with the column median; with
# many gaps, dropping rows/columns may be the better choice.
# NOTE(review): the medians are computed over the full data set. If the data
# really contains gaps, consider imputing from the training rows only.
df = df.fillna(df.median())

# ---------- 4. Separate features and target ----------
X = df.drop(columns=['quality'])
y = df['quality']

# ---------- 5. Train/test split ----------
# Split BEFORE scaling so that the held-out rows never influence the
# statistics learned by the scaler (avoids train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Feature scaling (StandardScaler, the 9/29 task): fit on the training rows
# only, then apply the very same transform to every other matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix on the same scale; the cross-validation step further
# down the script reads it.
X_scaled = scaler.transform(X)

# Persist the scaler for later reuse.
joblib.dump(scaler, "results/scaler.joblib")

print("训练集大小:", X_train.shape, "测试集大小:", X_test.shape)

# ---------- 6. Baseline model: Linear Regression ----------
# fit() hands back the estimator itself, so construction and training chain.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

def evaluate_model(y_true, y_pred, prefix="model"):
    """Print and return RMSE, MAE and R2 for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        prefix: Label printed in front of the metrics line.

    Returns:
        A dict with the keys ``"rmse"``, ``"mae"`` and ``"r2"``.
    """
    # Take the square root explicitly: the ``squared=False`` shortcut of
    # mean_squared_error is deprecated since scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{prefix} -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")
    return {"rmse": rmse, "mae": mae, "r2": r2}

# Score the baseline on the held-out set and persist the fitted model.
metrics_lr = evaluate_model(y_test, y_pred_lr, prefix="LinearRegression")
joblib.dump(lr, "results/lr_model.joblib")

# Predicted-vs-true scatter plot; the dashed diagonal marks perfect predictions.
true_lo, true_hi = y_test.min(), y_test.max()
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred_lr, alpha=0.6)
plt.plot([true_lo, true_hi], [true_lo, true_hi], 'k--')
plt.title("LinearRegression: Pred vs True")
plt.xlabel("True quality")
plt.ylabel("Predicted quality")
plt.savefig("results/figures/pred_vs_true_lr.png", bbox_inches='tight')
plt.close()

# ---------- 7. Train a RandomForest ----------
rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
metrics_rf = evaluate_model(y_test, y_pred_rf, prefix="RandomForest")
joblib.dump(rf, "results/rf_model.joblib")

# Feature importances, labelled with the original column names and ranked
# from most to least important.
feat_importances = rf.feature_importances_
feat_names = X.columns
fi_df = pd.DataFrame({"feature": feat_names, "importance": feat_importances})
fi_df = fi_df.sort_values(by="importance", ascending=False)
fi_df.to_csv("results/metrics/rf_feature_importance.csv", index=False)
print(fi_df)

# Horizontal bar chart; rows are reversed so the top-ranked feature ends up
# at the top of the plot.
fi_reversed = fi_df.iloc[::-1]
plt.figure(figsize=(8,5))
plt.barh(fi_reversed['feature'], fi_reversed['importance'])
plt.title("Random Forest Feature Importance")
plt.xlabel("importance")
plt.tight_layout()
plt.savefig("results/figures/rf_feature_importance.png", bbox_inches='tight')
plt.close()

# ---------- 8. Cross-validation (KFold) example ----------
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# The scorer returns the NEGATED RMSE (higher is better), so the sign is
# flipped again when reporting the mean.
cv_scores = cross_val_score(
    estimator=rf,
    X=X_scaled,
    y=y,
    cv=kf,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
print("RandomForest 5-fold CV RMSE (negated):", cv_scores)
print("平均 CV RMSE:", -cv_scores.mean())

# ---------- 9. XGBoost training and evaluation (only when xgboost is installed) ----------
# Default to "no result"; the summary step skips models whose metrics are None.
metrics_xgb = None
if has_xgb:
    xgbr = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    xgbr.fit(X_train, y_train)
    y_pred_xgb = xgbr.predict(X_test)
    metrics_xgb = evaluate_model(y_test, y_pred_xgb, prefix="XGBoost")
    joblib.dump(xgbr, "results/xgb_model.joblib")

# ---------- 10. GridSearchCV example (RandomForest tuning, starting point for the 10/4 work) ----------
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 8, 12],
    min_samples_split=[2, 5],
)
gsearch = GridSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
gsearch.fit(X_train, y_train)
print("GridSearch 最佳参数：", gsearch.best_params_)

# Evaluate the best estimator found by the search on the held-out test set.
best_rf = gsearch.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
metrics_best_rf = evaluate_model(y_test, y_pred_best_rf, prefix="RF (GridSearch best)")

# The whole search object is saved, not just the best estimator.
joblib.dump(gsearch, "results/rf_gridsearch.joblib")

# ---------- 11. Save the final comparison table ----------
results_summary = {
    "LinearRegression": metrics_lr,
    "RandomForest": metrics_rf,
    "RandomForest_GridSearch": metrics_best_rf,
    "XGBoost": metrics_xgb
}
# One CSV row per model; models without metrics (value None) are left out.
rows = [
    {"model": model_name, "rmse": scores["rmse"], "mae": scores["mae"], "r2": scores["r2"]}
    for model_name, scores in results_summary.items()
    if scores is not None
]
pd.DataFrame(rows).to_csv("results/metrics/model_comparison.csv", index=False)
print("已保存 model_comparison.csv")

# ---------- 12. Simple residual histogram (using the best RF as the example) ----------
# Residual = true value minus prediction of the grid-searched forest.
residuals = y_test - y_pred_best_rf
plt.figure(figsize=(6,4))
plt.hist(residuals, bins=30)
plt.title("Residuals histogram (best RF)")
plt.xlabel("residual")
plt.savefig("results/figures/residuals_best_rf.png", bbox_inches='tight')
plt.close()

print("流程完成。检查 results/ 下的输出文件并上传至 GitHub。")
