# In [None]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
from math import sqrt
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb
from lightgbm import early_stopping, log_evaluation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

# Turn on MLflow autologging for scikit-learn estimators.
mlflow.sklearn.autolog()

# Load the exported dataset and separate the target column from the features.
target_column = '取引価格（総額）'
data = pd.read_csv('./DATA/exported_data7.csv')
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameters handed to the XGBoost regressor (and logged to MLflow).
params = dict(
    objective='reg:squarederror',  # regression with a squared-error objective
    n_estimators=183,  # number of trees
    learning_rate=0.05,  # learning rate
    max_depth=6,  # maximum depth of each tree
    subsample=0.8,  # row sampling ratio
    colsample_bytree=0.8,  # feature sampling ratio
    reg_alpha=0.1,  # L1 regularization term
    reg_lambda=0.1,  # L2 regularization term
    verbosity=1,  # how detailed the training log output is
)

# Train, evaluate and log the XGBoost regressor inside a single MLflow run.
with mlflow.start_run():
    # Build the regressor from the shared hyperparameters and fit it on the
    # scaled training data; the scaled test split is passed as the evaluation
    # set so its metric is reported during training (verbose=True).
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(
        X_train_scaled,
        y_train,
        eval_set=[(X_test_scaled, y_test)],
        verbose=True,
    )

    # Predict on the held-out test split.
    y_pred = xgb_model.predict(X_test_scaled)

    # Evaluation metrics on the test split.
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Record each metric on the active run, in the order mse, rmse, r2, mae.
    for metric_name, metric_value in (
        ("mse", mse),
        ("rmse", rmse),
        ("r2", r2),
        ("mae", mae),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model as a run artifact under "model".
    mlflow.sklearn.log_model(xgb_model, "model")

    # Optional: also record the hyperparameters used for this run.
    mlflow.log_params(params)

# Print the test metrics, one per line.
print(
    f"Test MSE: {mse}",
    f"Test RMSE: {rmse}",
    f"Test R²: {r2}",
    f"Test MAE: {mae}",
    sep="\n",
)