In [18]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from datetime import datetime
import mlflow
import mlflow.xgboost

mlflow.set_experiment("XGBoost_Regression")
# mlflow.xgboost.autolog()

data = pd.read_csv('../DATA/exported_data4.csv')
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


categorical_features = ['地区名']  
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']


# 更新预处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')), 
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造']),
    ], remainder='passthrough'
)


run_name = datetime.now().strftime("%Y-%m-%d_%H:%M") + "_xgboost"

with mlflow.start_run(run_name=run_name):
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    
    # 创建XGBoost回归模型
    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        # alpha=0.5, 
        n_estimators=4000,
        learning_rate=0.06,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        verbosity=2,
        eval_metric=['rmse', 'mae']
    )

    xgb_model.fit(X_train_processed, y_train, eval_set=[(X_train_processed, y_train), (X_test_processed, y_test)], verbose=True)


    results = xgb_model.evals_result()
    train_rmse = results['validation_0']['rmse']
    train_mae = results['validation_0']['mae']
    test_rmse = results['validation_1']['rmse']
    test_mae = results['validation_1']['mae']
    

    final_train_rmse = train_rmse[-1]
    final_train_mae = train_mae[-1]
    final_test_rmse = test_rmse[-1]
    final_test_mae = test_mae[-1]

    y_pred = xgb_model.predict(X_test_processed)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("final_train_rmse", final_train_rmse)
    mlflow.log_metric("final_train_mae", final_train_mae)
    mlflow.log_metric("final_test_rmse", final_test_rmse)
    mlflow.log_metric("final_test_mae", final_test_mae)

    mlflow.sklearn.log_model(xgb_model, "model")
    

    joblib.dump(preprocessor, 'xgboost_preprocessor.pkl')
    joblib.dump(xgb_model, 'xgboost_model.pkl')
    
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 Score:", r2)


[0]	validation_0-rmse:18973400.88470	validation_0-mae:15035803.15936	validation_1-rmse:18772848.30157	validation_1-mae:14873142.09255
[1]	validation_0-rmse:18580635.96352	validation_0-mae:14731068.47820	validation_1-rmse:18397526.04814	validation_1-mae:14582824.99897
[2]	validation_0-rmse:18343697.22186	validation_0-mae:14536551.33213	validation_1-rmse:18172706.60248	validation_1-mae:14398852.83267
[3]	validation_0-rmse:18037341.15036	validation_0-mae:14305185.62295	validation_1-rmse:17879216.21145	validation_1-mae:14171220.90378
[4]	validation_0-rmse:17880266.22499	validation_0-mae:14177578.68513	validation_1-rmse:17733727.44884	validation_1-mae:14050920.50948
[5]	validation_0-rmse:17709210.08256	validation_0-mae:14036402.85085	validation_1-rmse:17577375.71280	validation_1-mae:13919722.97341
[6]	validation_0-rmse:17413022.81792	validation_0-mae:13805845.25214	validation_1-rmse:17295821.57418	validation_1-mae:13698176.73160
[7]	validation_0-rmse:17178647.31837	validation_0-mae:13626978

 - mlflow (current: 2.14.2, required: mlflow==2.14.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - mlflow (current: 2.14.2, required: mlflow==2.14.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


MSE: 104791000203725.69
RMSE: 10236747.54029451
MAE: 7231425.257102064
R2 Score: 0.7151494744086968


In [None]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
mlflow.set_experiment("XGBoost_Regression")
mlflow.xgboost.autolog()

data = pd.read_csv('./DATA/exported_data4.csv')
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


categorical_features = ['地区名']  
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')), 
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造'])
    ], remainder='passthrough'
)

# 定义基模型
estimators = [
    ('xgb', xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=3000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.2,
        reg_lambda=0.2)),
    ('rf', RandomForestRegressor(n_estimators=100)),
    ('ridge', Ridge())
]

# 定义元模型
stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge()
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', stack_model)
])

with mlflow.start_run(run_name=datetime.now().strftime("%Y-%m-%d_%H:%M") + "_stacked_model"):
    # 训练管道
    pipeline.fit(X_train, y_train)

    # 预测
    y_pred = pipeline.predict(X_test)
    
    # 计算指标
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # 记录指标
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    
    # 保存模型和预处理器
    mlflow.sklearn.log_model(pipeline, "stacked_model")
    joblib.dump(pipeline, 'stacked_model.pkl')
    
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 Score:", r2)