In [2]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from datetime import datetime
import mlflow
import mlflow.xgboost

# Send this notebook's runs to a named MLflow experiment and enable MLflow's
# XGBoost autologging for the fit performed later in this cell.
mlflow.set_experiment("XGBoost_Regression")
mlflow.xgboost.autolog()

# Read the exported dataset (path is relative to the working directory).
data = pd.read_csv('../DATA/exported_data_masion2.csv')
# Disabled variant kept for reference: drop the '取引時期' column before modelling.
# data = data.drop('取引時期',axis=1)

# Split the frame into the regression target and the remaining feature columns.
Y = data['调整価格']
X = data.drop(columns=['调整価格'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Column groups used by the preprocessor.
# '地区名' is listed in both groups; because the one-hot transformer below is
# commented out, that column is only ever ordinal-encoded.
onehot_features = ['地区名']  # onehot
ordinal_features = ['建物の構造','最寄駅：名称','地区名','間取り']  # ordinal '間取り'

# Every remaining column, in its original order, is treated as numeric.
_non_numeric_columns = set(onehot_features) | set(ordinal_features)
numeric_features = [col for col in X.columns if col not in _non_numeric_columns]

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Ordinal branch: categories not seen during fit are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Assemble the column-wise preprocessor; columns not named in any transformer
# are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        # ('cat', OneHotEncoder(handle_unknown='ignore'), onehot_features),
        ('ord', ordinal_encoder, ordinal_features),
    ],
    remainder='passthrough',
)


# Name the run after the wall-clock time at which this cell was executed.
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M")
run_name = f"xgboost_alldata_{timestamp}"

with mlflow.start_run(run_name=run_name):
    # Fit the preprocessing on the training split only, then apply it to both splits.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Build the XGBoost regression model.
    xgb_params = dict(
        objective='reg:squarederror',
        # alpha=0.5,
        n_estimators=4000,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        verbosity=2,
        eval_metric=['rmse', 'mae'],
    )
    xgb_model = xgb.XGBRegressor(**xgb_params)

    # Track both splits during boosting: validation_0 is the training split,
    # validation_1 is the held-out split.
    eval_sets = [(X_train_processed, y_train), (X_test_processed, y_test)]
    xgb_model.fit(X_train_processed, y_train, eval_set=eval_sets, verbose=True)

    # Per-iteration metric histories recorded by XGBoost for each evaluation set.
    results = xgb_model.evals_result()
    train_history = results['validation_0']
    test_history = results['validation_1']
    train_rmse, train_mae = train_history['rmse'], train_history['mae']
    test_rmse, test_mae = test_history['rmse'], test_history['mae']

    # Values after the last boosting round.
    final_train_rmse, final_train_mae = train_rmse[-1], train_mae[-1]
    final_test_rmse, final_test_mae = test_rmse[-1], test_mae[-1]

    # Score the held-out split with scikit-learn's metric functions.
    y_pred = xgb_model.predict(X_test_processed)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Record the summary metrics on the active run, one call per metric.
    run_metrics = {
        "mse": mse,
        "rmse": rmse,
        "r2": r2,
        "mae": mae,
        "final_train_rmse": final_train_rmse,
        "final_train_mae": final_train_mae,
        "final_test_rmse": final_test_rmse,
        "final_test_mae": final_test_mae,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # NOTE(review): XGBoost autologging is enabled earlier in this cell and may
    # already log a model for this run - confirm this explicit log is still wanted.
    mlflow.sklearn.log_model(xgb_model, "model")

    # Persist the fitted preprocessor and model next to the notebook.
    joblib.dump(preprocessor, 'xgboost_preprocessor.pkl')
    joblib.dump(xgb_model, 'xgboost_model.pkl')

    # Echo the held-out scores in the cell output.
    for label, value in (("MSE:", mse), ("RMSE:", rmse), ("MAE:", mae), ("R2 Score:", r2)):
        print(label, value)




[0]	validation_0-rmse:27896734.91232	validation_0-mae:20886979.11202	validation_1-rmse:27920782.71750	validation_1-mae:20923997.70874
[1]	validation_0-rmse:27214594.32774	validation_0-mae:20322581.29224	validation_1-rmse:27242660.87572	validation_1-mae:20361700.91623
[2]	validation_0-rmse:26575769.44349	validation_0-mae:19772927.75506	validation_1-rmse:26606912.30703	validation_1-mae:19814328.14784
[3]	validation_0-rmse:25986523.56483	validation_0-mae:19268217.56042	validation_1-rmse:26021353.99619	validation_1-mae:19311030.70821
[4]	validation_0-rmse:25428277.99452	validation_0-mae:18813047.70807	validation_1-rmse:25467260.14598	validation_1-mae:18857836.75280
[5]	validation_0-rmse:25086776.22723	validation_0-mae:18548769.83701	validation_1-rmse:25128558.54391	validation_1-mae:18592908.74043
[6]	validation_0-rmse:24650164.45605	validation_0-mae:18156436.20981	validation_1-rmse:24694477.81269	validation_1-mae:18201518.97707
[7]	validation_0-rmse:24189164.36088	validation_0-mae:17780504



MSE: 93752680158314.36
RMSE: 9682596.767309602
MAE: 6404733.405399603
R2 Score: 0.8861220435908387


In [None]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from datetime import datetime
import mlflow
import mlflow.xgboost

# Send this cell's runs to the named MLflow experiment; autologging is left disabled here.
mlflow.set_experiment("XGBoost_Regression")
# mlflow.xgboost.autolog()

# Read the exported dataset (path is relative to the working directory).
data = pd.read_csv('../DATA/exported_data-masion.csv')

# Split the frame into the regression target and the remaining feature columns.
Y = data['取引価格（総額）']
X = data.drop(columns=['取引価格（総額）'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


# Column groups used by the preprocessor.
categorical_features = ['地区名']  # one-hot encoded
ordinal_features = ['建物の構造', '間取り']  # ordinal encoded

# Every column that is not explicitly encoded goes through the numeric pipeline.
# Both encoded groups are excluded, so no column is handed to two transformers:
# previously '間取り' was sent to the mean-imputer/scaler AND the ordinal encoder.
# NOTE(review): any other non-numeric column in this CSV (e.g. a station-name
# column, if present) would still land in the numeric branch - confirm the schema.
_encoded_columns = set(categorical_features) | set(ordinal_features)
numeric_features = [col for col in X.columns if col not in _encoded_columns]


# Rebuild the preprocessor for this dataset.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric branch: fill missing values with the column mean, then standardise.
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())]), numeric_features),
        # One-hot branch: categories unseen during fit become all-zero rows.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # Ordinal branch: categories unseen during fit are encoded as -1.
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features),
    ], remainder='passthrough'
)

In [None]:
# Display the column labels of the currently loaded `data` DataFrame.
data.columns