In [None]:
import json
import importlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
import xgboost as xgb
from datetime import datetime

mlflow.xgboost.autolog()

# 加载数据
data = pd.read_csv('./DATA/exported_data.csv')
X = data.drop('取引価格（総額）', axis=1)
y = data['取引価格（総額）']

# 标准化数据
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(np.array(y).reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 3000,
    'learning_rate': 0.04,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': 2, 
    'eval_metric': ['rmse', 'mae']  
}

current_time = datetime.now().strftime("%m-%d_%H:%M")

with mlflow.start_run(run_name=f"xgboost_{current_time}"):
    xgb_model = xgb.XGBRegressor(**params)
    
    # 记录训练过程中的指标
    eval_results = {}
    xgb_model.fit(X_train, y_train, 
                  eval_set=[(X_train, y_train), (X_test, y_test)], 
                  verbose=True)
    
    # 提取训练过程中的指标
    results = xgb_model.evals_result()
    train_rmse = results['validation_0']['rmse']
    train_mae = results['validation_0']['mae']
    test_rmse = results['validation_1']['rmse']
    test_mae = results['validation_1']['mae']
    
    # 获取最后一个迭代的指标
    final_train_rmse = train_rmse[-1]
    final_train_mae = train_mae[-1]
    final_test_rmse = test_rmse[-1]
    final_test_mae = test_mae[-1]
    #预测并计算test指标
    y_pred = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(xgb_model, "model")
    mlflow.log_params(params)
    # 记录最后一轮迭代的训练指标到 MLflow
    mlflow.log_metric("final_train_rmse", final_train_rmse)
    mlflow.log_metric("final_train_mae", final_train_mae)
    mlflow.log_metric("final_test_rmse", final_test_rmse)
    mlflow.log_metric("final_test_mae", final_test_mae)
    
    #
    # y_pred = xgb_model.predict(X_test)
    # mse = mean_squared_error(y_test, y_pred)
    # rmse = sqrt(mse)
print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test R²: {r2}")
print(f"Test MAE: {mae}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
import xgboost as xgb
import mlflow
import mlflow.sklearn

mlflow.xgboost.autolog()

# 加载数据
data = pd.read_csv('./DATA/exported_data.csv')
X = data.drop('取引価格（総額）', axis=1)
y = data['取引価格（総額）']

# 标准化数据
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(np.array(y).reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 3000,
    'learning_rate': 0.04,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': 2  # 将日志详细程度设置为 1
}

with mlflow.start_run():
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train, y_train, 
                  eval_set=[(X_test, y_test)], 
                  verbose=True)
    
    y_pred_scaled = xgb_model.predict(X_test)

    # 反标准化预测值和实际值
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))
    y_test_orig = y_scaler.inverse_transform(y_test)

    mse = mean_squared_error(y_test_orig, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test_orig, y_pred)
    mae = mean_absolute_error(y_test_orig, y_pred)
    
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    
    mlflow.sklearn.log_model(xgb_model, "model")
    mlflow.log_params(params)

print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test R²: {r2}")
print(f"Test MAE: {mae}")

In [None]:
data.columns

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import xgboost as xgb
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建OneHotEncoder和OrdinalEncoder
# 设置sparse_output=False以返回稠密矩阵，如果XGBoost或StandardScaler处理稀疏矩阵有问题
onehot_encoder = OneHotEncoder(sparse_output=False)
ordinal_encoder = OrdinalEncoder()  # 用于整数编码

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', onehot_encoder, ['地区名']),  # 假设地区名需要独热编码
        ('ordinal', ordinal_encoder, ['建物の構造'])  # 使用OrdinalEncoder替代LabelEncoder
    ], remainder='passthrough'
)

# XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', n_estimators=3000, learning_rate=0.04,
    max_depth=8, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1,
    reg_lambda=0.1, verbosity=2, eval_metric=['rmse', 'mae']
)

# 整个管道，包括预处理和XGBoost模型
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # 假设还包含标准化
    ('regressor', xgb_model)  # 更改标识符为'regressor'，因为这是一个回归模型
])

# 训练管道
pipeline.fit(X_train, y_train)

# 保存训练好的管道
joblib.dump(pipeline, 'xgboost_pipeline.pkl')

2024/07/05 13:37:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6630667914454b93b136148d9af3afdf', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


['xgboost_pipeline.pkl']

In [35]:
import pandas as pd
import joblib

# 加载训练好的管道
pipeline = joblib.load('xgboost_pipeline.pkl')

# 准备新的数据
new_data = pd.DataFrame({
    '最寄駅：距離（分）': [18],
    '面積（㎡）': [100],
    '建物の構造': [10],  # 确保这是数值类型，如果原始数据中这是字符串类型，可能需要转换
    '建ぺい率（％）': [50],
    '容積率（％）': [200],
    '建築年数': [5],
    '地区名': ['西蒲田']  # 假设这是一个分类特征，并已经通过OneHotEncoder进行处理
})

# 确保所有数值列都是正确的数据类型
new_data['最寄駅：距離（分）'] = new_data['最寄駅：距離（分）'].astype(float)
new_data['面積（㎡）'] = new_data['面積（㎡）'].astype(float)
new_data['建物の構造'] = new_data['建物の構造'].astype(int)
new_data['建ぺい率（％）'] = new_data['建ぺい率（％）'].astype(float)
new_data['容積率（％）'] = new_data['容積率（％）'].astype(float)
new_data['建築年数'] = new_data['建築年数'].astype(int)

# 使用管道预测新数据的房价
predicted_price = pipeline.predict(new_data)

# 打印预测结果
print("预测的房价：", predicted_price)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import joblib

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')

# 检查数据类型，确保每列都是正确的类型
print("原始数据类型:\n", data.dtypes)

# 将建物の構造列转换为数值型，如果是字符串
data['建物の構造'] = pd.to_numeric(data['建物の構造'], errors='coerce')

# 划分训练集和测试集
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建预处理器
# 对分类变量使用OneHotEncoder，对数值变量使用StandardScaler
categorical_features = ['地区名']  # 假设地区名为分类变量
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']  # 其他数值变量

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # 处理数值列的NaN值
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(), ['建物の構造'])  # 假设建物の構造为有序分类变量
    ], remainder='passthrough'
)

# 创建XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=3000,
    learning_rate=0.04,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbosity=2,
    eval_metric=['rmse', 'mae']
)

# 构建整个管道
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 训练管道
pipeline.fit(X_train, y_train)

# 保存训练好的管道
joblib.dump(pipeline, 'xgboost_pipeline.pkl')

# 加载模型并进行预测（示例）
loaded_pipeline = joblib.load('xgboost_pipeline.pkl')
predicted_price = loaded_pipeline.predict(X_test)
print("预测价格：", predicted_price)

2024/07/05 13:51:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '20a717ffcf7b475d955b8a28ffb829b7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


原始数据类型:
 最寄駅：距離（分）      int64
取引価格（総額）       int64
面積（㎡）          int64
建物の構造         object
地区名           object
建ぺい率（％）      float64
容積率（％）       float64
建築年数           int64
dtype: object




预测价格： [30405978. 44360416. 58980780. ... 63497484. 32699766. 66778776.]


In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')

# 划分训练集和测试集
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建预处理器
categorical_features = ['地区名']  # 假设地区名为分类变量
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']  # 其他数值变量

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # 处理数值列的NaN值
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造'])  # 使用use_encoded_value并指定unknown_value
    ], remainder='passthrough'
)

# 创建XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=3000,
    learning_rate=0.04,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbosity=1,
    eval_metric=['rmse', 'mae']
)

# 构建整个管道
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 训练管道
pipeline.fit(X_train, y_train)

# 保存训练好的管道
joblib.dump(pipeline, 'xgboost_pipeline.pkl')

# 加载模型并进行预测（示例）
loaded_pipeline = joblib.load('xgboost_pipeline.pkl')
predicted_price = loaded_pipeline.predict(X_test)
print("预测价格：", predicted_price)

2024/07/05 14:02:35 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '06a23bdaf02341849cbf45201864f05a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


预测价格： [30482450. 44405860. 59131508. ... 62750568. 32472214. 65832696.]


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')

# 划分训练集和测试集
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建预处理器
categorical_features = ['地区名']  # 假设地区名为分类变量
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']  # 其他数值变量

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # 处理数值列的NaN值
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造'])  # 使用use_encoded_value并指定unknown_value
    ], remainder='passthrough'
)

# 创建XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=3000,
    learning_rate=0.04,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbosity=1,
    eval_metric=['rmse', 'mae']
)

# 构建整个管道
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 训练管道
pipeline.fit(X_train, y_train)

# 保存训练好的管道
joblib.dump(pipeline, 'xgboost_pipeline.pkl')

# 加载模型并进行预测（示例）
loaded_pipeline = joblib.load('xgboost_pipeline.pkl')
predicted_price = loaded_pipeline.predict(X_test)
print("预测价格：", predicted_price)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')

# 划分训练集和测试集
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建预处理器
categorical_features = ['地区名']  # 假设地区名为分类变量
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']  # 其他数值变量

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # 处理数值列的NaN值
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造'])  # 使用use_encoded_value并指定unknown_value
    ], remainder='passthrough'
)

# 创建XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=3000,
    learning_rate=0.04,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbosity=1,
    eval_metric=['rmse', 'mae']
)

# 构建整个管道
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 训练管道
pipeline.fit(X_train, y_train)

# 保存训练好的管道
joblib.dump(pipeline, 'xgboost_pipeline.pkl')

# 加载模型并进行预测（示例）
loaded_pipeline = joblib.load('xgboost_pipeline.pkl')
predicted_price = loaded_pipeline.predict(X_test)
print("预测价格：", predicted_price)

In [None]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from datetime import datetime
import mlflow
import mlflow.xgboost

# 启动 MLflow
mlflow.set_experiment("XGBoost_Regression")
mlflow.xgboost.autolog()

# 加载数据
data = pd.read_csv('./DATA/exported_data4.csv')
X = data.drop('取引価格（総額）', axis=1)
Y = data['取引価格（総額）']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 创建预处理器
categorical_features = ['地区名']  # 假设地区名为分类变量
numeric_features = [col for col in X.columns if col not in categorical_features and col != '建物の構造']  # 其他数值变量

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # 处理数值列的NaN值
            ('scaler', StandardScaler())]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['建物の構造'])
    ], remainder='passthrough'
)

# 创建XGBoost回归模型
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=4000,
    learning_rate=0.06,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.2,
    verbosity=1,
    eval_metric=['rmse', 'mae']
)

# 构建整个管道
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 定义运行的名字
run_name = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + "_xgboost"

with mlflow.start_run(run_name=run_name):
    # 训练管道
    pipeline.fit(X_train, y_train)
    
    # 评估模型
    predictions = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # 打印指标
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 Score:", r2)
    
    # 记录指标
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    
    # 记录模型
    mlflow.sklearn.log_model(pipeline, "model")
    
    # 保存训练好的管道
    joblib.dump(pipeline, 'xgboost_pipeline.pkl')

# 加载模型并进行预测（示例）
loaded_pipeline = joblib.load('xgboost_pipeline.pkl')
predicted_price = loaded_pipeline.predict(X_test)
print("预测价格：", predicted_price)