In [None]:
## 基础工具
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import LearningRateScheduler
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model

## Load the feature tables prepared for the tree models
# Both CSV files are space-separated and are read from <cwd>/user_data.
path = os.getcwd()
tree_data_path = os.path.join(path, "user_data")
Train_data, TestA_data = (
    pd.read_csv(os.path.join(tree_data_path, csv_name), sep=" ")
    for csv_name in ("train_tree.csv", "test_tree.csv")
)

## Separate features from the label: every column except the target ("price")
## and the row identifier ("SaleID") is used as a model input.
numerical_cols = Train_data.columns
feature_cols = [col for col in numerical_cols if col not in ("price", "SaleID")]

# Convert to plain NumPy arrays so the K-fold index arrays used further down
# can slice rows positionally.
X_data = np.array(Train_data[feature_cols])
X_test = np.array(TestA_data[feature_cols])
print(X_data.shape)
print(X_test.shape)
Y_data = np.array(Train_data["price"])

"""
lightgbm
"""


# Custom evaluation metric (reported during training; it is not the training loss).
def myFeval(preds, xgbtrain):
    """Report the MAE between ``expm1(label)`` and ``expm1(preds)``.

    Both the labels and the predictions are passed through ``np.expm1`` before
    the error is computed (presumably because the target is stored as
    ``log1p(price)`` - confirm against the feature-engineering step).

    Args:
        preds: Predictions for the dataset being evaluated.
        xgbtrain: Dataset object exposing ``get_label()``; despite the name it
            is the LightGBM dataset handed to the ``feval`` callback.

    Returns:
        A ``(metric_name, value, is_higher_better)`` tuple; the last element is
        ``False`` because a lower MAE is better.
    """
    actual = np.expm1(xgbtrain.get_label())
    predicted = np.expm1(preds)
    return "myFeval", mean_absolute_error(actual, predicted), False
# LightGBM training parameters.
# NOTE: the original dict also set "min_child_samples": 20, which LightGBM treats
# as an alias of "min_data_in_leaf". Passing both sets the same option twice and
# makes LightGBM emit an "will be ignored" warning, so only the canonical name is kept.
param = {
    "boosting_type": "gbdt",         # gradient-boosted decision trees
    "num_leaves": 31,                # maximum number of leaves per tree; larger = more complex model
    "max_depth": -1,                 # no explicit depth limit; complexity is bounded by num_leaves
    "lambda_l2": 2,                  # L2 regularisation strength
    "min_data_in_leaf": 20,          # minimum number of samples required in a leaf
    "objective": "regression_l1",    # L1 (absolute error) regression objective
    "learning_rate": 0.02,           # shrinkage applied to each tree
    "feature_fraction": 0.8,         # fraction of features sampled for each tree
    "bagging_freq": 1,               # re-sample the training rows at every iteration
    "bagging_fraction": 0.8,         # fraction of rows used when bagging
    "bagging_seed": 11,              # seed for the row sampling
    "metric": "mae",                 # built-in validation metric: mean absolute error
}

# K-fold training of LightGBM.
# The shuffle is seeded so the fold assignment - and therefore the out-of-fold
# (OOF) predictions and the score printed below - is the same on every run.
LGB_FOLD_SEED = 2020
folds = KFold(n_splits=2, shuffle=True, random_state=LGB_FOLD_SEED)
oof_lgb = np.zeros(len(X_data))                # OOF prediction for every training row
predictions_lgb = np.zeros(len(X_test))        # test prediction averaged over the folds
predictions_train_lgb = np.zeros(len(X_data))  # full-train prediction averaged over the folds
# Upper bound on boosting rounds; early stopping ends training long before this.
num_round = 1000000
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data, Y_data)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_data[trn_idx], Y_data[trn_idx])
    val_data = lgb.Dataset(X_data[val_idx], Y_data[val_idx])

    # Log every 300 rounds; stop once the validation metrics have not improved
    # for 300 consecutive rounds.
    callbacks = [
        lgb.callback.log_evaluation(period=300),
        lgb.callback.early_stopping(stopping_rounds=300),
    ]
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        feval=myFeval,
        callbacks=callbacks,
    )
    # Predict with the model truncated at the best iteration found by early stopping.
    best_iter = clf.best_iteration
    oof_lgb[val_idx] = clf.predict(X_data[val_idx], num_iteration=best_iter)
    predictions_lgb += clf.predict(X_test, num_iteration=best_iter) / folds.n_splits
    predictions_train_lgb += (
        clf.predict(X_data, num_iteration=best_iter) / folds.n_splits
    )

# The score is computed after expm1 on both sides, mirroring myFeval.
print(
    "lightgbm score: {:<8.8f}".format(
        mean_absolute_error(np.expm1(oof_lgb), np.expm1(Y_data))
    )
)

(149999, 83)
(50000, 83)
fold n°1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17643
[LightGBM] [Info] Number of data points in the train set: 134999, number of used features: 83
[LightGBM] [Info] Start training from score 8.086718
Training until validation scores don't improve for 300 rounds
[300]	training's l1: 0.14616	training's myFeval: 759.277	valid_1's l1: 0.148133	valid_1's myFeval: 806.277
[600]	training's l1: 0.127635	training's myFeval: 597.761	valid_1's l1: 0.129608	valid_1's myFeval: 628.858
[900]	training's l1: 0.120952	training's myFeval: 545.781	valid_1's l1: 0.123165	valid_1's myFeval: 574.072
[1200]	training's l1: 0.117161	training's myFeval: 519.097	valid_1's l1: 0.119728	valid_1's myFeval: 548.005
[1500]	training's l1: 0.114667	training's myFeval: 499.419	valid_1's l1: 0.117561	valid_1's myFeval: 529.236
[1800]	training's l

In [None]:
output_path = os.path.join(path, "user_data")
# Test-set output: clamp negative predictions to 0 (in place, so predictions_lgb
# is modified as well) and store them next to their SaleID.
predictions = predictions_lgb
predictions[predictions < 0] = 0
sub = pd.DataFrame({"SaleID": TestA_data.SaleID, "price": predictions})
sub.to_csv(os.path.join(output_path, "test_lgb.csv"), index=False)

In [None]:
# Validation (out-of-fold) output: clamp negative predictions to 0 in place and
# store one LightGBM prediction per training row, keyed by SaleID.
oof_lgb[oof_lgb < 0] = 0
sub = pd.DataFrame({"SaleID": Train_data.SaleID, "price": oof_lgb})
sub.to_csv(os.path.join(output_path, "train_lgb.csv"), index=False)

In [None]:
"""
catboost
"""

# K-fold training of CatBoost.
# The shuffle is seeded so the fold assignment and the OOF score are repeatable.
CB_FOLD_SEED = 2020
kfolder = KFold(n_splits=2, shuffle=True, random_state=CB_FOLD_SEED)
oof_cb = np.zeros(len(X_data))                # OOF prediction for every training row
predictions_cb = np.zeros(len(X_test))        # test prediction averaged over the folds
predictions_train_cb = np.zeros(len(X_data))  # full-train prediction averaged over the folds

# The parameters are identical for every fold, so they are built once.
cb_params = {
    "n_estimators": 1000000,         # upper bound on trees; early stopping ends training earlier
    "loss_function": "MAE",          # optimise mean absolute error
    "eval_metric": "MAE",            # validation metric, same as the loss
    "learning_rate": 0.02,           # shrinkage applied to each tree
    "depth": 6,                      # tree depth
    "use_best_model": True,          # keep the model at the best validation iteration
    "subsample": 0.6,                # fraction of rows sampled per iteration
    "bootstrap_type": "Bernoulli",   # sampling scheme used together with subsample
    "reg_lambda": 3,                 # L2 regularisation strength
    "one_hot_max_size": 2,           # one-hot threshold for categorical features
                                     # (no cat_features are passed to fit() below)
}

kfold = kfolder.split(X_data, Y_data)
for fold_, (train_index, vali_index) in enumerate(kfold, start=1):
    print("fold n°{}".format(fold_))
    k_x_train, k_y_train = X_data[train_index], Y_data[train_index]
    k_x_vali, k_y_vali = X_data[vali_index], Y_data[vali_index]

    model_cb = CatBoostRegressor(**cb_params)
    # Train with early stopping on the held-out fold.
    model_cb.fit(
        k_x_train,
        k_y_train,
        eval_set=[(k_x_vali, k_y_vali)],
        verbose=300,
        early_stopping_rounds=300,
    )
    # best_iteration_ is a zero-based index while ntree_end is an exclusive upper
    # bound, so "+ 1" is required to include the best tree itself. The original
    # code passed best_iteration_ directly and silently dropped that tree.
    n_best_trees = model_cb.best_iteration_ + 1
    oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=n_best_trees)
    predictions_cb += (
        model_cb.predict(X_test, ntree_end=n_best_trees) / kfolder.n_splits
    )
    predictions_train_cb += (
        model_cb.predict(X_data, ntree_end=n_best_trees) / kfolder.n_splits
    )

# The score is computed after expm1 on both sides, as for the LightGBM model.
print(
    "catboost score: {:<8.8f}".format(
        mean_absolute_error(np.expm1(oof_cb), np.expm1(Y_data))
    )
)

fold n°1
0:	learn: 0.9748379	test: 0.9794465	best: 0.9794465 (0)	total: 184ms	remaining: 2d 3h 1m 34s
300:	learn: 0.1702876	test: 0.1754260	best: 0.1754260 (300)	total: 9.63s	remaining: 8h 53m 17s
600:	learn: 0.1443512	test: 0.1493608	best: 0.1493608 (600)	total: 19.1s	remaining: 8h 49m 2s
900:	learn: 0.1330670	test: 0.1380017	best: 0.1380017 (900)	total: 28.5s	remaining: 8h 47m 18s
1200:	learn: 0.1269843	test: 0.1322234	best: 0.1322234 (1200)	total: 37.9s	remaining: 8h 45m 5s
1500:	learn: 0.1230593	test: 0.1285772	best: 0.1285772 (1500)	total: 47.3s	remaining: 8h 44m 21s
1800:	learn: 0.1201550	test: 0.1259519	best: 0.1259519 (1800)	total: 56.7s	remaining: 8h 43m 47s
2100:	learn: 0.1179963	test: 0.1240175	best: 0.1240175 (2100)	total: 1m 6s	remaining: 8h 43m 10s
2400:	learn: 0.1162220	test: 0.1225035	best: 0.1225035 (2400)	total: 1m 14s	remaining: 8h 39m 2s
2700:	learn: 0.1147293	test: 0.1212625	best: 0.1212625 (2700)	total: 1m 23s	remaining: 8h 35m 47s
3000:	learn: 0.1134367	test: 0.1

In [None]:
output_path = os.path.join(path, "user_data")
# Test-set output: clamp negative predictions to 0 (in place, so predictions_cb
# is modified as well) and store them next to their SaleID.
predictions = predictions_cb
predictions[predictions < 0] = 0
sub = pd.DataFrame({"SaleID": TestA_data.SaleID, "price": predictions})
sub.to_csv(os.path.join(output_path, "test_cab.csv"), index=False)

In [None]:
# Validation (out-of-fold) output: clamp negative predictions to 0 in place and
# store one CatBoost prediction per training row, keyed by SaleID.
# oof_cb was allocated with len(X_data), i.e. one entry per row of Train_data.
oof_cb[oof_cb < 0] = 0
sub = pd.DataFrame({"SaleID": Train_data.SaleID, "price": oof_cb})
sub.to_csv(os.path.join(output_path, "train_cab.csv"), index=False)

In [None]:
"""
神经网络
"""

## Load the feature tables prepared for the neural network
# NOTE(review): from here on `path` is the PARENT of the working directory and the
# data is looked up in <parent>/code/user_data. That is the same directory the tree
# models used only if the notebook is run from a folder named "code" - confirm.
path = os.path.abspath(os.path.dirname(os.getcwd()) + os.path.sep + ".")
tree_data_path = os.path.join(path, "code", "user_data")
Train_NN_data, Test_NN_data = (
    pd.read_csv(os.path.join(tree_data_path, csv_name), sep=" ")
    for csv_name in ("train_nn.csv", "test_nn.csv")
)

numerical_cols = Train_NN_data.columns
print(numerical_cols)
# Every column except the target and the row identifier is a model input.
feature_cols = [col for col in numerical_cols if col not in ("price", "SaleID")]

## Build the training / test samples.
# X_data / X_test are rebound to the NN feature frames here and no longer hold
# the tree-model arrays.
X_data, X_test = Train_NN_data[feature_cols], Test_NN_data[feature_cols]
print(X_data.shape)
print(X_test.shape)

# NumPy views of the inputs and the label, as consumed by the Keras model.
x, y, x_test = np.array(X_data), np.array(Train_NN_data["price"]), np.array(X_test)

print(x)
print(x_test)
print(y)


# Step decay of the learning rate during training.
def scheduler(epoch):
    """Divide the optimizer's learning rate by 10 at epochs 1400, 1700 and 1900.

    Reads and updates the optimizer of the module-level ``model``, so that name
    must be bound to the model currently being trained before this is called.

    Args:
        epoch: Index of the epoch that is about to start.

    Returns:
        The optimizer's learning rate after any update, as a Python float.
    """
    decay_epochs = (1400, 1700, 1900)
    if epoch in decay_epochs:
        reduced = float(model.optimizer.learning_rate.numpy()) * 0.1
        model.optimizer.learning_rate.assign(reduced)
        print("lr changed to {}".format(reduced))
    return float(model.optimizer.learning_rate.numpy())


# Keras callback that applies the step-decay schedule implemented by `scheduler`.
reduce_lr = LearningRateScheduler(scheduler)


# K-fold training of the neural network.
# The shuffle is seeded so the fold assignment is the same on every run
# (weight initialisation is still random).
NN_FOLD_SEED = 2020
kfolder = KFold(n_splits=2, shuffle=True, random_state=NN_FOLD_SEED)
oof_nn = np.zeros(len(x))                # OOF prediction for every training row
predictions_nn = np.zeros(len(x_test))   # test prediction averaged over the folds
predictions_train_nn = np.zeros(len(x))  # full-train prediction averaged over the folds
kfold = kfolder.split(x, y)
for fold_, (train_index, vali_index) in enumerate(kfold, start=1):
    print("fold n°{}".format(fold_))
    k_x_train, k_y_train = x[train_index], y[train_index]
    k_x_vali, k_y_vali = x[vali_index], y[vali_index]

    # Four ReLU hidden layers (512-256-128-64) and a linear output unit, all with
    # an L2 kernel penalty of 0.02. The name `model` must stay module-level
    # because `scheduler` looks it up to adjust the learning rate.
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(
                units,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l2(0.02),
            )
            for units in (512, 256, 128, 64)
        ]
        + [tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(0.02))]
    )

    model.compile(
        loss="mean_absolute_error",
        optimizer=tf.keras.optimizers.Adam(),
        metrics=["mae"],
    )

    model.fit(
        k_x_train,
        k_y_train,
        batch_size=512,
        epochs=2000,
        validation_data=(k_x_vali, k_y_vali),
        callbacks=[reduce_lr],
    )

    # Each prediction is computed once and flattened from (n, 1) to (n,).
    # The original ran every predict() twice - once for the values and once
    # more just to read the output shape.
    oof_nn[vali_index] = model.predict(k_x_vali).reshape(-1)
    predictions_nn += model.predict(x_test).reshape(-1) / kfolder.n_splits
    predictions_train_nn += model.predict(x).reshape(-1) / kfolder.n_splits

# Unlike the tree models, this score is computed without any expm1 transform.
print("NN score: {:<8.8f}".format(mean_absolute_error(oof_nn, y)))

In [None]:
output_path = os.path.join(path, "code", "user_data")
# IMPORTANT: this is the directory the NN *input* features were read from
# (train_nn.csv / test_nn.csv). The original code wrote the predictions under
# those same file names and thereby overwrote the inputs, so the notebook could
# not be re-run. The predictions are now written to nn_test.csv / nn_train.csv,
# which are also the names the final blending cell reads.

# Test-set output: clamp negative predictions to 0 (in place, so predictions_nn
# is modified as well) and store them next to their SaleID.
predictions = predictions_nn
predictions[predictions < 0] = 0
sub = pd.DataFrame({"SaleID": Test_NN_data.SaleID, "price": predictions})
sub.to_csv(os.path.join(output_path, "nn_test.csv"), index=False)

# Validation (out-of-fold) output, one NN prediction per training row.
oof_nn[oof_nn < 0] = 0
sub = pd.DataFrame({"SaleID": Train_NN_data.SaleID, "price": oof_nn})
sub.to_csv(os.path.join(output_path, "nn_train.csv"), index=False)


# Directory from which the first-level prediction files are read back.
tree_data_path = os.path.join(path, "code", "user_data")


def _read_price_column(csv_name):
    """Load ``csv_name`` from ``tree_data_path`` and return its "price" column as an array."""
    return np.array(pd.read_csv(os.path.join(tree_data_path, csv_name))["price"])


# LightGBM predictions: test set and out-of-fold (training set).
predictions_lgb = _read_price_column("test_lgb.csv")
oof_lgb = _read_price_column("train_lgb.csv")

# CatBoost predictions: test set and out-of-fold (training set).
predictions_cb = _read_price_column("test_cab.csv")
oof_cb = _read_price_column("train_cab.csv")

# Reload the tree-model tables: "price" is the stacking target and the test
# table supplies the SaleID column for the final submission.
Train_data, TestA_data = (
    pd.read_csv(os.path.join(tree_data_path, csv_name), sep=" ")
    for csv_name in ("train_tree.csv", "test_tree.csv")
)
Y_data = Train_data["price"]

# Second-level design matrices: one column per first-level model.
train_stack = np.vstack((oof_lgb, oof_cb)).T
test_stack = np.vstack((predictions_lgb, predictions_cb)).T
folds_stack = RepeatedKFold(n_splits=2, n_repeats=2)
# Accumulators filled by the stacking loop.
tree_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

In [None]:
# Second-level stacking: a BayesianRidge regressor is fitted on the first-level
# predictions (columns of train_stack) for every split of folds_stack.

# Total number of fits (n_splits * n_repeats). Derived from the splitter instead
# of the former hard-coded 4, so changing folds_stack cannot skew the average.
n_stack_fits = folds_stack.get_n_splits()
# Plain array so the positional fold indices can never be misread as index labels.
stack_target = np.asarray(Y_data)
# Reset the accumulators here so re-running this cell does not add a second set
# of predictions on top of the first.
tree_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, stack_target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], stack_target[trn_idx]
    val_data, val_y = train_stack[val_idx], stack_target[val_idx]

    Bayes = linear_model.BayesianRidge()
    Bayes.fit(trn_data, trn_y)
    # With repeated K-fold each row is a validation row once per repeat;
    # the value from the last repeat is the one that is kept.
    tree_stack[val_idx] = Bayes.predict(val_data)
    predictions += Bayes.predict(test_stack) / n_stack_fits

# Apply expm1 to the stacked predictions and to the target before scoring.
tree_predictions = np.expm1(predictions)
tree_stack = np.expm1(tree_stack)
tree_point = mean_absolute_error(tree_stack, np.expm1(Y_data))
print("树模型：二层贝叶斯: {:<8.8f}".format(tree_point))

fold 0
fold 1
fold 2
fold 3
树模型：二层贝叶斯: 433.24430404


In [None]:
# Third-level blend: optionally average the stacked tree prediction with the
# neural-network prediction, then write the submission file.
import random

# nn_point = mean_absolute_error(oof_nn, np.expm1(Y_data))
# print("神经网络: {:<8.8f}".format(nn_point))

mix_nn = False
if mix_nn:
    # os.path.join supplies the path separator that the former string
    # concatenation (tree_data_path + "nn_train.csv") was missing.
    oof_nn = np.array(pd.read_csv(os.path.join(tree_data_path, "nn_train.csv"))["price"])
    oof = (oof_nn + tree_stack) / 2
    predictions_nn = np.array(
        pd.read_csv(os.path.join(tree_data_path, "nn_test.csv"))["price"]
    )
    predictions = (tree_predictions + predictions_nn) / 2
else:
    oof = tree_stack
    predictions = tree_predictions
all_point = mean_absolute_error(oof, np.expm1(Y_data))
print("总输出：三层融合: {:<8.8f}".format(all_point))


output_path = os.path.join(path, "prediction_result")
# Create the output directory if needed so to_csv cannot fail on a fresh checkout.
os.makedirs(output_path, exist_ok=True)

# Test-set output: clamp negative prices to 0 and pair them with their SaleID.
predictions[predictions < 0] = 0
sub = pd.DataFrame({"SaleID": TestA_data.SaleID, "price": predictions})

# Random suffix so successive runs do not overwrite each other's submission.
# Stored under its own name: the original reused `x`, which clobbered the NN
# feature matrix and broke any re-run of the NN training cell.
submission_id = random.randint(1, 10000)

sub.to_csv(os.path.join(output_path, f"predictions_{submission_id}.csv"), index=False)

总输出：三层融合: 433.24430404
