# 数据预处理

In [116]:
import pandas as pd

test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

data = pd.concat([train, test], axis=0, sort=False)
missing_cols = [c for c in data if data[c].isna().mean() * 100 > 50]  # 删除缺失值比例大于50%的特征列
data = data.drop(missing_cols, axis=1)

object_df = data.select_dtypes(include=['object'])
numerical_df = data.select_dtypes(exclude=['object'])
object_df = object_df.fillna('unknown')  # 用unknown填充
missing_cols = [c for c in numerical_df if numerical_df[c].isna().sum() > 0]
for c in missing_cols:
    numerical_df[c] = numerical_df[c].fillna(numerical_df[c].median())  # 用中位数填充

In [117]:
object_df = object_df.drop(['Heating', 'RoofMatl', 'Condition2', 'Street', 'Utilities'], axis=1)  # 删除类别比不均衡的特征（对分数没影响）

In [118]:
numerical_df.loc[
    numerical_df['YrSold'] < numerical_df['YearBuilt'], 'YrSold'] = 2009  # 将销售日期小于建造日期的数据的销售日期改为2009(销售日期的最大值)
numerical_df['Age_House'] = (numerical_df['YrSold'] - numerical_df['YearBuilt'])  # 计算房屋的年龄

numerical_df['TotalBsmtBath'] = numerical_df['BsmtFullBath'] + numerical_df['BsmtHalfBath'] * 0.5  # 对浴池求和得到地下室的总浴室数
numerical_df['TotalBath'] = numerical_df['FullBath'] + numerical_df['HalfBath'] * 0.5  # 对浴池求和得到地上的总浴室数
numerical_df['TotalSA'] = numerical_df['TotalBsmtSF'] + numerical_df['1stFlrSF'] + numerical_df['2ndFlrSF']  # 计算总面积
# numerical_df = numerical_df.drop(['YrSold', 'YearBuilt', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
#                                     'TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1)  # 删除不需要的特征，删了分数会低一些



In [119]:
bin_map = {'TA': 2, 'Gd': 3, 'Fa': 1, 'Ex': 4, 'Po': 1, 'None': 0,
           'Y': 1, 'N': 0, 'Reg': 3, 'IR1': 2, 'IR2': 1,
           'IR3': 0, "None": 0, "No": 2, "Mn": 2,
           "Av": 3, "Gd": 4, "Unf": 1, "LwQ": 2,
           "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
object_df['ExterQual'] = object_df['ExterQual'].map(bin_map)
object_df['ExterCond'] = object_df['ExterCond'].map(bin_map)
object_df['BsmtCond'] = object_df['BsmtCond'].map(bin_map)
object_df['BsmtQual'] = object_df['BsmtQual'].map(bin_map)
object_df['HeatingQC'] = object_df['HeatingQC'].map(bin_map)
object_df['KitchenQual'] = object_df['KitchenQual'].map(bin_map)
object_df['FireplaceQu'] = object_df['FireplaceQu'].map(bin_map)
object_df['GarageQual'] = object_df['GarageQual'].map(bin_map)
object_df['GarageCond'] = object_df['GarageCond'].map(bin_map)
object_df['CentralAir'] = object_df['CentralAir'].map(bin_map)
object_df['LotShape'] = object_df['LotShape'].map(bin_map)
object_df['BsmtExposure'] = object_df['BsmtExposure'].map(bin_map)
object_df['BsmtFinType1'] = object_df['BsmtFinType1'].map(bin_map)
object_df['BsmtFinType2'] = object_df['BsmtFinType2'].map(bin_map)

PavedDrive = {"N": 0, "P": 1, "Y": 2}
object_df['PavedDrive'] = object_df['PavedDrive'].map(PavedDrive)
# 选择剩余的object特征
rest_object_columns = object_df.select_dtypes(include=['object'])
# 进行one-hot编码
object_df = pd.get_dummies(object_df, columns=rest_object_columns.columns)

In [120]:
data = pd.concat([object_df, numerical_df], axis=1, sort=False)  # 将处理后的数据合并
print(data.head())

# # 标准化
# from sklearn.preprocessing import StandardScaler
# 
# scaler = StandardScaler()
# data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)  # 分数略微提高


   LotShape  ExterQual  ExterCond  BsmtQual  BsmtCond  BsmtExposure  \
0         3          4          2       4.0       2.0           2.0   
1         3          2          2       4.0       2.0           4.0   
2         2          4          2       4.0       2.0           2.0   
3         2          2          2       2.0       4.0           2.0   
4         2          4          2       4.0       2.0           3.0   

   BsmtFinType1  BsmtFinType2  HeatingQC  CentralAir  ...  ScreenPorch  \
0           6.0           1.0          4           1  ...            0   
1           5.0           1.0          4           1  ...            0   
2           6.0           1.0          4           1  ...            0   
3           5.0           1.0          4           1  ...            0   
4           6.0           1.0          4           1  ...            0   

   PoolArea  MiscVal  MoSold  YrSold  SalePrice  Age_House  TotalBsmtBath  \
0         0        0       2    2008   208500.0    

In [121]:
def correlation(data, threshold):
    col_corr = set()
    corr_matrix = data.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # 相似性分数与阈值对比
                colname = corr_matrix.columns[i]  # 获取列名
                col_corr.add(colname)
    return col_corr


all_cols = [c for c in data.columns if c not in ['SalePrice']]
corr_features = correlation(data[all_cols], 0.9)  # 选择相关性大于0.9的特征
print(corr_features)
# data = data.drop(corr_features, axis=1)  # 删除相关性大于0.9的特征(保留一个，减少特征冗余)。不做分还高一些


{'GarageFinish_unknown', 'LandSlope_Mod', 'Age_House', 'Exterior2nd_unknown', 'TotalBsmtBath', 'SaleCondition_Partial', 'RoofStyle_Hip', 'TotalBath', 'Exterior2nd_CmentBd', 'Exterior2nd_MetalSd', 'Exterior2nd_VinylSd'}


# 使用与D_HP_1一致的LightGBM模型进行训练

In [122]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 切分数据
X_train = data[:train.shape[0]]
X_test = data[train.shape[0]:]
y_train = train['SalePrice']

# 设置LightGBM模型参数
params = {
    'num_leaves': 63,
    'min_child_samples': 50,
    'objective': 'regression',
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'verbose': -1,
}

# KFold交叉验证设置
folds = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(X_train.shape[0])  # 存储每一折的验证集预测值，oof: out-of-fold（每个训练样本的验证集预测值）0
test_preds = np.zeros(X_test.shape[0])  # 存储测试集的平均预测值（所有折叠的预测结果）

# 每次分割返回训练集索引 trn_idx 和验证集索引 val_idx
for trn_idx, val_idx in folds.split(X_train, y_train):
    trn_df, trn_label = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_df, val_label = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # 创建LightGBM数据集
    dtrn = lgb.Dataset(trn_df, label=trn_label)
    dval = lgb.Dataset(val_df, label=val_label)

    # 训练模型
    bst = lgb.train(params, dtrn,
                    num_boost_round=1000,
                    valid_sets=[dtrn, dval])

    # 预测。num_iteration=bst.best_iteration 是告诉LightGBM在训练过程中使用表现最好的迭代次数（通过验证集上的评估确定）
    oof_preds[val_idx] = bst.predict(val_df, num_iteration=bst.best_iteration)
    test_preds += bst.predict(X_test, num_iteration=bst.best_iteration) / folds.n_splits  # 这里folds.n_splits=5
    print(f"Fold RMSE: {np.sqrt(mean_squared_error(val_label, oof_preds[val_idx]))}")

# 输出训练集上的RMSE评分
rmse = np.sqrt(mean_squared_error(y_train, oof_preds))
print(f'Overall RMSE on training data: {rmse:.4f}')

# 生成Kaggle提交文件
submission = pd.DataFrame({
    'Id': test['Id'],  # 保持测试集的ID列
    'SalePrice': test_preds  # 预测的结果列
})
submission.to_csv("output/data_process/submission.csv", index=False)  # 保存为csv文件


Fold RMSE: 22764.18232796425
Fold RMSE: 19997.184765044498
Fold RMSE: 13363.179243128678
Fold RMSE: 13230.688058652397
Fold RMSE: 11912.784783713896
Overall RMSE on training data: 16814.4908
