In [28]:
import numpy as np
import pandas as pd
import os

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [29]:
# Load the labelled training data and the unlabelled serving (test) data.
train_csv = "home-data-for-ml-course/train.csv"
serving_csv = "home-data-for-ml-course/test.csv"
df, serving_df = pd.read_csv(train_csv), pd.read_csv(serving_csv)

# Preview the first ten training rows.
df.head(10)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [30]:
# Show the first ten rows of the serving frame.
serving_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [31]:
# Inspect the four bathroom-count columns: their dtypes, the first rows,
# and the container type and shape of each individual column.
bath_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
bath_view = df[bath_cols]

print(bath_view.dtypes)
print(bath_view.head())

for col in bath_cols:
    print(f"{col} 类型: {type(df[col])}, 形状: {df[col].shape}")


FullBath        int64
HalfBath        int64
BsmtFullBath    int64
BsmtHalfBath    int64
dtype: object
   FullBath  HalfBath  BsmtFullBath  BsmtHalfBath
0         2         1             1             0
1         2         0             0             1
2         2         1             1             0
3         1         0             1             0
4         2         1             1             0
FullBath 类型: <class 'pandas.core.series.Series'>, 形状: (1460,)
HalfBath 类型: <class 'pandas.core.series.Series'>, 形状: (1460,)
BsmtFullBath 类型: <class 'pandas.core.series.Series'>, 形状: (1460,)
BsmtHalfBath 类型: <class 'pandas.core.series.Series'>, 形状: (1460,)


In [32]:
# Print whether every row label in the training frame's index is unique.
print(df.index.is_unique)


True


In [33]:
# Check for duplicated column names: build a boolean mask over the column
# index, then list the labels that repeat an earlier one.
is_repeated = df.columns.duplicated()
dupes = df.columns[is_repeated]
print(f"重复列名：{dupes.tolist()}")



重复列名：[]


In [34]:
# ========== Step 1: keep the important features ==========
# The order of this list determines the column order of the feature matrix,
# so it is kept exactly as is; dict.fromkeys removes accidental duplicates
# while preserving that order.
keep_features = list(dict.fromkeys([
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'Neighborhood', 'KitchenQual', 'ExterQual', 'BsmtQual', 'SaleCondition',
    'LotArea', 'MSZoning', 'HouseStyle', 'Fireplaces',
    'BsmtExposure', 'BsmtFinSF1', 'BsmtCond', 'Foundation', 'CentralAir',
    'FireplaceQu', 'PavedDrive', 'HeatingQC', 'Functional',
    'GarageFinish', 'GarageType', 'GarageYrBlt', 'ExterCond',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
    'YrSold', 'SalePrice'
]))

df = df[keep_features].copy()

# ========== Step 2: missing-value handling ==========
# Categorical columns: a missing value becomes the explicit level 'None'.
cat_fillna = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
              'GarageFinish', 'GarageType', 'KitchenQual', 'ExterQual',
              'ExterCond', 'SaleCondition', 'Foundation',
              'HeatingQC', 'PavedDrive', 'Functional', 'CentralAir']
df = df.fillna({name: 'None' for name in cat_fillna if name in df.columns})

# Numeric columns: a missing value becomes that column's own median.
num_fillna = ['GarageCars', 'GarageArea', 'TotalBsmtSF', 'BsmtFinSF1',
              'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
              'Fireplaces', 'GarageYrBlt', 'LotArea', 'GrLivArea', 'YearBuilt']
df = df.fillna({name: df[name].median() for name in num_fillna if name in df.columns})

# ========== Step 3: engineered features ==========
porch_cols = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
# Full baths count as 1, half baths as 0.5, above ground and in the basement.
bath_weights = {'FullBath': 1.0, 'HalfBath': 0.5,
                'BsmtFullBath': 1.0, 'BsmtHalfBath': 0.5}

df['HouseAge'] = df['YrSold'].sub(df['YearBuilt'])
df['RemodFlag'] = df['YearBuilt'].ne(df['YearRemodAdd']).astype(int)
df['TotalPorchSF'] = df[porch_cols].sum(axis=1)
df['TotalBath'] = sum(
    weight * df[name].astype(float) for name, weight in bath_weights.items()
)

# ========== Step 4: one-hot encode the categorical features ==========
categorical_cols = ['Neighborhood', 'KitchenQual', 'ExterQual', 'BsmtQual',
                    'SaleCondition', 'MSZoning', 'HouseStyle', 'BsmtExposure',
                    'BsmtCond', 'Foundation', 'CentralAir', 'FireplaceQu',
                    'PavedDrive', 'HeatingQC', 'Functional', 'GarageFinish',
                    'GarageType', 'ExterCond']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ========== Step 5: build the feature matrix and the label ==========
# The raw columns that fed the engineered features are removed from X,
# together with the target itself.
engineered_sources = ['YearBuilt', 'YearRemodAdd', 'YrSold', *porch_cols, *bath_weights]
X = df.drop(columns=['SalePrice', *engineered_sources])
y = df['SalePrice']

# Sanity check: list any feature rows that still contain a missing value.
nan_rows = X.loc[X.isna().any(axis=1)]
print(nan_rows.shape)
print(nan_rows.head())



(0, 109)
Empty DataFrame
Columns: [OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF, TotRmsAbvGrd, LotArea, Fireplaces, BsmtFinSF1, GarageYrBlt, HouseAge, RemodFlag, TotalPorchSF, TotalBath, Neighborhood_Blueste, Neighborhood_BrDale, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_CollgCr, Neighborhood_Crawfor, Neighborhood_Edwards, Neighborhood_Gilbert, Neighborhood_IDOTRR, Neighborhood_MeadowV, Neighborhood_Mitchel, Neighborhood_NAmes, Neighborhood_NPkVill, Neighborhood_NWAmes, Neighborhood_NoRidge, Neighborhood_NridgHt, Neighborhood_OldTown, Neighborhood_SWISU, Neighborhood_Sawyer, Neighborhood_SawyerW, Neighborhood_Somerst, Neighborhood_StoneBr, Neighborhood_Timber, Neighborhood_Veenker, KitchenQual_Fa, KitchenQual_Gd, KitchenQual_TA, ExterQual_Fa, ExterQual_Gd, ExterQual_TA, BsmtQual_Fa, BsmtQual_Gd, BsmtQual_None, BsmtQual_TA, SaleCondition_AdjLand, SaleCondition_Alloca, SaleCondition_Family, SaleCondition_Normal, SaleCondition_Partial, MSZoning_FV, 

In [35]:
# ========== Step 1: keep the same raw features used for training ==========
# Same list as the training cell, minus the SalePrice label.
features = list(dict.fromkeys([
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'Neighborhood', 'KitchenQual', 'ExterQual', 'BsmtQual', 'SaleCondition',
    'LotArea', 'MSZoning', 'HouseStyle', 'Fireplaces',
    'BsmtExposure', 'BsmtFinSF1', 'BsmtCond', 'Foundation', 'CentralAir',
    'FireplaceQu', 'PavedDrive', 'HeatingQC', 'Functional',
    'GarageFinish', 'GarageType', 'GarageYrBlt', 'ExterCond',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
    'YrSold'
]))
serving_df = serving_df[features].copy()

# ========== Step 2: missing-value handling ==========
cat_fillna = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
              'GarageFinish', 'GarageType', 'KitchenQual', 'ExterQual',
              'ExterCond', 'SaleCondition', 'Foundation',
              'HeatingQC', 'PavedDrive', 'Functional', 'CentralAir']
for col in cat_fillna:
    # The guard must look at serving_df, not at the training frame `df`:
    # `df` has already been one-hot encoded, so it no longer contains these
    # raw categorical columns and a check against it would skip every fill.
    # Skipped fills leave NaN, which get_dummies encodes as all zeros, i.e.
    # the baseline level rather than the '<col>_None' level used in training.
    if col in serving_df.columns:
        serving_df[col] = serving_df[col].fillna('None')

num_fillna = ['GarageCars', 'GarageArea', 'TotalBsmtSF', 'BsmtFinSF1',
              'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
              'Fireplaces', 'GarageYrBlt', 'LotArea', 'GrLivArea', 'YearBuilt']
for col in num_fillna:
    if col in serving_df.columns:
        # Impute with the training median whenever the training frame still
        # carries the column, so both sets are imputed with the same value;
        # otherwise fall back to the serving median.
        reference = df[col] if col in df.columns else serving_df[col]
        serving_df[col] = serving_df[col].fillna(reference.median())

# A missing bathroom count would propagate NaN into TotalBath, which the
# final fillna(0) below would turn into "zero bathrooms" for the whole house.
# Treat a missing count as 0 for that component only.
bath_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
serving_df[bath_cols] = serving_df[bath_cols].fillna(0)

# ========== Step 3: engineered features ==========
serving_df['HouseAge'] = serving_df['YrSold'] - serving_df['YearBuilt']
serving_df['RemodFlag'] = (serving_df['YearBuilt'] != serving_df['YearRemodAdd']).astype(int)
serving_df['TotalPorchSF'] = serving_df[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)
serving_df['TotalBath'] = (
    serving_df['FullBath'].astype(float) +
    0.5 * serving_df['HalfBath'].astype(float) +
    serving_df['BsmtFullBath'].astype(float) +
    0.5 * serving_df['BsmtHalfBath'].astype(float)
)

# ========== Step 4: one-hot encode the categorical features ==========
categorical_cols = ['Neighborhood', 'KitchenQual', 'ExterQual', 'BsmtQual',
                    'SaleCondition', 'MSZoning', 'HouseStyle', 'BsmtExposure',
                    'BsmtCond', 'Foundation', 'CentralAir', 'FireplaceQu',
                    'PavedDrive', 'HeatingQC', 'Functional', 'GarageFinish',
                    'GarageType', 'ExterCond']
# Keep every level here (drop_first=False). Dropping "the first level" of the
# serving data independently could remove a different level than the one
# dropped in training; the reindex against X.columns below already discards
# the training baseline columns.
serving_df = pd.get_dummies(serving_df, columns=categorical_cols, drop_first=False)

# ========== Step 5: build the serving feature matrix ==========
ts = serving_df.drop(['YearBuilt', 'YearRemodAdd', 'YrSold',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
             'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'], axis=1)

# Align with the training matrix: same columns, same order. Dummy columns
# absent from the serving data are added as 0; columns unknown to the model
# are dropped.
ts = ts.reindex(columns=X.columns, fill_value=0)

ts = ts.fillna(0)  # alternatively: ts.fillna(ts.median())

# Print any column that still has missing values (expected: none).
print(ts.isnull().sum()[ts.isnull().sum() > 0])



Series([], dtype: int64)


In [36]:
# Average the serving-set predictions of 100 gradient-boosting models that
# share hyperparameters and differ only in their random seed.
gbr_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    max_depth=4,
)

num_predictions = 0
predictions = None

for i in range(100):
    print(f"i:{i}")

    model = GradientBoostingRegressor(random_state=i, **gbr_params).fit(X, y)
    sub_predictions = model.predict(ts)

    # Running sum, accumulated in seed order.
    predictions = sub_predictions if predictions is None else predictions + sub_predictions
    num_predictions += 1

# Turn the running sum into the mean over all fitted models.
predictions = predictions / num_predictions

print(predictions)

i:0
i:1
i:2
i:3
i:4
i:5
i:6
i:7
i:8
i:9
i:10
i:11
i:12
i:13
i:14
i:15
i:16
i:17
i:18
i:19
i:20
i:21
i:22
i:23
i:24
i:25
i:26
i:27
i:28
i:29
i:30
i:31
i:32
i:33
i:34
i:35
i:36
i:37
i:38
i:39
i:40
i:41
i:42
i:43
i:44
i:45
i:46
i:47
i:48
i:49
i:50
i:51
i:52
i:53
i:54
i:55
i:56
i:57
i:58
i:59
i:60
i:61
i:62
i:63
i:64
i:65
i:66
i:67
i:68
i:69
i:70
i:71
i:72
i:73
i:74
i:75
i:76
i:77
i:78
i:79
i:80
i:81
i:82
i:83
i:84
i:85
i:86
i:87
i:88
i:89
i:90
i:91
i:92
i:93
i:94
i:95
i:96
i:97
i:98
i:99
[128219.70592892 156763.51033133 179470.679453   ... 140322.35189961
 125344.32695685 215196.83220581]


In [42]:
# Build the submission file: the Id column re-read from the test CSV, paired
# with the averaged predictions cast to integers (astype(int) truncates).
path = "submission.csv"

tt = pd.read_csv("home-data-for-ml-course/test.csv")
kaggle_predictions = tt[["Id"]].assign(SalePrice=predictions.astype(int))

kaggle_predictions.to_csv(path, index=False)
print(f"Submission exported to {path}")

Submission exported to submission.csv
