In [1]:
from datetime import datetime

from scipy.stats import skew  
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor

import numpy as np
import pandas as pd

import umap.umap_ as umap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor


2025-07-06 19:03:14.417854: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751796194.432096    9629 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751796194.436446    9629 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751796194.447575    9629 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751796194.447590    9629 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751796194.447592    9629 computation_placer.cc:177] computation placer alr

In [2]:
# Read the competition's training and test tables from the local data folder.
train = pd.read_csv('home-data-for-ml-course/train.csv')
test = pd.read_csv('home-data-for-ml-course/test.csv')
print("Train set size:", train.shape)
print("Test set size:", test.shape)
print('START data processing', datetime.now())

# 'Id' is not needed for the prediction process: pop() removes the column from each
# frame in place and returns it, so the identifiers stay available as train_ID / test_ID.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

# Remove outliers: keep only rows whose GrLivArea is below 4500, then renumber the rows.
train = train.loc[train['GrLivArea'] < 4500].reset_index(drop=True)

# np.log1p applies log(1 + x) element-wise; the models are trained on this log-scale target.
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train['SalePrice'].reset_index(drop=True)

# Stack the training predictors on top of the test predictors so that every
# preprocessing step below is applied to both at once.
train_features = train.drop(columns=['SalePrice'])
test_features = test
features = pd.concat([train_features, test_features], ignore_index=True)
print(features.shape)
# Some of the non-numeric predictors are stored as numbers; convert them to strings.
features['MSSubClass'] = features['MSSubClass'].apply(str)
for col in ('YrSold', 'MoSold'):
    features[col] = features[col].astype(str)

# Missing values replaced by a fixed, hand-picked category.
for col, fill_value in (('Functional', 'Typ'), ('Electrical', "SBrkr"), ('KitchenQual', "TA")):
    features[col] = features[col].fillna(fill_value)

# Missing values replaced by the column's most frequent category.
for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    features[col] = features[col].fillna(features[col].mode()[0])

# Numeric garage columns: missing -> 0.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    features[col] = features[col].fillna(0)

# Pool, garage and basement descriptors: missing -> the literal category 'None'.
for col in ('PoolQC',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('None')

# MSZoning: fill with the most frequent zoning among rows that share the same MSSubClass.
features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(
    lambda zoning: zoning.fillna(zoning.mode()[0]))

# Every remaining object-typed column gets 'None' wherever a value is still missing.
objects = [name for name in features.columns if features[name].dtype == object]
features.update(features[objects].fillna('None'))

# LotFrontage: fill with the median frontage of the row's Neighborhood.
features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(
    lambda frontage: frontage.fillna(frontage.median()))

# Fill in the rest of the NAs: any numeric value still missing becomes 0.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = [name for name in features.columns if features[name].dtype in numeric_dtypes]
features.update(features[numerics].fillna(0))

# Collect the numeric columns of the (now fully imputed) feature table.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = [name for name in features.columns if features[name].dtype in numeric_dtypes]

# Skewness of every numeric column, largest first.
skew_features = features[numerics2].apply(skew).sort_values(ascending=False)

# Columns whose skewness exceeds 0.5 are candidates for a Box-Cox(1 + x) transform.
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    try:
        # The Box-Cox lambda is estimated on the shifted column (x + 1).
        lmbda = boxcox_normmax(features[i] + 1)
        features[i] = boxcox1p(features[i], lmbda)
    except Exception as e:
        # A column whose lambda search fails is reported and left untransformed.
        print(f"⚠️ 跳过特征 {i}，最大值：{features[i].max()}，最小值：{features[i].min()}，原因：{e}")



# Remove three columns from the feature table.
features = features.drop(columns=['Utilities', 'Street', 'PoolQC'])

# Combined features built by adding existing columns together.
features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (
    features['BsmtFinSF1'] + features['BsmtFinSF2']
    + features['1stFlrSF'] + features['2ndFlrSF']
)

# Half baths are weighted 0.5, full baths 1.
features['Total_Bathrooms'] = (
    features['FullBath'] + (0.5 * features['HalfBath'])
    + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath'])
)

features['Total_porch_sf'] = (
    features['OpenPorchSF'] + features['3SsnPorch']
    + features['EnclosedPorch'] + features['ScreenPorch']
    + features['WoodDeckSF']
)

# Simplified features: 1 when the source column is strictly positive, otherwise 0.
for flag_name, source_col in (
    ('haspool', 'PoolArea'),
    ('has2ndfloor', '2ndFlrSF'),
    ('hasgarage', 'GarageArea'),
    ('hasbsmt', 'TotalBsmtSF'),
    ('hasfireplace', 'Fireplaces'),
):
    features[flag_name] = (features[source_col] > 0).astype('int64')


Train set size: (1460, 81)
Test set size: (1459, 80)
START data processing 2025-07-06 19:03:18.069943
(2917, 79)
⚠️ 跳过特征 LotArea，最大值：215245，最小值：1300，原因：The algorithm terminated without finding a valid bracket. Consider trying different initial points.


  r, prob = _stats_py.pearsonr(xvals, yvals)
  r, prob = _stats_py.pearsonr(xvals, yvals)


⚠️ 跳过特征 1stFlrSF，最大值：5095，最小值：334，原因：The algorithm terminated without finding a valid bracket. Consider trying different initial points.


In [3]:
# One-hot encode the feature table; the shape is printed before and after
# so the number of columns added by the encoding is visible.
print(features.shape)
final_features0 = pd.get_dummies(features)
final_features0 = final_features0.reset_index(drop=True)
print(final_features0.shape)

(2917, 86)
(2917, 333)


In [4]:
# Copy of the complete one-hot encoded table (train + test rows) before cluster columns are added.
features_all = final_features0.copy()

# Dimensionality reduction followed by clustering: PCA (50 components) -> UMAP (2-D) -> k-means (5 clusters).
# PCA and UMAP are seeded as well as KMeans: without a seed their embeddings differ from
# run to run, so the k-means labels (and the umap_cluster_* columns built from them
# in the next cell) would not be reproducible under Restart & Run All.
# NOTE(review): umap-learn documents that a fixed random_state makes UMAP run
# single-threaded; confirm the extra runtime is acceptable.
CLUSTER_SEED = 0

X_pca = PCA(n_components=50, random_state=CLUSTER_SEED).fit_transform(final_features0)
X_umap = umap.UMAP(n_components=2, random_state=CLUSTER_SEED).fit_transform(X_pca)
kmeans = KMeans(n_clusters=5, random_state=CLUSTER_SEED).fit(X_umap)



In [5]:
# Turn each row's k-means label into indicator columns prefixed 'umap_cluster'
# and append them to the right of the encoded feature table.
cluster_dummies = pd.get_dummies(kmeans.labels_, prefix='umap_cluster')
final_features = pd.concat((final_features0, cluster_dummies), axis='columns')


In [6]:
# Split the combined matrix back into training rows (X) and submission rows (X_sub).
# The boundary is taken from train_features, the training predictors that were
# concatenated in front of the test rows, rather than from len(y): y is shortened
# in a later cell when outlier rows are dropped, so re-running this cell afterwards
# would otherwise move the boundary and push training rows into X_sub.
n_train_rows = len(train_features)
X = final_features.iloc[:n_train_rows, :]
X_sub = final_features.iloc[n_train_rows:, :]


In [7]:
# Sanity check: row/column counts of the training matrix, target and submission matrix.
print(f"X {X.shape} y {y.shape} X_sub {X_sub.shape}")

X (1458, 338) y (1458,) X_sub (1459, 338)


In [8]:
# Positional row numbers of training samples that are removed from X and y.
# NOTE: this step is not idempotent -- running it twice removes further rows.
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

# Columns in which the single most frequent value covers more than 99.94% of the
# training rows are dropped from both matrices, together with 'MSZoning_C (all)'.
overfit = [
    name for name in X.columns
    if X[name].value_counts().iloc[0] / len(X) * 100 > 99.94
]
overfit.append('MSZoning_C (all)')

X = X.drop(columns=overfit, errors='ignore')
X_sub = X_sub.drop(columns=overfit, errors='ignore')

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

# ################## ML ########################################
print('START ML', datetime.now())

# Shared 10-fold shuffled splitter used by the CV-tuned linear models and by cv_rmse.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)


# rmsle
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is taken here. The notebook calls this with a target that was
    already transformed with ``np.log1p``, which is why the result is reported
    as an RMSLE.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


# build our model scoring function
def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of ``model``.

    ``X`` defaults to the training matrix as it existed when this function was
    defined (default arguments are bound once, at definition time). The target
    is the global ``y`` and the folds come from the global ``kfolds``.
    """
    neg_mse_per_fold = cross_val_score(model, X, y,
                                       scoring="neg_mean_squared_error",
                                       cv=kfolds)
    rmse = np.sqrt(-neg_mse_per_fold)
    return rmse


# setup models
# Candidate regularisation values searched by the cross-validated linear models.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Linear models and the SVR are wrapped in a pipeline that applies RobustScaler first.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=alphas_alt, cv=kfolds),
)

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42, cv=kfolds),
)

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)

svr = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)

# Tree ensembles are fed the unscaled features directly.
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

lightgbm = LGBMRegressor(
    objective='regression',
    n_jobs=-1,
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
    # min_data_in_leaf=2,
    # min_sum_hessian_in_leaf=11
)

xgboost = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)

# stack
# Six base estimators (svr is not among them) feed an XGBoost meta-model;
# passthrough=True also hands the original features to the meta-model.
stack_gen = StackingRegressor(
    estimators=[
        ('ridge', ridge),
        ('lasso', lasso),
        ('elastic', elasticnet),
        ('gbr', gbr),
        ('xgb', xgboost),
        ('lgb', lightgbm),
    ],
    final_estimator=xgboost,
    cv=10,
    passthrough=True,
    verbose=1,
    n_jobs=-1,
)

                                

print('TEST score on CV')

# (label used in the report, model) pairs in evaluation order.
# NOTE: the first label reads "Kernel Ridge" although `ridge` is a
# RobustScaler + RidgeCV pipeline; the label is kept as-is so the output is unchanged.
cv_candidates = [
    ("Kernel Ridge", ridge),
    ("Lasso", lasso),
    ("ElasticNet", elasticnet),
    ("SVR", svr),
    ("Lightgbm", lightgbm),
    ("GradientBoosting", gbr),
    ("Xgboost", xgboost),
]

# For each model print the mean and standard deviation of the per-fold RMSE,
# followed by a timestamp.
for label, candidate in cv_candidates:
    score = cv_rmse(candidate)
    print("{} score: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()), datetime.now())

print('finished')

X (1453, 336) y (1453,) X_sub (1459, 336)
START ML 2025-07-06 19:03:30.654708
TEST score on CV
Kernel Ridge score: 0.1041 (0.0147)
 2025-07-06 19:04:23.051585
Lasso score: 0.1048 (0.0151)
 2025-07-06 19:04:53.952496
ElasticNet score: 0.1049 (0.0151)
 2025-07-06 19:07:06.823295
SVR score: 0.1049 (0.0141)
 2025-07-06 19:07:09.703587
Lightgbm score: 0.1055 (0.0164)
 2025-07-06 19:07:20.644993
GradientBoosting score: 0.1061 (0.0133)
 2025-07-06 19:08:54.505252
Xgboost score: 0.1071 (0.0160)
 2025-07-06 19:09:34.648083
finished


In [9]:

def _fit_logged(label, estimator):
    """Print the current time and ``label``, fit ``estimator`` on (X, y) and return it."""
    print(datetime.now(), label)
    return estimator.fit(X, y)


# Fit the stacked model and every individual model on the whole training set.
# fit() returns the estimator itself, so each *_full_data name refers to the same
# object as the corresponding model defined earlier.
print('START Fit')
stack_gen_model = _fit_logged('StackingRegressor', stack_gen)
elastic_model_full_data = _fit_logged('elasticnet', elasticnet)
lasso_model_full_data = _fit_logged('lasso', lasso)
ridge_model_full_data = _fit_logged('ridge', ridge)
svr_model_full_data = _fit_logged('svr', svr)
gbr_model_full_data = _fit_logged('GradientBoosting', gbr)
xgb_model_full_data = _fit_logged('xgboost', xgboost)
lgb_model_full_data = _fit_logged('lightgbm', lightgbm)

START Fit
2025-07-06 19:09:34.653618 StackingRegressor


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   13.6s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   23.9s finished
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   31.4s remaining:   20.9s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   28.5s remaining:   19.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.8s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   44.4s finished
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   53

2025-07-06 19:12:06.635111 elasticnet
2025-07-06 19:12:20.632929 lasso
2025-07-06 19:12:23.934254 ridge
2025-07-06 19:12:29.809704 svr
2025-07-06 19:12:30.159056 GradientBoosting
2025-07-06 19:12:40.016506 xgboost
2025-07-06 19:12:46.450164 lightgbm


In [10]:
def blend_models_predict(X):
    """Return the fixed-weight blend of all fitted models' predictions for ``X``.

    The weights sum to 1: 0.1 for each of elastic net, lasso, ridge, SVR, gradient
    boosting and LightGBM, 0.15 for XGBoost and 0.25 for the stacked model.
    """
    # The table is built on every call so that a model name rebound later in the
    # notebook is picked up, exactly as with direct references.
    weighted_models = (
        (0.1, elastic_model_full_data),
        (0.1, lasso_model_full_data),
        (0.1, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.1, gbr_model_full_data),
        (0.15, xgb_model_full_data),
        (0.1, lgb_model_full_data),
        (0.25, stack_gen_model),
    )
    terms = [weight * model.predict(X) for weight, model in weighted_models]
    # Accumulate left to right, starting from the first term.
    blended = terms[0]
    for term in terms[1:]:
        blended = blended + term
    return blended


print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

RMSLE score on train data:
0.057587365695045294


In [11]:
def blend_models_predict1(X):
    """Return the second fixed-weight blend of all fitted models' predictions for ``X``.

    Same models as ``blend_models_predict`` with different weights: the seven
    individual models share 0.75 in nearly equal parts and the stacked model keeps 0.25.
    """
    # Built on every call so that a model name rebound later in the notebook is picked up.
    weighted_models = (
        (0.1090, elastic_model_full_data),
        (0.1091, lasso_model_full_data),
        (0.1083, ridge_model_full_data),
        (0.1090, svr_model_full_data),
        (0.1038, gbr_model_full_data),
        (0.1053, xgb_model_full_data),
        (0.1055, lgb_model_full_data),
        (0.25, stack_gen_model),
    )
    terms = [weight * model.predict(X) for weight, model in weighted_models]
    # Accumulate left to right, starting from the first term.
    blended = terms[0]
    for term in terms[1:]:
        blended = blended + term
    return blended


print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict1(X)))

RMSLE score on train data:
0.059261129076075496


In [12]:
# Use the sample submission file as the template for the output.
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')

# Predictions are on the log1p scale: expm1 undoes that, floor rounds the prices down.
# The result overwrites the template's second column.
log_price_pred = blend_models_predict(X_sub)
submission.iloc[:, 1] = np.floor(np.expm1(log_price_pred))
submission.to_csv("my_model_submission0.csv", index=False)

In [13]:
# Submission for the second weight set. This cell previously called
# blend_models_predict, which made my_model_submission1.csv a duplicate of
# my_model_submission0.csv; it now uses blend_models_predict1.
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')
# Predictions are on the log1p scale: expm1 undoes that, floor rounds the prices down.
submission.iloc[:,1] = np.floor(np.expm1(blend_models_predict1(X_sub)))
submission.to_csv("my_model_submission1.csv", index=False)

In [14]:
from sklearn.linear_model import LinearRegression


def _base_model_predictions(X):
    """Return a 2-D array with one column of predictions per fitted model.

    Column order: elastic net, lasso, ridge, SVR, gradient boosting, XGBoost,
    LightGBM, stacked model.
    """
    # Looked up on every call so that a model name rebound later is picked up.
    fitted_models = (
        elastic_model_full_data,
        lasso_model_full_data,
        ridge_model_full_data,
        svr_model_full_data,
        gbr_model_full_data,
        xgb_model_full_data,
        lgb_model_full_data,
        stack_gen_model,
    )
    return np.column_stack([model.predict(X) for model in fitted_models])


# Training data for the blender: each column is one model's prediction.
# NOTE: these are predictions on the same rows the models were fitted on.
blend_train = _base_model_predictions(X)

# Learn the linear blending weights.
lr = LinearRegression()
lr.fit(blend_train, y)

# Show the learned weights and intercept.
print("融合权重：", lr.coef_)
print("偏置：", lr.intercept_)


# Prediction function
def blend_models_predict2(X):
    """Blend the models' predictions for ``X`` with the weights learned by ``lr``."""
    return lr.predict(_base_model_predictions(X))


print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict2(X)))


融合权重： [-0.49027111  0.54690786 -0.10459108 -0.14624843  0.72452336  0.80470765
 -0.52473413  0.1913566 ]
偏置： -0.020349336863574763
RMSLE score on train data:
0.02682163641777133


In [15]:
# Submission for the linear-regression blend (blend_models_predict2).
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')
# Predictions are on the log1p scale: expm1 undoes that, floor rounds the prices down.
submission.iloc[:,1] = np.floor(np.expm1(blend_models_predict2(X_sub)))
# The file name was "my_model_submission.csv2" (digit after the extension); it now
# follows the my_model_submission<N>.csv pattern used by the other submission cells.
submission.to_csv("my_model_submission2.csv", index=False)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

# Fitted models in blending order.
fitted_models = [
    elastic_model_full_data,
    lasso_model_full_data,
    ridge_model_full_data,
    svr_model_full_data,
    gbr_model_full_data,
    xgb_model_full_data,
    lgb_model_full_data,
    stack_gen_model,
]

# Cross-validated (out-of-fold) predictions for every model; a step counter is
# printed after each model finishes.
step = 0
oof_columns = []
for fitted in fitted_models:
    oof_columns.append(cross_val_predict(fitted, X, y, cv=10, n_jobs=-1))
    step += 1
    print("step =", step)

(elastic_oof, lasso_oof, ridge_oof, svr_oof,
 gbr_oof, xgb_oof, lgb_oof, stack_oof) = oof_columns

# Concatenate all model outputs column-wise.
blend_input = np.column_stack(oof_columns)

# Use Ridge to learn the blending weights.
blend_model = Ridge(alpha=1e-3)
blend_model.fit(blend_input, y)

print("融合权重：", blend_model.coef_)
print("偏置：", blend_model.intercept_)

# Score the blender on the full-data models' predictions for the training rows.
# NOTE: these inputs are in-sample predictions, not the out-of-fold columns above.
blend_test_input = np.column_stack([fitted.predict(X) for fitted in fitted_models])

blend_pred = blend_model.predict(blend_test_input)
print("融合 RMSLE:", rmsle(y, blend_pred))


step = 1
step = 2
step = 3
step = 4
step = 5
step = 6


In [None]:
# Build the blender's input for the test set: one column of predictions per fitted model.
submission_models = [
    elastic_model_full_data,
    lasso_model_full_data,
    ridge_model_full_data,
    svr_model_full_data,
    gbr_model_full_data,
    xgb_model_full_data,
    lgb_model_full_data,
    stack_gen_model,
]
blend_test_input_sub = np.column_stack(
    [fitted.predict(X_sub) for fitted in submission_models]
)

# Predict the test set with the blending model.
# Despite its name, blend_models_predict3 is an array of predictions, not a function.
blend_models_predict3 = blend_model.predict(blend_test_input_sub)

# Predictions are on the log1p scale: expm1 undoes that, floor rounds the prices down.
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')
submission.iloc[:, 1] = np.floor(np.expm1(blend_models_predict3))
submission.to_csv("my_model_submission3.csv", index=False)

In [None]:
# Marker printed once the cells above have run.
print("Finished")

In [None]:
# 1. Predict labels for the test set with the blend_models_predict1 blend.
pseudo_labels = blend_models_predict1(X_sub)

# 2. Build the enlarged training set: original training rows plus the
#    pseudo-labelled test rows.
X_pseudo = pd.concat([X, X_sub], axis=0)
y_pseudo = pd.concat([y, pd.Series(pseudo_labels)], axis=0)

# Fitted models in blending order; each contributes one input column to the meta-model.
pseudo_stage_models = [
    elastic_model_full_data,
    lasso_model_full_data,
    ridge_model_full_data,
    svr_model_full_data,
    gbr_model_full_data,
    xgb_model_full_data,
    lgb_model_full_data,
    stack_gen_model,
]

# 3. Meta-model input: every model's predictions on the enlarged set.
blend_input = np.column_stack(
    [fitted.predict(X_pseudo) for fitted in pseudo_stage_models]
)

# 4. Fit the Ridge blending weights again on the enlarged set.
blend_model_final = Ridge(alpha=1e-3)
blend_model_final.fit(blend_input, y_pseudo)

# 5. Meta-model input for the test set.
blend_test_input = np.column_stack(
    [fitted.predict(X_sub) for fitted in pseudo_stage_models]
)

# 6. Final prediction and output (expm1 undoes the log1p target transform).
final_preds = blend_model_final.predict(blend_test_input)
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')
submission.iloc[:, 1] = np.floor(np.expm1(final_preds))
submission.to_csv("my_model_submission4.csv", index=False)

In [None]:
# Pseudo-labelling: label the test rows with the current blend, then refit every
# model on the training rows plus the pseudo-labelled test rows.
pseudo_labels = blend_models_predict1(X_sub)
X_combined = pd.concat([X, X_sub], axis=0).reset_index(drop=True)
y_combined = pd.concat([y, pd.Series(pseudo_labels)], axis=0).reset_index(drop=True)

# A step counter is printed before the first fit and after every fit.
step = 0
step += 1
print("step =", step)

# The fitted models are refitted in place, so the blend_models_predict* functions
# use the refitted versions from here on.
for fitted in (
    elastic_model_full_data,
    lasso_model_full_data,
    ridge_model_full_data,
    svr_model_full_data,
    gbr_model_full_data,
    xgb_model_full_data,
    lgb_model_full_data,
):
    fitted.fit(X_combined, y_combined)
    step += 1
    print("step =", step)

# A fresh stacking ensemble with the same configuration as stack_gen replaces the
# previously fitted stack_gen_model.
stack_gen_model = StackingRegressor(
    estimators=[
        ('ridge', ridge),
        ('lasso', lasso),
        ('elastic', elasticnet),
        ('gbr', gbr),
        ('xgb', xgboost),
        ('lgb', lightgbm)
    ],
    final_estimator=xgboost,
    cv=10,
    passthrough=True,
    verbose=1,
    n_jobs=-1
)

step += 1
print("step =", step)
stack_gen_model.fit(X_combined, y_combined)
step += 1
print("step =", step)

# Blend error on the original training rows only.
train_preds = blend_models_predict(X)
print("训练集融合 RMSLE:", rmsle(y, train_preds))

# Write the refit blend to my_model_submission5.csv only. The cell used to write the
# identical predictions to my_model_submission4.csv first, which overwrote the file
# produced by the pseudo-label meta-blend cell with a duplicate of submission 5.
submission = pd.read_csv('home-data-for-ml-course/sample_submission.csv')
submission.iloc[:,1] = np.floor(np.expm1(blend_models_predict1(X_sub)))
submission.to_csv("my_model_submission5.csv", index=False)