In [None]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# Load the train/test feature tables and the training target from ./data.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/X_train.csv", "./data/X_test.csv")
)

# The target is read as a DataFrame and flattened to a 1-D array, which is the
# shape scikit-learn regressors expect for `y`.
# NOTE(review): the file name suggests the target is log-transformed — confirm
# against the preprocessing notebook.
y_train_frame = pd.read_csv("./data/y_train_log.csv")
y_train = y_train_frame.to_numpy().ravel()

In [None]:
# One seed shared by every stochastic base learner so that the CV scores and
# the final submission are reproducible after "Restart Kernel -> Run All".
SEED = 42

# Level-0 (base) learners of the stack, as (name, estimator) pairs:
#   - ridge / lasso: linear models that pick their own regularisation strength
#     from the given alpha grid by internal cross-validation;
#   - rf / xgb / lgb: tree ensembles with shallow trees and a low learning rate.
estimators = [
    ("ridge", RidgeCV(alphas=[5, 10, 20])),
    ("lasso", LassoCV(alphas=[0.001, 0.005, 0.01])),
    (
        "rf",
        # Without random_state the forest is re-randomised on every fit.
        RandomForestRegressor(n_estimators=100, max_depth=5, random_state=SEED),
    ),
    (
        "xgb",
        xgb.XGBRegressor(
            n_estimators=200,
            max_depth=3,
            learning_rate=0.05,
            random_state=SEED,
        ),
    ),
    (
        "lgb",
        lgb.LGBMRegressor(
            n_estimators=200,
            learning_rate=0.05,
            num_leaves=31,
            random_state=SEED,
            # The stack refits this model many times (inner and outer CV);
            # silence the per-fit [Info] log lines that flood the cell output.
            verbose=-1,
        ),
    ),
]

In [None]:
# The meta-model (level-1 learner) is a plain linear regression.
meta_model = LinearRegression()

# Stack the base learners defined in `estimators` under the meta-model.
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    cv=5,  # folds used internally to build the base-learner predictions the meta-model trains on
    passthrough=True,  # also feed the original X columns to the meta-model
)

In [None]:
# Evaluate the whole stack with 5-fold cross-validation on the training data.
# scikit-learn reports this metric as *negative* RMSE (higher is better), so
# the sign is flipped when printing to show ordinary RMSE values.
scores = cross_val_score(
    estimator=stacking_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# Per-fold RMSE followed by the mean over folds.
print(f"Stacking RMSE scores: {-scores}")
print(f"Average RMSE: {-scores.mean():.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3458
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 197
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3232
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 179
[LightGBM] [Info] Start training from score 12.019199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3227
[LightGBM] [Info] Number of data points in the train se

In [None]:
# Refit the stack on the full training set, then predict the test set.
# `fit` returns the estimator itself, so the two calls can be chained.
y_pred_log = stacking_model.fit(X_train, y_train).predict(X_test)

# Undo the target transform: expm1(x) = exp(x) - 1.
# NOTE(review): this presumes the target was transformed with log1p upstream —
# confirm against the preprocessing step.
y_pred = np.expm1(y_pred_log)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3744
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 203
[LightGBM] [Info] Start training from score 12.024057
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3458
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 197
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3468
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start t

In [None]:
# Build the submission table: the Id column of the original test file paired
# with the predicted SalePrice, in the same row order.
test_origin = pd.read_csv("./data/test.csv")
submission = pd.DataFrame({"Id": test_origin["Id"], "SalePrice": y_pred})

# Create the output directory first: `to_csv` raises OSError when the parent
# directory is missing, which would otherwise fail only after all the training
# above has already run.
submission_path = Path("./submission/stacking_submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

# Preview the first rows as the cell output.
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,122568.518521
1,1462,106812.273492
2,1463,184405.301516
3,1464,199211.862252
4,1465,195079.967258
