In [2]:
%pip install xgboost lightgbm catboost scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.9 MB ? eta -:--:--
   ----- ---------------------------------- 1.3/8.9 MB 6.7 MB/s eta 0:00:02
   ------------ --------------------------- 2.9/8.9 MB 6.5 MB/s eta 0:00:01
   ----------------------- ---------------- 5.2/8.9 MB 7.8 MB/s eta 0:00:01
   ------------------------------- -------- 7.1/8.9 MB 7.9 MB/s eta 0:00:01
   -------------------------------------- - 8.7/8.9 MB 8.0 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 7.8 MB/s  0:00:01
Downloading job

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler, LabelEncoder, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin, clone

In [8]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()


Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
# Log transform SalePrice (target)
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train["SalePrice"]

train_ID = train["Id"]
test_ID = test["Id"]

# Drop Id
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

ntrain = train.shape[0]
ntest = test.shape[0]

# Merge train and test for preprocessing
all_data = pd.concat((train.drop("SalePrice", axis=1), test)).reset_index(drop=True)
print("Combined data shape:", all_data.shape)


Combined data shape: (2919, 79)


In [10]:
# Fill categorical with 'None'
for col in ["PoolQC","MiscFeature","Alley","Fence","FireplaceQu",
            "GarageType","GarageFinish","GarageQual","GarageCond",
            "BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","MasVnrType"]:
    all_data[col] = all_data[col].fillna("None")

# Fill numerical with 0
for col in ["GarageYrBlt","GarageArea","GarageCars","BsmtFinSF1","BsmtFinSF2","BsmtUnfSF",
            "TotalBsmtSF","BsmtFullBath","BsmtHalfBath","MasVnrArea"]:
    all_data[col] = all_data[col].fillna(0)

# LotFrontage by neighborhood median
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# Remaining categorical with mode
for col in all_data.select_dtypes(include="object").columns:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

print("Missing values after handling:", all_data.isnull().sum().sum())


Missing values after handling: 0


In [11]:
# MSSubClass to string
all_data["MSSubClass"] = all_data["MSSubClass"].astype(str)

# Encode some ordinal categorical features
cols = ["FireplaceQu","BsmtQual","BsmtCond","GarageQual","GarageCond",
        "ExterQual","ExterCond","HeatingQC","KitchenQual","BsmtFinType1",
        "BsmtFinType2","Functional","Fence","BsmtExposure","GarageFinish",
        "LandSlope","LotShape","PavedDrive","PoolQC","Street","Alley",
        "CentralAir","MSSubClass","OverallCond","YrSold","MoSold"]

for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(all_data[c].astype(str))

# New feature
all_data["TotalSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]

# Skewness correction
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
from scipy.stats import skew
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewed_feats = skewed_feats[abs(skewed_feats) > 0.75].index

pt = PowerTransformer()
all_data[skewed_feats] = pt.fit_transform(all_data[skewed_feats])

# One-hot encoding
all_data = pd.get_dummies(all_data)
print("After encoding:", all_data.shape)

# Split back
X = all_data[:ntrain]
X_test = all_data[ntrain:]


After encoding: (2919, 223)


In [12]:
def rmsle_cv(model):
    kf = KFold(10, shuffle=True, random_state=42).get_n_splits(X.values)
    rmse = np.sqrt(-cross_val_score(model, X.values, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse


In [13]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
ridge = make_pipeline(RobustScaler(), Ridge(alpha=10))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=3))
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state=5)
model_xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                         learning_rate=0.05, max_depth=3, 
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, random_state=7, nthread=-1)
model_lgb = LGBMRegressor(objective='regression', num_leaves=5,
                          learning_rate=0.05, n_estimators=720,
                          max_bin=55, bagging_fraction=0.8,
                          bagging_freq=5, feature_fraction=0.2319,
                          feature_fraction_seed=9, bagging_seed=9,
                          min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

models = [("Lasso", lasso), ("Ridge", ridge), ("ElasticNet", ENet),
          ("GradientBoosting", GBoost), ("XGBoost", model_xgb), ("LightGBM", model_lgb)]


In [14]:
for name, model in models:
    score = rmsle_cv(model)
    print(f"{name} score: {score.mean():.5f} ({score.std():.5f})")


Lasso score: 0.12257 (0.02595)
Ridge score: 0.12517 (0.02521)
ElasticNet score: 0.12253 (0.02591)
GradientBoosting score: 0.12089 (0.02070)
XGBoost score: 0.12025 (0.01706)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.026862




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1469
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.018898




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.023673




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 178
[LightGBM] [Info] Start training from score 12.023758




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1470
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.023772




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.021391




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 178
[LightGBM] [Info] Start training from score 12.027937




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.028088




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1465
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 176
[LightGBM] [Info] Start training from score 12.019062




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.027133
LightGBM score: 0.12116 (0.01728)




In [15]:
class AveragingModels(BaseEstimator, RegressorMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        preds = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(preds, axis=1)

averaged_models = AveragingModels(models=(ENet, GBoost, model_xgb, model_lgb, lasso))
score = rmsle_cv(averaged_models)
print("Averaged base models score: {:.5f} ({:.5f})".format(score.mean(), score.std()))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.026862




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1469
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.018898




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.023673




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000942 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 178
[LightGBM] [Info] Start training from score 12.023758




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1470
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.023772




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.021391




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 178
[LightGBM] [Info] Start training from score 12.027937




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1479
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 179
[LightGBM] [Info] Start training from score 12.028088




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1465
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 176
[LightGBM] [Info] Start training from score 12.019062




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 177
[LightGBM] [Info] Start training from score 12.027133
Averaged base models score: 0.11584 (0.02058)




In [16]:
averaged_models.fit(X, y)
preds = np.expm1(averaged_models.predict(X_test))  # reverse log1p

submission = pd.DataFrame({"Id": test_ID, "SalePrice": preds})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file created: submission.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1487
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 181
[LightGBM] [Info] Start training from score 12.024057
✅ Submission file created: submission.csv
