### Imports / Setup

In [603]:
import joblib

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn

sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    OrdinalEncoder,
)
from category_encoders.target_encoder import TargetEncoder
from category_encoders.cat_boost import CatBoostEncoder

# Feature selection
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import f_regression, chi2

# Model learning extras
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    cross_val_score,
    KFold,
)

# Models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Metrics
from sklearn.metrics import (
    root_mean_squared_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    root_mean_squared_log_error
)


# tunning hyperparamters model
import optuna

In [550]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# both = pd.concat([train, test])

In [551]:
num_features = train.select_dtypes(exclude="object")
cat_features = train.select_dtypes(include="object")

### EDA

In [552]:
print(train.shape)

train.describe()

(1460, 81)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,6.517808,0.613014,1978.506164,1.767123,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,1.625393,0.644666,24.689725,0.747315,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1980.0,2.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,14.0,3.0,2010.0,4.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


#### Feature barplots

In [553]:
# cat_features = filled.select_dtypes(include="object")
# melted = cat_features.melt()

# g = sns.FacetGrid(melted, col='variable', col_wrap=6, sharex=False, sharey=False)

# g.map(sns.histplot, "value", discrete=True)

# for ax, var in zip(g.axes.flatten(), melted['variable'].unique()):
#     unique_vals = sorted(melted[melted['variable'] == var]['value'].unique(), key=lambda x: str(x))
#     ax.set_xticks(unique_vals)
#     ax.set_xticklabels(unique_vals, rotation=45, fontsize=8)

# plt.tight_layout()
# plt.show()

### TODO

________
* ~~Feature selection~~
* ~~Оптимизация гиперпараметров (optuna)~~
________

* ~~Проверить каждый признак на значения, которые не NaN, но по сути ими являются~~
* ~~Поменять TargetEncoder на CatBoostEncoder?~~
* ~~Сделать общий пайплайн для препроцессинга~~
* Сделать что-то полезное с YrSold

In [554]:
pd.DataFrame(
    data={
        "NaN_count": train.isna().sum(),
        "NaN_%": train.isna().sum() / len(train),
        "data_type": train.dtypes,
    }
).sort_values(by="NaN_count", ascending=False).head(20)

Unnamed: 0,NaN_count,NaN_%,data_type
PoolQC,1453,0.995205,object
MiscFeature,1406,0.963014,object
Alley,1369,0.937671,object
Fence,1179,0.807534,object
MasVnrType,872,0.59726,object
FireplaceQu,690,0.472603,object
LotFrontage,259,0.177397,float64
GarageYrBlt,81,0.055479,float64
GarageCond,81,0.055479,object
GarageType,81,0.055479,object


### Outliers + Feature Engineering

In [555]:
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

In [556]:
train["YrBltAndRemod"] = train["YearBuilt"] + train["YearRemodAdd"]
train["TotalSF"] = train["TotalBsmtSF"] + train["1stFlrSF"] + train["2ndFlrSF"]

train["Total_sqr_footage"] = (
    train["BsmtFinSF1"] + train["BsmtFinSF2"] + train["1stFlrSF"] + train["2ndFlrSF"]
)

train["Total_Bathrooms"] = (
    train["FullBath"]
    + (0.5 * train["HalfBath"])
    + train["BsmtFullBath"]
    + (0.5 * train["BsmtHalfBath"])
)

train["Total_porch_sf"] = (
    train["OpenPorchSF"]
    + train["3SsnPorch"]
    + train["EnclosedPorch"]
    + train["ScreenPorch"]
    + train["WoodDeckSF"]
)

# simplified features
train["haspool"] = train["PoolArea"].apply(lambda x: 1 if x > 0 else 0)
train["has2ndfloor"] = train["2ndFlrSF"].apply(lambda x: 1 if x > 0 else 0)
train["hasgarage"] = train["GarageArea"].apply(lambda x: 1 if x > 0 else 0)
train["hasbsmt"] = train["TotalBsmtSF"].apply(lambda x: 1 if x > 0 else 0)
train["hasfireplace"] = train["Fireplaces"].apply(lambda x: 1 if x > 0 else 0)

# # logs
# train['LotFrontage'] = np.log(train['LotFrontage'])
# train['LotArea'] = np.log(train['LotArea'])
# train['MasVnrArea'] = np.log(train['MasVnrArea'])
# train['BsmtUnfSF'] = np.log(train['BsmtUnfSF'])
# train['1stFlrSF'] = np.log(train['1stFlrSF'])
# train['GrLivArea'] = np.log(train['GrLivArea'])
# train['OpenPorchSF'] = np.log(train['OpenPorchSF'])
# train['WoodDeckSF'] = np.log(train['WoodDeckSF'])

In [557]:
test["YrBltAndRemod"] = test["YearBuilt"] + test["YearRemodAdd"]
test["TotalSF"] = test["TotalBsmtSF"] + test["1stFlrSF"] + test["2ndFlrSF"]

test["Total_sqr_footage"] = (
    test["BsmtFinSF1"] + test["BsmtFinSF2"] + test["1stFlrSF"] + test["2ndFlrSF"]
)

test["Total_Bathrooms"] = (
    test["FullBath"]
    + (0.5 * test["HalfBath"])
    + test["BsmtFullBath"]
    + (0.5 * test["BsmtHalfBath"])
)

test["Total_porch_sf"] = (
    test["OpenPorchSF"]
    + test["3SsnPorch"]
    + test["EnclosedPorch"]
    + test["ScreenPorch"]
    + test["WoodDeckSF"]
)

# simplified features
test["haspool"] = test["PoolArea"].apply(lambda x: 1 if x > 0 else 0)
test["has2ndfloor"] = test["2ndFlrSF"].apply(lambda x: 1 if x > 0 else 0)
test["hasgarage"] = test["GarageArea"].apply(lambda x: 1 if x > 0 else 0)
test["hasbsmt"] = test["TotalBsmtSF"].apply(lambda x: 1 if x > 0 else 0)
test["hasfireplace"] = test["Fireplaces"].apply(lambda x: 1 if x > 0 else 0)

In [558]:
X, y = train.drop("SalePrice", axis=1), train["SalePrice"]

### Preprocessing

#### Imputer + Dropper

In [559]:
# Drop rows?
# 398 - Electrical outlier

In [560]:
drop = [
    "Id",
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "Street",
    "Utilities",
    "Condition2",
    "RoofMatl",
    "Heating",
    "Functional",
    "YearBuilt",
    "YearRemodAdd",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "FullBath",
    "HalfBath",
    "BsmtFullBath",
    "BsmtHalfBath",
    "OpenPorchSF",
    "3SsnPorch",
    "EnclosedPorch",
    "ScreenPorch",
    "WoodDeckSF",
    "PoolArea",
    "2ndFlrSF",
    "GarageArea",
    "Fireplaces",
]
num_imp_avg = ["MasVnrArea"]
num_imp_no = ["LotFrontage", "GarageYrBlt"]
cat_imp_mode = ["MasVnrType"]
cat_imp_no = [
    "FireplaceQu",
    "GarageCond",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "BsmtFinType2",
    "BsmtExposure",
    "BsmtQual",
    "BsmtCond",
    "BsmtFinType1",
    "Electrical",
]

imputer = ColumnTransformer(
    transformers=[
        ("drop_features", "drop", drop),
        ("num_imp_avg", SimpleImputer(strategy="mean"), num_imp_avg),
        ("cat_imp_mode", SimpleImputer(strategy="most_frequent"), cat_imp_mode),
        ("cat_imp_no", SimpleImputer(strategy="constant", fill_value=None), cat_imp_no),
        ("num_imp_no", SimpleImputer(strategy="constant", fill_value=None), num_imp_no),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

In [561]:
filled = imputer.fit_transform(X)

#### Scaler + Encoder

In [562]:
ordinal_encoding_columns = ["ExterQual", "ExterCond", "KitchenQual", "HeatingQC"]
ordinal_encoding_columns_with_mv = [
    "BsmtQual",
    "BsmtCond",
    "FireplaceQu",
    "GarageCond",
    "GarageQual",
]
binary_encoding_columns = ["CentralAir"]
one_hot_encoding_columns = [
    "LotShape",
    "LandContour",
    "LandSlope",
    "MSZoning",
    "LotConfig",
    "Neighborhood",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "Exterior1st",
    "Exterior2nd",
    "BsmtFinType1",
    "BsmtFinType2",
    "SaleType",
]
target_encoding_columns = [
    "MasVnrType",
    "BsmtExposure",
    "GarageFinish",
    "PavedDrive",
    "Condition1",
    "Foundation",
    "Electrical",
    "GarageType",
    "SaleCondition",
]

standard_scaler_columns = [
    "MSSubClass",
    "LotFrontage",
    "LotArea",
    "OverallQual",
    "OverallCond",
    "MasVnrArea",
    "BsmtUnfSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "GarageYrBlt",
    "GarageCars",
    "MiscVal",
    "MoSold",
    "YrSold",
]

categories_with_missing = [["missing_value", "Po", "Fa", "TA", "Gd", "Ex"]] * len(
    ordinal_encoding_columns_with_mv
)
categories_without_missing = [["Po", "Fa", "TA", "Gd", "Ex"]] * len(
    ordinal_encoding_columns
)

scaler_and_encoder = ColumnTransformer(
    [
        (
            "ordinal_with_missing",
            OrdinalEncoder(
                categories=categories_with_missing,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            ordinal_encoding_columns_with_mv,
        ),
        (
            "ordinal_without_missing",
            OrdinalEncoder(
                categories=categories_without_missing,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            ordinal_encoding_columns,
        ),
        ("binary_encoding", OrdinalEncoder(), binary_encoding_columns),
        (
            "one_hot_encoding",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            one_hot_encoding_columns,
        ),
        ("target_encoding", TargetEncoder(), target_encoding_columns),
        ("scaling_num_columns", StandardScaler(), standard_scaler_columns),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

In [563]:
scaled_encoded = scaler_and_encoder.fit_transform(filled, y)

#### Extra dropper

In [564]:
extra_drop = [
    "BsmtFinType1_BLQ",
    "PavedDrive",
    "MSZoning_RL",
    "LandSlope_Gtl",
    "RoofStyle_Mansard",
    "Exterior2nd_CBlock",
]

extra_dropper = ColumnTransformer(
    transformers=[
        ("drop_features", "drop", extra_drop),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)

In [565]:
# for c in train.columns:
#     if train[c].nunique() < 5:
#         print(train.groupby(c)['SalePrice'].mean().sort_values(ascending=False))

#### Preprocessing pipeline

In [566]:
preprocessor = Pipeline(
    [
        ("imputer", imputer),
        ("scaler_and_encoder", scaler_and_encoder),
        ("extra_dropper", extra_dropper),
    ]
)

In [567]:
processed_X = preprocessor.fit_transform(X, y)

#### Train/test split + misc

In [568]:
X_train, X_valid, y_train, y_valid = train_test_split(
    processed_X, y, test_size=0.2, random_state=42
)

### Models

#### Training / metrics

In [569]:
# reg_rf = RandomForestRegressor()

In [570]:
# reg_xgb = XGBRegressor(
#     learning_rate=0.02789659888155338,
#     n_estimators=850,
#     max_depth=3,
#     min_child_weight=1,
#     gamma=0.0008205765310458492,
#     subsample=0.6779188114408448,
#     colsample_bytree=0.7354576738013417,
#     colsample_bylevel=0.6895591787450616,
#     reg_alpha=0.001500382145151174,
#     reg_lambda=0.005478476239762672,
# )

In [571]:
reg = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05117900387653215,
    depth=4,
    grow_policy='Lossguide',
    max_leaves=36,
    min_data_in_leaf=35,
    subsample=0.894900781520786,
    l2_leaf_reg=1.788234954064627e-06,
    loss_function='MAE',
)

In [602]:
reg.fit(X_train, y_train)

0:	learn: 53602.1155966	total: 4.12ms	remaining: 8.24s
1:	learn: 52169.2758328	total: 7.46ms	remaining: 7.45s
2:	learn: 50495.7456917	total: 9.8ms	remaining: 6.52s
3:	learn: 49063.4268476	total: 11.4ms	remaining: 5.69s
4:	learn: 47568.7165252	total: 13.4ms	remaining: 5.35s
5:	learn: 46215.0110877	total: 15.1ms	remaining: 5.02s
6:	learn: 44909.1063519	total: 17ms	remaining: 4.83s
7:	learn: 43706.5371093	total: 19.1ms	remaining: 4.75s
8:	learn: 42301.7195760	total: 20.8ms	remaining: 4.6s
9:	learn: 41151.1019631	total: 22.2ms	remaining: 4.42s
10:	learn: 40173.0471279	total: 24ms	remaining: 4.34s
11:	learn: 39116.8390911	total: 26ms	remaining: 4.31s
12:	learn: 38310.8419820	total: 28.1ms	remaining: 4.29s
13:	learn: 37265.1978716	total: 30.1ms	remaining: 4.27s
14:	learn: 36534.4095779	total: 32.1ms	remaining: 4.25s
15:	learn: 35865.9269868	total: 33.9ms	remaining: 4.21s
16:	learn: 34956.9140703	total: 36.7ms	remaining: 4.28s
17:	learn: 34112.9188972	total: 39.6ms	remaining: 4.36s
18:	learn:

<catboost.core.CatBoostRegressor at 0x35332f740>

In [573]:
y_pred = reg.predict(X_train)

(
    mean_absolute_percentage_error(y_train, y_pred),
    root_mean_squared_log_error(y_train, y_pred),
)

(0.052292031702611104, 0.09198068167232969)

In [574]:
y_pred = reg.predict(X_train)

(
    mean_absolute_percentage_error(y_train, y_pred),
    root_mean_squared_log_error(y_train, y_pred),
)

(0.10904274610848222, 0.14810184311436836)

#### Optuna

In [575]:
# def objective(trial):
#     # Learning parameters
#     learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
#     n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)

#     # Tree parameters
#     max_depth = trial.suggest_int("max_depth", 3, 15)
#     min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
#     gamma = trial.suggest_float("gamma", 0, 5)

#     # Sampling parameters
#     subsample = trial.suggest_float("subsample", 0.6, 1.0)
#     colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
#     colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.6, 1.0)

#     # Regularization parameters
#     reg_alpha = trial.suggest_float("reg_alpha", 0.0001, 1.0, log=True)
#     reg_lambda = trial.suggest_float("reg_lambda", 0.0001, 1.0, log=True)

#     # Build model with trial parameters
#     model = XGBRegressor(
#         learning_rate=learning_rate,
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_child_weight=min_child_weight,
#         gamma=gamma,
#         subsample=subsample,
#         colsample_bytree=colsample_bytree,
#         colsample_bylevel=colsample_bylevel,
#         reg_alpha=reg_alpha,
#         reg_lambda=reg_lambda,
#         random_state=42,
#         n_jobs=-1
#     )

#     # Cross-validation setup
#     cv = KFold(n_splits=5, random_state=666, shuffle=True)

#     # Evaluate using negative MSE (to maximize)
#     scores = cross_val_score(
#         model, processed_X, np.log(y),
#         scoring="neg_root_mean_squared_error",
#         cv=cv,
#         n_jobs=-1
#     )

#     # Return the mean score
#     return scores.mean()

# # Create study with maximization objective
# study = optuna.create_study(direction="maximize")

# # Run optimization with 500 trials
# study.optimize(objective, n_trials=600)
# ,
# # Get best trial information
# best_trial = study.best_trial
# print(f"Best Score (neg_RMSE): {best_trial.value:.6f}")
# print("Best hyperparameters:")
# for key, value in best_trial.params.items():
#     print(f"    {key}={value},")

In [576]:
# def objective(trial):
#     # Learning parameters
#     learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
#     # iterations = trial.suggest_int("iterations", 100, 2000, step=100)

#     # Tree parameters
#     depth = trial.suggest_int("depth", 4, 10)

#     # Handle grow_policy and dependent parameters correctly
#     grow_policy = trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])

#     # Only suggest max_leaves when grow_policy is Lossguide
#     if grow_policy == "Lossguide":
#         max_leaves = trial.suggest_int("max_leaves", 10, 64)
#     else:
#         max_leaves = None

#     # Common parameters
#     min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 1, 50)

#     # Sampling parameters
#     subsample = trial.suggest_float("subsample", 0.6, 1.0)

#     # Regularization parameters
#     l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True)

#     # Loss function - keep it simpler to avoid conflicts
#     loss_function = trial.suggest_categorical("loss_function", ["RMSE", "MAE"])

#     # Build params dictionary
#     params = {
#         "learning_rate": learning_rate,
#         # "iterations": iterations,
#         "depth": depth,
#         "grow_policy": grow_policy,
#         "min_data_in_leaf": min_data_in_leaf,
#         "subsample": subsample,
#         "l2_leaf_reg": l2_leaf_reg,
#         "loss_function": loss_function,
#         "random_seed": 42,
#         "verbose": 0,
#         "thread_count": -1  # Use all available cores
#     }

#     # Add max_leaves only if grow_policy is Lossguide
#     if max_leaves is not None:
#         params["max_leaves"] = max_leaves

#     # Build model with trial parameters
#     try:
#         model = CatBoostRegressor(**params)

#         # Cross-validation setup
#         cv = KFold(n_splits=5, random_state=666, shuffle=True)

#         # Evaluate using negative MSE (to maximize)
#         scores = cross_val_score(
#             model, processed_X, np.log(y),
#             scoring="neg_root_mean_squared_error",
#             cv=cv,
#             n_jobs=1  # CatBoost handles parallelization internally
#         )

#         # Return the mean score
#         return scores.mean()

#     except Exception as e:
#         # Return a very poor score if an error occurs
#         print(f"Trial failed with error: {str(e)}")
#         return float('-inf')

# # Create study with maximization objective
# study = optuna.create_study(direction="maximize")

# # Optuna provides a way to catch exceptions and continue
# study.optimize(objective, n_trials=500, catch=(Exception,))

# # Check if we have any successful trials
# if len(study.trials) > 0 and study.best_value > float('-inf'):
#     # Get best trial information
#     best_trial = study.best_trial
#     print(f"Best Score (neg_mean_squared_error)={best_trial.value:.8f},")
#     print("Best hyperparameters:")
#     for key, value in best_trial.params.items():
#         print(f"    {key}:={value},")

#     # Build best model
#     best_params = best_trial.params.copy()

#     # Handle grow_policy and max_leaves correctly
#     if best_params["grow_policy"] != "Lossguide" and "max_leaves" in best_params:
#         del best_params["max_leaves"]
# else:
#     print("No successful trials found.")

### Feature selection

In [596]:
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

# Function to remove high VIF features iteratively
def remove_high_vif_features(df, threshold=5):
    while True:
        vif_scores = calculate_vif(df)
        max_vif = vif_scores["VIF"].max()
        
        if max_vif < threshold:
            break  # Stop if all VIFs are below the threshold
        
        # Drop feature with the highest VIF
        feature_to_drop = vif_scores.loc[vif_scores["VIF"].idxmax(), "Feature"]
        df = df.drop(columns=[feature_to_drop])
        print(f"Dropped {feature_to_drop} (VIF = {max_vif:.2f})")
    
    return df

# Get initial VIF scores
print("Initial VIF scores:")
print(calculate_vif(processed_X))

# Remove high VIF features
X_clean = remove_high_vif_features(processed_X)

# Print final dataset shape
print(f"Final dataset shape: {X_clean.shape}")

Initial VIF scores:


  vif = 1. / (1. - r_squared_i)


          Feature          VIF
0        BsmtQual     6.308477
1        BsmtCond     4.917137
2     FireplaceQu    15.645233
3      GarageCond    18.879945
4      GarageQual    18.245368
..            ...          ...
154       haspool     1.220878
155   has2ndfloor    27.222442
156     hasgarage  1650.707108
157       hasbsmt          inf
158  hasfireplace    14.940970

[159 rows x 2 columns]


  vif = 1. / (1. - r_squared_i)


Dropped LotShape_IR1 (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped LandContour_Bnk (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped LotConfig_Corner (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped Neighborhood_Blmngtn (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped BldgType_1Fam (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped HouseStyle_1.5Fin (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped Exterior1st_AsbShng (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped Exterior1st_CBlock (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped BsmtFinType1_missing_value (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped BsmtFinType2_ALQ (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Dropped SaleType_COD (VIF = inf)
Dropped TotalSF (VIF = 2635541.47)
Dropped YrBltAndRemod (VIF = 18999.32)
Dropped hasgarage (VIF = 4319.86)
Dropped hasbsmt (VIF = 1460.07)
Dropped SaleCondition (VIF = 519.61)
Dropped Exterior2nd_VinylSd (VIF = 427.64)
Dropped GarageCond (VIF = 301.84)
Dropped Condition1 (VIF = 236.02)
Dropped Total_sqr_footage (VIF = 189.97)
Dropped GarageQual (VIF = 178.01)
Dropped RoofStyle_Gable (VIF = 168.08)
Dropped Electrical (VIF = 145.14)
Dropped BsmtCond (VIF = 122.46)
Dropped BsmtQual (VIF = 91.69)
Dropped Exterior1st_VinylSd (VIF = 81.23)
Dropped MasVnrType (VIF = 72.63)
Dropped BsmtFinType2_Unf (VIF = 70.06)
Dropped BsmtExposure (VIF = 67.70)
Dropped Foundation (VIF = 66.98)
Dropped GarageType (VIF = 64.58)
Dropped ExterQual (VIF = 61.75)
Dropped HouseStyle_1Story (VIF = 46.60)
Dropped GarageFinish (VIF = 43.47)
Dropped ExterCond (VIF = 42.32)
Dropped Exterior2nd_MetalSd (VIF = 40.46)
Dropped KitchenQual (VIF = 36.80)
Dropped hasfireplace (VIF = 30.94)
Dro

In [577]:
# Lasso
lasso = Lasso(alpha=0.3)
lasso.fit(processed_X, y)

lasso_imp = pd.Series(dict(zip(processed_X.columns, lasso.coef_))).sort_values(
    key=lambda x: abs(x), ascending=False
)

  model = cd_fast.enet_coordinate_descent(


In [578]:
lasso_imp.tail(30)

LandContour_Bnk        -485.096683
MiscVal                -298.599966
Exterior2nd_Brk Cmn    -285.841777
LotShape_Reg           -281.741055
LowQualFinSF            279.136160
BsmtFinType1_ALQ       -150.569120
BsmtCond                114.943718
SaleType_New           -114.060035
Neighborhood_Somerst   -112.279941
YrSold                 -101.712573
Exterior1st_Plywood      91.866090
YrBltAndRemod            63.390049
Neighborhood_IDOTRR      51.993903
LandContour_Lvl          28.993827
TotalSF                  27.264588
Total_porch_sf           11.574532
Total_sqr_footage        10.893704
BsmtExposure              0.223326
Condition1                0.178210
SaleCondition             0.154952
MasVnrType                0.063522
Electrical               -0.041653
Foundation                0.022902
GarageType               -0.016926
GarageFinish             -0.009847
SaleType_ConLI           -0.000000
LotConfig_Inside         -0.000000
Exterior1st_CemntBd      -0.000000
HouseStyle_2Story   

In [579]:
# Permutation importance
r = permutation_importance(reg, X_valid, y_valid, n_repeats=1, random_state=0)

per_imp = pd.Series(dict(zip(processed_X.columns, r["importances_mean"]))).sort_values(
    key=lambda x: abs(x), ascending=False
)

In [580]:
per_imp.tail(50)

LotShape_IR3                 -4.379929e-05
BsmtCond                      4.003110e-05
RoofStyle_Flat                3.958049e-05
Exterior2nd_Stucco            3.804313e-05
Exterior1st_VinylSd           3.767571e-05
LandSlope_Sev                -3.646946e-05
BsmtFinType2_missing_value    2.777655e-05
LandContour_Low               2.421781e-05
Exterior2nd_MetalSd           2.285854e-05
BsmtFinType2_LwQ             -2.103345e-05
Exterior2nd_BrkFace           1.976541e-05
Exterior1st_Stucco           -1.599292e-05
MSZoning_RH                  -1.559748e-05
Exterior2nd_CmentBd           1.442266e-05
Exterior2nd_ImStucc          -1.424312e-05
Exterior2nd_Plywood          -1.309758e-05
CentralAir                   -1.027938e-05
Neighborhood_SWISU           -9.094427e-06
HouseStyle_1.5Unf             8.252545e-06
BsmtFinType1_missing_value   -3.825472e-06
BsmtFinType1_Unf             -3.464516e-06
HouseStyle_SFoyer            -3.298056e-06
HouseStyle_2.5Fin             3.131739e-06
Exterior2nd

In [581]:
a = per_imp.tail(60).index
b = lasso_imp[abs(lasso_imp) < 0.06].index

a[a.isin(b)]

Index(['BsmtFinType2_Rec', 'Exterior1st_CemntBd', 'LotConfig_Inside'], dtype='object')

In [582]:
cb_f_imp = pd.DataFrame(
    data={"feature name": reg.feature_names_, "importance": reg.feature_importances_}
)

In [583]:
cb_f_imp.sort_values(by="importance", ascending=False).tail(30)["feature name"].tolist()

['Exterior1st_AsbShng',
 'SaleType_ConLD',
 'Neighborhood_SWISU',
 'Exterior2nd_MetalSd',
 'Exterior1st_CBlock',
 'SaleType_Con',
 'HouseStyle_SFoyer',
 'BsmtFinType2_ALQ',
 'SaleType_ConLw',
 'Exterior2nd_ImStucc',
 'BldgType_2fmCon',
 'Exterior2nd_AsbShng',
 'BsmtFinType1_missing_value',
 'LotConfig_FR3',
 'Exterior2nd_BrkFace',
 'Neighborhood_NPkVill',
 'Neighborhood_BrDale',
 'Exterior2nd_Stone',
 'Exterior2nd_Brk Cmn',
 'haspool',
 'SaleType_Oth',
 'hasbsmt',
 'Neighborhood_Blueste',
 'Exterior1st_AsphShn',
 'RoofStyle_Shed',
 'Exterior1st_BrkComm',
 'Exterior1st_Stone',
 'Exterior2nd_AsphShn',
 'Exterior2nd_Other',
 'Exterior1st_ImStucc']

### Inference

In [584]:
reg.fit(processed_X, np.log(y))

0:	learn: 0.2993041	total: 1.38ms	remaining: 2.75s
1:	learn: 0.2887494	total: 3.3ms	remaining: 3.29s
2:	learn: 0.2789056	total: 4.96ms	remaining: 3.3s
3:	learn: 0.2699554	total: 6.37ms	remaining: 3.18s
4:	learn: 0.2611619	total: 7.88ms	remaining: 3.15s
5:	learn: 0.2537083	total: 9.26ms	remaining: 3.08s
6:	learn: 0.2459576	total: 10.8ms	remaining: 3.09s
7:	learn: 0.2382323	total: 12.5ms	remaining: 3.12s
8:	learn: 0.2312806	total: 13.8ms	remaining: 3.05s
9:	learn: 0.2245330	total: 15.5ms	remaining: 3.09s
10:	learn: 0.2179880	total: 17.1ms	remaining: 3.09s
11:	learn: 0.2116234	total: 18.7ms	remaining: 3.1s
12:	learn: 0.2055569	total: 20.7ms	remaining: 3.17s
13:	learn: 0.2002155	total: 22ms	remaining: 3.12s
14:	learn: 0.1949532	total: 23.5ms	remaining: 3.11s
15:	learn: 0.1891662	total: 25.1ms	remaining: 3.12s
16:	learn: 0.1839251	total: 26.9ms	remaining: 3.14s
17:	learn: 0.1798100	total: 28.5ms	remaining: 3.13s
18:	learn: 0.1752059	total: 29.9ms	remaining: 3.11s
19:	learn: 0.1712445	total:

<catboost.core.CatBoostRegressor at 0x35332f740>

In [585]:
result = pd.DataFrame(test["Id"])
prep_test = preprocessor.transform(test)

In [586]:
result["SalePrice"] = np.exp(reg.predict(prep_test))

In [587]:
result.to_csv("submission-catb-xx.csv", index=False)