In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load datasets
train_df_raw = pd.read_csv("../data/train.csv", index_col="Id")
test_df_raw = pd.read_csv("../data/test.csv", index_col="Id")

train_X_raw, train_y_raw = train_df_raw.drop("SalePrice", axis=1), train_df_raw["SalePrice"]
test_X_raw = test_df_raw.copy()

In [3]:
# Check shape of train and test df
print(f"train_X shape : {train_X_raw.shape}")
print(f"test_X shape : {test_X_raw.shape}")

train_X shape : (1460, 79)
test_X shape : (1459, 79)


In [4]:
numeric_features = train_X_raw.select_dtypes(include="number").columns
object_features = train_X_raw.select_dtypes(include="object").columns

print(f"Number of numeric features : {len(numeric_features)}")
print(f"Number of object features : {len(object_features)}")

Number of numeric features : 36
Number of object features : 43


In [5]:
# Check info of train_X_raw
train_X_raw_numeric_info = pd.DataFrame({
    "missing_sum": train_X_raw[numeric_features].isna().sum(),
    "missing_percent": train_X_raw[numeric_features].isna().sum() / len(train_X_raw) * 100,
    "dtypes": train_X_raw[numeric_features].dtypes,
    **train_X_raw[numeric_features].describe().T,
}).sort_values(by=["dtypes", "missing_sum"])

train_X_raw_numeric_info

Unnamed: 0,missing_sum,missing_percent,dtypes,count,mean,std,min,25%,50%,75%,max
MSSubClass,0,0.0,int64,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotArea,0,0.0,int64,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,0,0.0,int64,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,0,0.0,int64,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,0,0.0,int64,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,0,0.0,int64,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
BsmtFinSF1,0,0.0,int64,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0
BsmtFinSF2,0,0.0,int64,1460.0,46.549315,161.319273,0.0,0.0,0.0,0.0,1474.0
BsmtUnfSF,0,0.0,int64,1460.0,567.240411,441.866955,0.0,223.0,477.5,808.0,2336.0
TotalBsmtSF,0,0.0,int64,1460.0,1057.429452,438.705324,0.0,795.75,991.5,1298.25,6110.0


In [6]:
# Check info of train_X_raw
train_X_raw_object_info = pd.DataFrame({
    "missing_sum": train_X_raw[object_features].isna().sum(),
    "missing_percent": train_X_raw[object_features].isna().sum() / len(train_X_raw) * 100,
    "dtypes": train_X_raw[object_features].dtypes,
    **train_X_raw[object_features].describe().T,
}).sort_values(by=["dtypes", "missing_sum"])

train_X_raw_object_info

Unnamed: 0,missing_sum,missing_percent,dtypes,count,unique,top,freq
MSZoning,0,0.0,object,1460,5,RL,1151
Street,0,0.0,object,1460,2,Pave,1454
LotShape,0,0.0,object,1460,4,Reg,925
LandContour,0,0.0,object,1460,4,Lvl,1311
Utilities,0,0.0,object,1460,2,AllPub,1459
LotConfig,0,0.0,object,1460,5,Inside,1052
LandSlope,0,0.0,object,1460,3,Gtl,1382
Neighborhood,0,0.0,object,1460,25,NAmes,225
Condition1,0,0.0,object,1460,9,Norm,1260
Condition2,0,0.0,object,1460,8,Norm,1445


In [7]:
# Check info of test_X_raw
test_X_raw_numeric_info = pd.DataFrame({
    "missing_sum": test_X_raw[numeric_features].isna().sum(),
    "missing_percent": test_X_raw[numeric_features].isna().sum() / len(test_X_raw) * 100,
    "dtypes": test_X_raw[numeric_features].dtypes,
    **test_X_raw[numeric_features].describe().T,
}).sort_values(by=["dtypes", "missing_sum"])

test_X_raw_numeric_info

Unnamed: 0,missing_sum,missing_percent,dtypes,count,mean,std,min,25%,50%,75%,max
MSSubClass,0,0.0,int64,1459.0,57.378341,42.74688,20.0,20.0,50.0,70.0,190.0
LotArea,0,0.0,int64,1459.0,9819.161069,4955.517327,1470.0,7391.0,9399.0,11517.5,56600.0
OverallQual,0,0.0,int64,1459.0,6.078821,1.436812,1.0,5.0,6.0,7.0,10.0
OverallCond,0,0.0,int64,1459.0,5.553804,1.11374,1.0,5.0,5.0,6.0,9.0
YearBuilt,0,0.0,int64,1459.0,1971.357779,30.390071,1879.0,1953.0,1973.0,2001.0,2010.0
YearRemodAdd,0,0.0,int64,1459.0,1983.662783,21.130467,1950.0,1963.0,1992.0,2004.0,2010.0
1stFlrSF,0,0.0,int64,1459.0,1156.534613,398.16582,407.0,873.5,1079.0,1382.5,5095.0
2ndFlrSF,0,0.0,int64,1459.0,325.967786,420.610226,0.0,0.0,0.0,676.0,1862.0
LowQualFinSF,0,0.0,int64,1459.0,3.543523,44.043251,0.0,0.0,0.0,0.0,1064.0
GrLivArea,0,0.0,int64,1459.0,1486.045922,485.566099,407.0,1117.5,1432.0,1721.0,5095.0


In [8]:
# Check info of test_X_raw
test_X_raw_object_info = pd.DataFrame({
    "missing_sum": test_X_raw[object_features].isna().sum(),
    "missing_percent": test_X_raw[object_features].isna().sum() / len(test_X_raw) * 100,
    "dtypes": test_X_raw[object_features].dtypes,
    **test_X_raw[object_features].describe().T,
}).sort_values(by=["dtypes", "missing_sum"])

test_X_raw_object_info

Unnamed: 0,missing_sum,missing_percent,dtypes,count,unique,top,freq
Street,0,0.0,object,1459,2,Pave,1453
LotShape,0,0.0,object,1459,4,Reg,934
LandContour,0,0.0,object,1459,4,Lvl,1311
LotConfig,0,0.0,object,1459,5,Inside,1081
LandSlope,0,0.0,object,1459,3,Gtl,1396
Neighborhood,0,0.0,object,1459,25,NAmes,218
Condition1,0,0.0,object,1459,9,Norm,1251
Condition2,0,0.0,object,1459,5,Norm,1444
BldgType,0,0.0,object,1459,5,1Fam,1205
HouseStyle,0,0.0,object,1459,7,1Story,745


In [9]:
# Check all features with missing data from train_X
train_missing_features = set(train_X_raw.columns[train_X_raw.isna().any()])
train_missing_features

{'Alley',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'Electrical',
 'Fence',
 'FireplaceQu',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'LotFrontage',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'PoolQC'}

In [10]:
# Check all features with missing data from test_X
test_missing_features = set(test_X_raw.columns[test_X_raw.isna().any()])
test_missing_features

{'Alley',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Functional',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'KitchenQual',
 'LotFrontage',
 'MSZoning',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'PoolQC',
 'SaleType',
 'TotalBsmtSF',
 'Utilities'}

In [11]:
# Check total number of missing features
df_missing_features = train_missing_features | test_missing_features
print(f"Number of missing features in train and test: {df_missing_features}")
df_missing_features

Number of missing features in train and test: {'MasVnrType', 'GarageFinish', 'BsmtFinSF1', 'Alley', 'BsmtQual', 'GarageQual', 'GarageYrBlt', 'TotalBsmtSF', 'BsmtFinType2', 'MasVnrArea', 'GarageArea', 'BsmtExposure', 'Functional', 'Fence', 'LotFrontage', 'MiscFeature', 'BsmtFinSF2', 'FireplaceQu', 'GarageCond', 'PoolQC', 'BsmtFullBath', 'Utilities', 'BsmtCond', 'GarageType', 'KitchenQual', 'BsmtHalfBath', 'GarageCars', 'Exterior2nd', 'Exterior1st', 'Electrical', 'SaleType', 'BsmtFinType1', 'MSZoning', 'BsmtUnfSF'}


{'Alley',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'Electrical',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Functional',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'KitchenQual',
 'LotFrontage',
 'MSZoning',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'PoolQC',
 'SaleType',
 'TotalBsmtSF',
 'Utilities'}

In [12]:
# Check the missing features in test but not in train
print(f"Number of missing features in test and not in train: {len(test_missing_features - train_missing_features)}")
test_missing_features - train_missing_features

Number of missing features in test and not in train: 15


{'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'Exterior1st',
 'Exterior2nd',
 'Functional',
 'GarageArea',
 'GarageCars',
 'KitchenQual',
 'MSZoning',
 'SaleType',
 'TotalBsmtSF',
 'Utilities'}

**Note:** some numeric features should be converted to ordinal features. From examining the description of each feature, we will breakdown the numeric features into `numeric` and `ordinal`.

### 1.3 Summary

**Preprocessing Plan:**

Since `train` and `test` datasets contain different missing values, we need to account for all features in our preprocessing step. There are also many numeric features that are ordinal features. Amongst these features, there are values in the `test` dataset that are not in the `train` dataset. Therefore, to capture all values for these features we will need to perform an encodings on the joint dataset.

In [13]:
def merge(dfs: list[pd.DataFrame]) -> tuple[pd.DataFrame, list[int]]:
    """Concatonate dataframes along rows.

    Parameters
    ----------
    dfs : list[pd.DataFrame]

    Returns
    -------
    pd.DataFrame
        The merged dataframe.
    list[int]
        Indices to split between the dataframes.

    """

    indices = [0]
    for df in dfs:
        indices.append(len(df) + indices[-1]) 
    return pd.concat(dfs), indices

def split(df: pd.DataFrame, indices: list[int]) -> list[pd.DataFrame]:
    """Split dataframe along rows at index.

    Parameters
    ----------
    df : pd.DataFrame   
    indices : list[int]

    Returns
    -------
    list[pd.DataFrame]

    """

    return [df.iloc[i:j] for i, j in zip(indices, indices[1:])]

In [14]:
# Concatonate train and test datasets to ensure categorical and one-hot encoding covers all values
df, indices = merge([train_X_raw, test_X_raw])

In [15]:
# Define the ordinal_numeric, regular_numeric, ordinal_object, nominal_object features
ordinal_numeric_threshold = 15
ordinal_numeric_features = df[numeric_features].columns[df[numeric_features].max() <= ordinal_numeric_threshold].tolist()
regular_numeric_features = df[numeric_features].columns[df[numeric_features].max() > ordinal_numeric_threshold].tolist()

# NOTE: MSSubClass should be an ordinal numeric_feature
ordinal_numeric_features.append("MSSubClass")
regular_numeric_features.remove("MSSubClass")

# Ordinal object features
ordinal_object_features = {
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "LandSlope": ["Sev", "Mod", "Gtl"], #
    "ExterQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Foundation": ["Wood", "Stone", "Slab", "PConc", "CBlock", "BrkTil"], #
    "BsmtQual": ["NA", "Po", "Fa", "TA", "Gd", "Ex"], # bias towards NA < Po
    "BsmtCond": ["NA", "Po", "Fa", "TA", "Gd", "Ex"], # bias towards NA < Po
    "BsmtExposure": ["NA", "No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "FireplaceQu": ["NA", "Po", "Fa", "TA", "Gd", "Ex"], # bias towards NA < Po
    "GarageType": ["NA", "Detchd", "CarPort", "BuiltIn", "Basment", "Attchd", "2Types"], #
    "GarageFinish": ["NA", "Unf", "RFn", "Fin"],
    "GarageQual": ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
    "GarageCond": ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
    "PavedDrive": ["N", "P", "Y"], #
    "PoolQC": ["NA", "Fa", "TA", "Gd", "Ex"],
    "Fence": ["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"]
}

nominal_object_features = [
    "MSZoning", #
    "Street", #
    "Alley", #
    "LotShape", #
    "LandContour", #
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType", #
    "HouseStyle", #
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Heating",
    "CentralAir", #
    "Electrical",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
]

print(f"Ordinal numeric features:\n{ordinal_numeric_features}")
print("========================================================")
print(f"Regular numeric features:\n{regular_numeric_features}")
print("========================================================")
print(f"Ordinal object features:\n{ordinal_object_features}")
print("========================================================")
print(f"Nominal object features:\n{nominal_object_features}")

Ordinal numeric features:
['OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold', 'MSSubClass']
Regular numeric features:
['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold']
Ordinal object features:
{'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'], 'LandSlope': ['Sev', 'Mod', 'Gtl'], 'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'], 'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'], 'Foundation': ['Wood', 'Stone', 'Slab', 'PConc', 'CBlock', 'BrkTil'], 'BsmtQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 'BsmtCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'], 'BsmtFinType1': ['NA', '

In [16]:
# Check that the feature split still cover all the features
len(ordinal_numeric_features) + len(regular_numeric_features) + len(ordinal_object_features) + len(nominal_object_features)

79

In [17]:
# Impute ordinal numeric features
ordinal_numeric_missing_features = list(set(ordinal_numeric_features) & df_missing_features)
ordinal_numeric_missing_features

['GarageCars', 'BsmtFullBath', 'BsmtHalfBath']

In [18]:
df[ordinal_numeric_missing_features] = df[ordinal_numeric_missing_features].fillna(df[ordinal_numeric_missing_features].mode().iloc[0].to_dict())

In [19]:
# Check if all missing data for ordinal object features are filled
df[ordinal_numeric_missing_features].isna().sum()

GarageCars      0
BsmtFullBath    0
BsmtHalfBath    0
dtype: int64

In [20]:
# Impute ordinal object features
ordinal_object_missing_features = list(set(ordinal_object_features) & df_missing_features)
ordinal_object_missing_features

['BsmtFinType2',
 'FireplaceQu',
 'GarageCond',
 'BsmtFinType1',
 'PoolQC',
 'BsmtExposure',
 'GarageFinish',
 'Fence',
 'Utilities',
 'BsmtCond',
 'GarageType',
 'Functional',
 'KitchenQual',
 'BsmtQual',
 'GarageQual']

In [21]:
special_ordinal_object_features = ["Functional", "KitchenQual", "Utilities"]
df["Functional"] = df["Functional"].fillna("Typ")
df["KitchenQual"] = df["KitchenQual"].fillna(df["KitchenQual"].mode()[0]) # bias for mode?
df["Utilities"] = df["Utilities"].fillna(df["Utilities"].mode()[0]) # bias for mode?

normal_ordinal_object_features = list(set(ordinal_object_missing_features) - set(special_ordinal_object_features))
df[normal_ordinal_object_features] = df[normal_ordinal_object_features].fillna("NA")

In [22]:
# Check if all missing data for ordinal object features are filled
df[ordinal_object_missing_features].isna().sum()

BsmtFinType2    0
FireplaceQu     0
GarageCond      0
BsmtFinType1    0
PoolQC          0
BsmtExposure    0
GarageFinish    0
Fence           0
Utilities       0
BsmtCond        0
GarageType      0
Functional      0
KitchenQual     0
BsmtQual        0
GarageQual      0
dtype: int64

In [23]:
# Impute nominal features
nominal_object_missing_features = list(set(nominal_object_features) & df_missing_features)
print(nominal_object_missing_features)

['Exterior1st', 'MiscFeature', 'Electrical', 'SaleType', 'MSZoning', 'MasVnrType', 'Alley', 'Exterior2nd']


In [24]:
na_nominal_object_features = ["Alley", "MasVnrType", "MiscFeature"]
df[na_nominal_object_features] = df[na_nominal_object_features].fillna("NA")

mf_nominal_object_features = list(set(nominal_object_missing_features) - set(na_nominal_object_features))
df[mf_nominal_object_features] = df[mf_nominal_object_features].fillna(df[mf_nominal_object_features].mode().iloc[0].to_dict())


In [25]:
# Check if all missing data for nominal object features are filled
df[nominal_object_missing_features].isna().sum()

Exterior1st    0
MiscFeature    0
Electrical     0
SaleType       0
MSZoning       0
MasVnrType     0
Alley          0
Exterior2nd    0
dtype: int64

In [26]:
# Encode ordinal numeric features into categories
# df[ordinal_numeric_features] = df[ordinal_numeric_features].astype("category")

# Encode ordinal object features into categories
# for feature, categories in ordinal_object_features.items():
#     df[feature] = pd.Categorical(df[feature], categories=categories)

# Encode nominal object features using one-hot encoding
# df = pd.get_dummies(df, columns=nominal_object_features)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [28]:
X_train_full, X_test = split(df, indices)
y_train_full = train_y_raw.copy()

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"X_valid shape : {X_valid.shape}")
print(f"y_valid shape : {y_valid.shape}")
print(f"X_test shape : {X_test.shape}")

X_train shape : (1168, 79)
y_train shape : (1168,)
X_valid shape : (292, 79)
y_valid shape : (292,)
X_test shape : (1459, 79)


In [29]:
# Numeric feature transformer
regular_numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("impute", SimpleImputer(strategy="mean"))
])

# Ordinal numeric missing features
ordinal_numeric_transformer = Pipeline(steps=[
    ("encode", OrdinalEncoder()) # already encoded as category (?)
])

ordinal_object_feature_keys, ordinal_object_feature_values = zip(*list(ordinal_object_features.items()))
ordinal_object_transformer = Pipeline(steps=[
    ("encode", OrdinalEncoder(categories=list(ordinal_object_feature_values)))
])

nominal_object_transformer = Pipeline(steps=[
    ("encode", OneHotEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ("regular_numeric_transformer", regular_numeric_transformer, regular_numeric_features),
    ("ordinal_numeric_transformer", ordinal_numeric_transformer, ordinal_numeric_missing_features),
    ("ordinal_object_transformer", ordinal_object_transformer, list(ordinal_object_feature_keys)),
    ("nominal_object_transformer", nominal_object_transformer, nominal_object_features),
], remainder="passthrough")

### Modeling

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from catboost import CatBoostRegressor

In [31]:
def log_rmse(true, pred):
    return np.sqrt(mean_squared_error(np.log(true), np.log(pred)))

scorer = make_scorer(log_rmse)

In [32]:
# Build a baseline model
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", CatBoostRegressor(verbose=0, random_seed=42))
])

# Fit model on training set
model["preprocessor"].fit(X_train_full)
X_train_transformed = model["preprocessor"].transform(X_train)
model["model"].fit(X_train_transformed, y_train)

# Evaluate model on validation set
scorer(model, X_valid, y_valid)

0.13568553522237722

In [33]:
# Perform a grid search cross-validation
# Define the parameter grid
param_grid = {
    "iterations": [1000, 2000],
    "max_depth": [6, 8, 10],
}

# Build a grid search model
model_gs = GridSearchCV(model["model"], param_grid=param_grid, cv=3, scoring=scorer)

# Fit model on the full training dataset
X_train_full_transformed = model["preprocessor"].fit_transform(X_train_full)
model_gs.fit(X_train_full_transformed, y_train_full)

In [34]:
# Get best parameters found from grid search
best_params = model_gs.best_estimator_.get_params()
print(f"Best parameters found: {best_params}")

# Instantiate tuned model
model_tuned = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", CatBoostRegressor(**best_params))
])

# Fit tuned model on training set
model_tuned["preprocessor"].fit(X_train_full)
X_train_transformed = model_tuned["preprocessor"].transform(X_train)
model_tuned["model"].fit(X_train_transformed, y_train)

# Evaluate model on validation set
scorer(model_tuned, X_valid, y_valid)

Best parameters found: {'loss_function': 'RMSE', 'random_seed': 42, 'verbose': 0, 'iterations': 1000, 'max_depth': 10}


0.14238425349394954

In [35]:
# Results from baseline model are better, so we will predict on X_test with that

# Refit tuned model on the full training dataset
model["preprocessor"].fit(df)
X_train_full_transformed = model["preprocessor"].transform(X_train_full)
model["model"].fit(X_train_full_transformed, y_train_full)

# Make predictions on the test set
predictions = model.predict(X_test)
predictions

array([123031.92183869, 158671.78677668, 190220.40391914, ...,
       162076.24675934, 116344.24142935, 216564.73213192])

In [36]:
# Save model
from joblib import dump

model_folder_name = "../models/"
model_file_name = "simple_catboost_model.csv"
dump(model, model_folder_name + model_file_name)

['../models/simple_catboost_model.csv']

In [37]:
# Create submission dataframe
submission = pd.DataFrame({"Id": X_test.index, "SalePrice": predictions})

# Export submission to a csv
submission_folder_name = "../submissions/"
submission_file_name = "simple_catboost_submission.csv"
submission.to_csv(submission_folder_name + submission_file_name, index=False)