### General imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# algorithms
from sklearn.model_selection import RandomizedSearchCV
from sklearn import dummy
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# evaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy.stats import uniform, randint

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure  







### Import Catboost

In [None]:
try:
    from catboost import CatBoostRegressor

except Exception as e:
    print('importing XGBoost and LightGBM instead')

### Import XGBoost and LightGBM

In [3]:
try:
    import lightgbm as lgb
    import xgboost as xgb

except Exception as e:
    print('imported Catboost instead')

imported Catboost instead


Note: I had to split the imports between the models this way because I could manage to download all the models in the same environment.

# Data Preparation (both train & test set)

## Downloading and general checking

In [None]:
# Load the CSV file for the training dataset
data_train = pd.read_csv('train.csv')
df_train = pd.DataFrame(data_train)

df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
# Check if there are any missing values in the DataFrame
has_missing = data_train.isnull().values.any()
print("Are there any missing values?", has_missing)

#df_train.isna().sum()

# missing_summary = pd.DataFrame({
#     'missing_count': df_train.isnull().sum(),
#     'missing_percentage': df_train.isnull().mean() * 100
# })
# print(missing_summary)

# Only show columns with missing values
df_train.isnull().sum()[df_train.isnull().sum() > 0]


Are there any missing values? True


LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

## Removal

In [7]:
df_train = df_train.drop('Id', axis=1)
df_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


### Removing non meaningful Nas

From the missing columns, NaN has meaning for some of them. The ones with no meaning are the ones I will remove. These are:

LotFrontage

Electrical

In [8]:
df_train = df_train.dropna(subset=['LotFrontage', 'Electrical'])

### Remove Outliers for random variables

Random variable means attributes can take either discrete or continuous random variables, and the random variable corresponds to a certain unit. 1,2,3... does not represent a scale. Furthermore, for simpler calculations, I chose attributes where 0 cannot mean non-existance. For example, Some houses 'basement squarefeet' could have 0 because the basement does not exist. Since I calculated the outliers with IQR, this could make the results wrong.

In [9]:
outlier_cols = ['YearBuilt',
                   '1stFlrSF', 'FullBath',
                   'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
                    'MoSold', 'YrSold' ]

In [None]:
def remove_outliers_iqr(df, columns):
    df_clean = df.copy()
    
    for col in columns:
        # Calculate Q1, Q3, and IQR
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        # Define outlier bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Create mask for non-outliers
        mask = (df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)
        df_clean = df_clean[mask]
        
        print(f"{col}: removed {len(df) - len(df_clean)} outliers")
    
    return df_clean

# Remove outliers
df_train_no_outliers = remove_outliers_iqr(df_train, outlier_cols)

# Print shape comparison
print("\nShape comparison:")
print(f"Original: {df_train.shape}")
print(f"After outlier removal: {df_train_no_outliers.shape}")

YearBuilt: removed 0 outliers
1stFlrSF: removed 13 outliers
FullBath: removed 13 outliers
BedroomAbvGr: removed 40 outliers
KitchenAbvGr: removed 91 outliers
TotRmsAbvGrd: removed 103 outliers
MoSold: removed 103 outliers
YrSold: removed 103 outliers

Shape comparison:
Original: (1200, 80)
After outlier removal: (1097, 80)


In [11]:
df_train = df_train_no_outliers

df_train.shape

(1097, 80)

## Changing all columns to numerical quantities

In [None]:
#Change columns with 'Yes'/'No' to 1/0
yes_no_cols = [col for col in df_train.columns if set(df_train[col].unique()) == {'Yes', 'No'}]
for col in yes_no_cols:
    df_train[col] = df_train[col].map({'Yes': 1, 'No': 0})

# One-hot encode categorical variables


# Identify categorical and object columns
cat_cols = df_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Categorical columns to encode: {cat_cols}")
print(f"Number of categorical columns: {len(cat_cols)}")

# Initialize OneHotEncoder
# sparse_output=False returns a dense array instead of sparse matrix
# handle_unknown='ignore' will ignore unknown categories during transform
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Fit and transform the categorical columns
encoded_array = encoder.fit_transform(df_train[cat_cols])

# Get feature names for the encoded columns
encoded_feature_names = encoder.get_feature_names_out(cat_cols)

# Create a dataframe with the encoded features
df_encoded = pd.DataFrame(
    encoded_array, 
    columns=encoded_feature_names,
    index=df_train.index
)

# Drop original categorical columns and concatenate encoded columns
df_train_encoded = df_train.drop(columns=cat_cols)
df_train_encoded = pd.concat([df_train_encoded, df_encoded], axis=1)

# Convert boolean columns to integers (if any remain)
bool_cols = df_train_encoded.select_dtypes(include='bool').columns
if len(bool_cols) > 0:
    df_train_encoded[bool_cols] = df_train_encoded[bool_cols].astype(int)

print(f"\nOriginal shape: {df_train.shape}")
print(f"Encoded shape: {df_train_encoded.shape}")
print(f"\nFirst few rows:")
df_train_encoded.head()




Categorical columns to encode: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Number of categorical columns: 43

Original shape: (1097, 80)
Encoded shape: (1097, 290)

First few rows:


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
df_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1097 entries, 0 to 1459
Columns: 290 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(256), int64(34)
memory usage: 2.4 MB


## Applying the same method to the test data

In [31]:
data_test = pd.read_csv('test.csv')
df_test = pd.DataFrame(data_test)

df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [32]:
#df_test = df_test.drop('Id', axis=1)
#df_test = df_test.dropna(subset=['LotFrontage', 'Electrical'])

# Apply Yes/No mapping to test data
for col in yes_no_cols:
    df_test[col] = df_test[col].map({'Yes': 1, 'No': 0})

# Transform test data using the fitted encoder
encoded_test_array = encoder.transform(df_test[cat_cols])

# Create DataFrame with encoded features
df_encoded = pd.DataFrame(
    encoded_test_array,
    columns=encoded_feature_names,
    index=df_test.index
)

# Drop original categorical columns and concatenate encoded columns
df_test_encoded = df_test.drop(columns=cat_cols)
df_test_encoded = pd.concat([df_test_encoded, df_encoded], axis=1)

# Convert boolean columns to integers (if any)
if len(bool_cols) > 0:
    df_test_encoded[bool_cols] = df_test_encoded[bool_cols].astype(int)

df_test_for_pred = df_test_encoded.drop('Id', axis=1)

print(f"Original test shape: {df_test.shape}")
print(f"Encoded test shape: {df_test_for_pred.shape}")


Original test shape: (1459, 80)
Encoded test shape: (1459, 289)


# Splitting and fitting the data

In [16]:
# setting up the data and target variables
data = df_train_encoded.drop('SalePrice', axis=1)
target = df_train_encoded['SalePrice']

# Splitting the training data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 987 entries, 1217 to 1136
Columns: 289 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(256), int64(33)
memory usage: 2.2 MB


## Baseline Regressor

In [18]:
dum = dummy.DummyRegressor()
dum.fit(X_train, y_train)
y_pred = dum.predict(X_test)
y_pred


array([179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 179394.91489362, 179394.91489362, 179394.91489362,
       179394.91489362, 1

In [19]:
# Store the actual data and the predictions in a DataFrame for later
price_test = pd.DataFrame()
price_test["Actual_price"] = y_test
price_test["Dummy_price"] = y_pred

price_test

Unnamed: 0,Actual_price,Dummy_price
60,158000,179394.914894
741,142000,179394.914894
77,127000,179394.914894
829,147400,179394.914894
632,82500,179394.914894
...,...,...
1191,174000,179394.914894
510,164900,179394.914894
704,213000,179394.914894
374,219500,179394.914894


In [20]:
# Baseline: Dummy Regressor
print("\n" + "="*50)
print("DUMMY REGRESSOR (Baseline)")
print("="*50)
dum = dummy.DummyRegressor()
dum.fit(X_train, y_train)
y_pred_dummy = dum.predict(X_test)

dummy_mse = metrics.mean_squared_error(y_test, y_pred_dummy)
dummy_rmse = np.sqrt(dummy_mse)
dummy_r2 = metrics.r2_score(y_test, y_pred_dummy)

print(f"Dummy MSE:  {dummy_mse:,.2f}")
print(f"Dummy RMSE: {dummy_rmse:,.2f}")
print(f"Dummy R²:   {dummy_r2:.4f}")


DUMMY REGRESSOR (Baseline)
Dummy MSE:  3,942,264,023.56
Dummy RMSE: 62,787.45
Dummy R²:   -0.0244


## XGBoost

In [21]:
try:
    print("\n" + "="*50)
    print("XGBOOST REGRESSOR")
    print("="*50)

    # initialize the model
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)  
    # fit the model
    model.fit(X_train, y_train)
    # make predictions
    y_pred = model.predict(X_test)

    # Calculate regression metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse:,.2f}")
    print(f"Root Mean Squared Error: {rmse:,.2f}")
    print(f"Mean Absolute Error: {mae:,.2f}")
    print(f"R² Score: {r2:,.2f}")

except Exception as e:
    print("Continuing with Catboost")


XGBOOST REGRESSOR
Continuing with Catboost


In [22]:
try:
    print("\n" + "="*50)
    print("LIGHTGBM REGRESSOR")
    print("="*50)

    # Initialize LightGBM
    lgbm = lgb.LGBMRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        verbose=-1
    )

    # Fit the model
    lgbm.fit(X_train, y_train)
    y_pred_lgbm = lgbm.predict(X_test)

    # Calculate metrics
    lgbm_mse = metrics.mean_squared_error(y_test, y_pred_lgbm)
    lgbm_rmse = np.sqrt(lgbm_mse)
    lgbm_r2 = metrics.r2_score(y_test, y_pred_lgbm)
    lgbm_mae = metrics.mean_absolute_error(y_test, y_pred_lgbm)

    # Print results
    print(f"LGBM MSE:  {lgbm_mse:,.2f}")
    print(f"LGBM RMSE: {lgbm_rmse:,.2f}")
    print(f"LGBM MAE:  {lgbm_mae:,.2f}")
    print(f"LGBM R²:   {lgbm_r2:.4f}")

except Exception as e:
    print("Continuing with Catboost")



LIGHTGBM REGRESSOR
Continuing with Catboost


In [23]:
try:
    print("\n" + "="*50)
    print("CATBOOST REGRESSOR")
    print("="*50)
    
    # Initialize CatBoost
    cat = CatBoostRegressor(
        iterations=100,
        learning_rate=0.1,
        depth=5,
        random_seed=42,
        verbose=False
    )

    # Fit the model
    cat.fit(X_train, y_train)
    y_pred_cat = cat.predict(X_test)

    # Calculate metrics
    cat_mse = metrics.mean_squared_error(y_test, y_pred_cat)
    cat_rmse = np.sqrt(cat_mse)
    cat_r2 = metrics.r2_score(y_test, y_pred_cat)
    cat_mae = metrics.mean_absolute_error(y_test, y_pred_cat)

    # Print results
    print(f"CAT MSE:  {cat_mse:,.2f}")
    print(f"CAT RMSE: {cat_rmse:,.2f}")
    print(f"CAT MAE:  {cat_mae:,.2f}")
    print(f"CAT R²:   {cat_r2:.4f}")

except Exception as e:
    print("Went with XGBoost and LightGBM instead")


CATBOOST REGRESSOR
CAT MSE:  347,693,061.10
CAT RMSE: 18,646.53
CAT MAE:  12,244.29
CAT R²:   0.9097


## Hyperparameter tuning

### Catboost

In [None]:
# Define parameter grid
param_dist = {
    'iterations': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'depth': randint(3, 10),
    'l2_leaf_reg': uniform(1, 10),
    'bagging_temperature': uniform(0, 1),
    'random_strength': uniform(1, 20)
}


# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=cat,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get best parameters and score
print("\n" + "="*50)
print("CATBOOST BEST PARAMETERS")
print("="*50)
print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {-random_search.best_score_:.4f} MSE")

# Train final model with best parameters
best_cat = CatBoostRegressor(**random_search.best_params_, random_seed=42, verbose=False)
best_cat.fit(X_train, y_train)
y_pred = best_cat.predict(X_test)

# Calculate metrics
cat_mse = metrics.mean_squared_error(y_test, y_pred)
cat_rmse = np.sqrt(cat_mse)
cat_r2 = metrics.r2_score(y_test, y_pred)
cat_mae = metrics.mean_absolute_error(y_test, y_pred)

# Print results
print("\n" + "="*50)
print("OPTIMIZED CATBOOST PERFORMANCE")
print("="*50)
print(f"CAT MSE:  {cat_mse:,.2f}")
print(f"CAT RMSE: {cat_rmse:,.2f}")
print(f"CAT MAE:  {cat_mae:,.2f}")
print(f"CAT R²:   {cat_r2:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits

CATBOOST BEST PARAMETERS
Best parameters: {'bagging_temperature': np.float64(0.3854165025399161), 'depth': 4, 'iterations': 664, 'l2_leaf_reg': np.float64(3.3089382562214897), 'learning_rate': np.float64(0.0823076398078035), 'random_strength': np.float64(14.665270376509165)}
Best CV score: 466372718.7792 MSE

OPTIMIZED CATBOOST PERFORMANCE
CAT MSE:  247,499,853.84
CAT RMSE: 15,732.13
CAT MAE:  10,440.87
CAT R²:   0.9357


### XGBoost

In [None]:
# Define parameter grid
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0, 1),
    'min_child_weight': randint(1, 7),
    'gamma': uniform(0, 0.5),
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror'),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get best parameters and score
print("\n" + "="*50)
print("XGBOOST BEST PARAMETERS")
print("="*50)
print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {-random_search.best_score_:.4f} MSE")

# Train final model with best parameters
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    **random_search.best_params_,
    random_state=42
)
best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)

# Calculate metrics
xgb_mse = metrics.mean_squared_error(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = metrics.r2_score(y_test, y_pred)
xgb_mae = metrics.mean_absolute_error(y_test, y_pred)

# Print results
print("\n" + "="*50)
print("OPTIMIZED XGBOOST PERFORMANCE")
print("="*50)
print(f"XGB MSE:  {xgb_mse:,.2f}")
print(f"XGB RMSE: {xgb_rmse:,.2f}")
print(f"XGB MAE:  {xgb_mae:,.2f}")
print(f"XGB R²:   {xgb_r2:.4f}")

NameError: name 'xgb' is not defined

### LightGBM

In [34]:
# Define parameter grid
param_dist = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'num_leaves': randint(20, 100),
    'min_child_samples': randint(1, 50),
    'subsample': uniform(0,1),
    'colsample_bytree': uniform(0.6, 0.4),
    # 'reg_alpha': uniform(0, 2),
    # 'reg_lambda': uniform(0, 2)
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb.LGBMRegressor(),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get best parameters and score
print("\n" + "="*50)
print("LIGHTGBM BEST PARAMETERS")
print("="*50)
print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {-random_search.best_score_:.4f} MSE")

# Train final model with best parameters
best_lgb = lgb.LGBMRegressor(**random_search.best_params_, random_state=42.fit(X_train, y_train)
y_pred = best_lgb.predict(X_test)

# Calculate metrics
lgb_mse = metrics.mean_squared_error(y_test, y_pred)
lgb_rmse = np.sqrt(lgb_mse)
lgb_r2 = metrics.r2_score(y_test, y_pred)
lgb_mae = metrics.mean_absolute_error(y_test, y_pred)

# Print results
print("\n" + "="*50)
print("OPTIMIZED LIGHTGBM PERFORMANCE")
print("="*50)
print(f"LGB MSE:  {lgb_mse:,.2f}")
print(f"LGB RMSE: {lgb_rmse:,.2f}")
print(f"LGB MAE:  {lgb_mae:,.2f}")
print(f"LGB R²:   {lgb_r2:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2706
[LightGBM] [Info] Number of data points in the train set: 789, number of used features: 127
[LightGBM] [Info] Start training from score 180103.291508
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2723
[LightGBM] [Info] Number of data points in the train set: 789, number 

# Predicting for df_test

In [33]:
predictions = best_cat.predict(df_test_for_pred)


In [34]:
predictions_df = pd.DataFrame({
    'Id': df_test['Id'],  # Get ID column from original test data
    'SalePrice': predictions  # Predictions from the model
})

In [35]:
predictions_df

Unnamed: 0,Id,SalePrice
0,1461,122434.908372
1,1462,152458.926614
2,1463,188914.735959
3,1464,201762.244252
4,1465,182385.695561
...,...,...
1454,2915,81700.458595
1455,2916,81611.152605
1456,2917,173814.627093
1457,2918,120347.507739


In [36]:
predictions_df.to_csv('house_price_predictions_cat.csv', index=False)