# 02 Modelling (Encodings) 

In this notebook, we test different city encodings including target encoding, ordinal encoding and language model encodings.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
from sklearn.decomposition import PCA

In [5]:
# Load the three data splits; the first CSV column becomes the row index.
# NOTE(review): only the training file lives under data/target_encoding/ --
# confirm the "_2" test/validation files carry a compatible city encoding.
split_paths = (
    "data/target_encoding/df_train_encoded_target.csv",
    "data/df_test_2.csv",
    "data/df_val_2.csv",
)
df_train, df_test, df_val = (pd.read_csv(path, index_col=0) for path in split_paths)

In [6]:
# Separate the regression target from the features. "id" is removed together
# with the target so that it is not passed to the model.
target_column = "price"
non_feature_columns = [target_column, "id"]

X_train, y_train = df_train.drop(columns=non_feature_columns), df_train[target_column]
X_val, y_val = df_val.drop(columns=non_feature_columns), df_val[target_column]
X_test, y_test = df_test.drop(columns=non_feature_columns), df_test[target_column]

In [7]:
# Custom evaluation metrics (reported alongside the scikit-learn ones)
def rmse(y_true, y_pred):
    """Root mean squared error between predictions and ground truth.

    Args:
        y_true: Ground-truth values (NumPy array or pandas Series).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The square root of the mean squared residual.
    """
    residuals = y_pred - y_true
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each term is ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``.

    Args:
        y_true: Ground-truth values (NumPy array or pandas Series).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of the per-sample symmetric percentage errors. Samples where
        both the true and the predicted value are zero contribute 0 (a perfect
        prediction) instead of turning the whole mean into NaN via 0/0.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    # The scale is zero only when both values are zero, in which case the
    # error is zero too; dividing by 1 there makes the term exactly 0.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.mean(200 * abs_error / safe_scale)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, expressed as a fraction.

    Args:
        y_true: Ground-truth values (NumPy array or pandas Series).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The square root of the mean squared relative error, where each error
        is taken relative to ``y_true + 1e-6``.
    """
    epsilon = 1e-6  # keeps the denominator non-zero when y_true is exactly 0
    relative_error = (y_true - y_pred) / (y_true + epsilon)
    return np.sqrt(np.mean(np.square(relative_error)))

### Use transformer embeddings for city description

#### Fine-tuned DistilBERT

In [8]:
# Per-city encoding table; the first CSV column is used as the row index.
finetune_encoding_path = "data/finetune_encoding.csv"
df_finetune_encoding = pd.read_csv(finetune_encoding_path, index_col=0)

In [9]:
# Remove the columns that must not reach the model. What remains is the `city`
# key plus (presumably) the embedding dimensions -- TODO confirm.
# errors="ignore" keeps this cell idempotent: because the frame is reassigned
# to its own name, re-running the cell would otherwise raise a KeyError once
# the columns are already gone.
df_finetune_encoding = df_finetune_encoding.drop(
    columns=["description", "county", "price"],
    errors="ignore",
)

In [10]:
# Full dataset; used below to look up the `city` value for each split's rows.
df_all_path = "data/df_all.csv"
df_all = pd.read_csv(df_all_path, index_col=0)

In [11]:
# Add the `city` value from df_all to every split as `city_desc`, looked up by
# the split's row index; it is the join key for the embedding table.
for features, split in ((X_train, df_train), (X_val, df_val), (X_test, df_test)):
    features["city_desc"] = df_all.loc[split.index, "city"]

In [12]:
# Attach the per-city encoding columns to every split via a left join on the
# city name. Both sides have a `city` column, so pandas suffixes them as
# `city_x` (from the feature frame) and `city_y` (from the encoding table).
# The merge returns a fresh 0..n-1 index, so features and targets are matched
# by position from here on. validate="many_to_one" makes pandas raise if a
# city appears more than once in the encoding table, which would otherwise
# silently duplicate feature rows and misalign them with y_train / y_val / y_test.
city_merge_kwargs = dict(
    how="left",
    left_on="city_desc",
    right_on="city",
    validate="many_to_one",
)
X_train_distill = X_train.merge(df_finetune_encoding, **city_merge_kwargs)
X_val_distill = X_val.merge(df_finetune_encoding, **city_merge_kwargs)
X_test_distill = X_test.merge(df_finetune_encoding, **city_merge_kwargs)

In [13]:
# Remove the two join-key columns (`city_desc` and the encoding table's
# `city_y`); `city_x` from the feature frame is kept.
# errors="ignore" keeps this cell idempotent: each frame is reassigned to its
# own name, so a second run would otherwise fail with a KeyError.
merge_key_columns = ["city_desc", "city_y"]
X_train_distill = X_train_distill.drop(columns=merge_key_columns, errors="ignore")
X_test_distill = X_test_distill.drop(columns=merge_key_columns, errors="ignore")
X_val_distill = X_val_distill.drop(columns=merge_key_columns, errors="ignore")

In [14]:
# XGBoost regressor trained on the merged features (including `city_x`).
parameters = dict(
    objective="reg:squarederror",
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_distill, y_train)
y_hat = model.predict(X_val_distill)

# Validation metrics: scikit-learn metrics first, then the custom ones
# defined in this notebook.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for metric_name, metric_fn in validation_metrics:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39660029912.62561
MAE: 59758.32820244795
MAPE: 27.824942871981506
R2: 0.8103508449225142
RMSE: 199148.26113382363
SMAPE: 15.532924894071122
RMSPE: 1880.848403449926


In [15]:
# Same XGBoost setup, but with the `city_x` column removed from the features.
parameters = dict(
    objective="reg:squarederror",
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

train_features = X_train_distill.drop(columns="city_x")
val_features = X_val_distill.drop(columns="city_x")

model = XGBRegressor(**parameters).fit(train_features, y_train)
y_hat = model.predict(val_features)

# Validation metrics: scikit-learn metrics first, then the custom ones
# defined in this notebook.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for metric_name, metric_fn in validation_metrics:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39626861341.84901
MAE: 59825.11948297643
MAPE: 26.676661351653383
R2: 0.8105094527560627
RMSE: 199064.96764084083
SMAPE: 15.603688051883815
RMSPE: 1665.8716097177432


### PCA

#### Fine-tuned DistilBERT

In [16]:
# Three-component PCA with a fixed random_state.
pca_settings = {"n_components": 3, "random_state": 56}
pca = PCA(**pca_settings)

In [17]:
# Fit the PCA on every column except the `city` key and keep the projected
# components. The new frame gets default integer column names and a fresh
# 0..n-1 index.
encoding_values = df_finetune_encoding.drop(columns="city")
pca_components = pca.fit_transform(encoding_values)
df_distill_pca = pd.DataFrame(pca_components)

In [18]:
# Re-attach the city key to the PCA components. `df_distill_pca` was built
# from a bare array and therefore has a 0..n-1 index, while
# `df_finetune_encoding` keeps the index read from its CSV. Assigning the
# Series directly would align on index labels and could mismatch cities or
# produce NaN; the rows are in the same order, so assign by position instead.
df_distill_pca["city"] = df_finetune_encoding["city"].to_numpy()

In [19]:
# (Re)set the `city_desc` join key on every split from df_all's `city` column,
# looked up by the split's row index.
for features, split in ((X_train, df_train), (X_val, df_val), (X_test, df_test)):
    features["city_desc"] = df_all.loc[split.index, "city"]

In [20]:
# Attach the PCA components of each row's city via a left join on the city
# name. Both sides have a `city` column, so pandas suffixes them as `city_x`
# (from the feature frame) and `city_y` (from the PCA table). The merge returns
# a fresh 0..n-1 index, so features and targets are matched by position from
# here on. validate="many_to_one" makes pandas raise if a city appears more
# than once in the PCA table, which would otherwise silently duplicate feature
# rows and misalign them with y_train / y_val / y_test.
city_merge_kwargs = dict(
    how="left",
    left_on="city_desc",
    right_on="city",
    validate="many_to_one",
)
X_train_distill_pca = X_train.merge(df_distill_pca, **city_merge_kwargs)
X_val_distill_pca = X_val.merge(df_distill_pca, **city_merge_kwargs)
X_test_distill_pca = X_test.merge(df_distill_pca, **city_merge_kwargs)

In [21]:
# Remove the two join-key columns (`city_desc` and the PCA table's `city_y`);
# `city_x` from the feature frame is kept.
# errors="ignore" keeps this cell idempotent: each frame is reassigned to its
# own name, so a second run would otherwise fail with a KeyError.
merge_key_columns = ["city_desc", "city_y"]
X_train_distill_pca = X_train_distill_pca.drop(columns=merge_key_columns, errors="ignore")
X_test_distill_pca = X_test_distill_pca.drop(columns=merge_key_columns, errors="ignore")
X_val_distill_pca = X_val_distill_pca.drop(columns=merge_key_columns, errors="ignore")

In [22]:
# XGBoost regressor trained on the features plus the PCA components
# (including `city_x`).
parameters = dict(
    objective="reg:squarederror",
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

model = XGBRegressor(**parameters).fit(X_train_distill_pca, y_train)
y_hat = model.predict(X_val_distill_pca)

# Validation metrics: scikit-learn metrics first, then the custom ones
# defined in this notebook.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for metric_name, metric_fn in validation_metrics:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39652496302.67639
MAE: 59925.448281221914
MAPE: 27.649414421456086
R2: 0.8103868696750094
RMSE: 199129.34565923826
SMAPE: 15.589927156941393
RMSPE: 1929.5510105363237


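- Best performance so far (score: 0.7856264868872997; the metric is not stated and the value does not match the R2 printed above — TODO confirm which run it refers to) was obtained with:
- n_estimators: 350, learning_rate: 0.1, max_depth: 10, PCA components: 3, random_state: 56, while still keeping the one-hot encoding of city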

In [23]:
# Same XGBoost setup on the PCA features, but with the `city_x` column removed.
parameters = dict(
    objective="reg:squarederror",
    n_estimators=350,    # number of boosting rounds
    learning_rate=0.1,   # step size shrinkage
    max_depth=10,        # maximum depth of a tree
)

train_features = X_train_distill_pca.drop(columns="city_x")
val_features = X_val_distill_pca.drop(columns="city_x")

model = XGBRegressor(**parameters).fit(train_features, y_train)
y_hat = model.predict(val_features)

# Validation metrics: scikit-learn metrics first, then the custom ones
# defined in this notebook.
validation_metrics = (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("MAPE", mean_absolute_percentage_error),
    ("R2", r2_score),
    ("RMSE", rmse),
    ("SMAPE", smape),
    ("RMSPE", rmspe),
)
for metric_name, metric_fn in validation_metrics:
    print(f"{metric_name}: {metric_fn(y_true=y_val, y_pred=y_hat)}")

MSE: 39289647319.524734
MAE: 59837.555903141205
MAPE: 27.456369447054737
R2: 0.8121219667797528
RMSE: 198216.1631137197
SMAPE: 15.620485226565794
RMSPE: 1832.84401677657
