<a href="https://colab.research.google.com/github/Aneta521/cactus-repo/blob/br01/Cactus_aprtms_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data load

In [None]:
import pandas as pd

url_train = "https://raw.githubusercontent.com/valkova-k/cactus-repo/main/assignment09/appartments_train.csv"
url_test  = "https://raw.githubusercontent.com/valkova-k/cactus-repo/main/assignment09/appartments_test.csv"

train = pd.read_csv(url_train)
test  = pd.read_csv(url_test)

# Úprava pražských čtvrtí

In [None]:
import unicodedata

DASH = r"[\-\u2012-\u2015\u2212]"
def normalize_addr(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = re.sub(DASH, "-", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_accents(text):
    if isinstance(text, str):
        return ''.join(
            c for c in unicodedata.normalize('NFKD', text)
            if not unicodedata.combining(c)
        )
    return text

# 1) SPECIFICKÝ edge-case: "Praha - Praha 10"
p_praha_dash_praha_num = re.compile(r"Praha\s*-\s*Praha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)

# 2) standard: "Praha 10 - Žižkov" (po pomlčce nesmí začínat "Praha")
p_num_bor = re.compile(r"Praha\s*(?P<num>\d{1,2})\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)

# 3) "Praha - Žižkov" (bez čísla)
p_bor_only = re.compile(r"Praha\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)

# 4) fallback na samotné číslo: "... Praha 10 ..."
p_num_only = re.compile(r"\bPraha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)

def extract_from_address(addr: str):
    s = normalize_addr(addr)
    if not s:
        return (np.nan, np.nan)

    # A) "Praha - Praha 10"  -> num=10, borough = NaN
    m = p_praha_dash_praha_num.search(s)
    if m:
        return (int(m.group("num")), np.nan)

    # B) "Praha 10 - Žižkov" -> num=10, borough="Žižkov"
    m = p_num_bor.search(s)
    if m:
        num = int(m.group("num"))
        bor = m.group("bor").strip()
        return (num, bor if bor else np.nan)

    # C) "Praha - Žižkov" -> num=NaN, borough="Žižkov"
    m = p_bor_only.search(s)
    if m:
        bor = m.group("bor").strip()
        return (np.nan, bor if bor else np.nan)

    # D) fallback: "... Praha 10 ..." -> num=10, borough=NaN
    m = p_num_only.search(s)
    if m:
        return (int(m.group("num")), np.nan)

    # E) poslední fallback: část za poslední pomlčkou jako "čtvrť"
    m = re.search(r"-\s*([^,]+)$", s)
    if m:
        return (np.nan, m.group(1).strip())

    return (np.nan, np.nan)

# --- jen TRAIN teď ---
train[["praha_num", "borough"]] = train["address"].apply(
    lambda s: pd.Series(extract_from_address(s))
)

# Normalizace názvu čtvrti (bez diakritiky a lowercase) – užitečné pro OneHot
train["borough_norm"] = train["borough"].apply(strip_accents).str.lower()

# Rychlá kontrola výstupu:
print(train[["address", "praha_num", "borough_norm"]].head(50))


                                    address  praha_num borough_norm
0                  Tavolníková, Praha - Krč        NaN          krc
1               Pitterova, Praha 3 - Žižkov        3.0       zizkov
2              Perucká, Praha 2 - Vinohrady        2.0    vinohrady
3                 Brožíkova, Praha - Košíře        NaN       kosire
4               Hnězdenská, Praha 8 - Troja        8.0        troja
5              Patočkova, Praha 6 - Břevnov        6.0      brevnov
6               Okřínecká, Praha 9 - Prosek        9.0       prosek
7    Pod stolovou horou, Praha 5 - Jinonice        5.0     jinonice
8         Za invalidovnou, Praha 8 - Karlín        8.0       karlin
9               Toušeňská, Praha 4 - Lhotka        4.0       lhotka
10            Petrská, Praha 1 - Nové Město        1.0   nove mesto
11              Sokolovská, Praha 8 - Libeň        8.0        liben
12          Pod Krocínkou, Praha - Vysočany        NaN     vysocany
13         Nad vinohradem, Praha 4 - Braník     

In [None]:
# slovnik na cisla Prahy

#pár velmi bezpečných ručních přiřazení (volitelné; doplň jen to, co je 100% jasné)
manual_map = {
    "zizkov": 3,
    "karlin": 8,
    "smichov": 5,
    "dejvice": 6,
    "holesovice": 7,
    "vrsovice": 10,
    "vysocany": 9,
    "kobylisy": 8,
    "bohnice": 8,
    "prosek": 9,
    "braník": 4,  # pozor na čárku vs bez čárky – viz normalize níže, jinak používej 'branik'
    "branik": 4,
    "krc": 4,
    "podoli": 4,
    "modrany": 12,
    "stodulky": 13,
    "chodov": 11,
    "haje": 11,
    "letnany": 18,
    "kbely": 19,
    "horni pocernice": 20,
    "uhrineves": 22,
    "radotin": 16,
    "barrandov": 5,
    "jinonice": 5,
    "kosire": 5,
    "motol": 5,
    "vokovice": 6,
    "veleslavin": 6,
    "brevnov": 6,
    "suchdol": 6,
    "nebusice": 6,
    "troja": 7,
}

# 3) datově odhadneme mapování: pro každou čtvrť (borough_norm) vezmi nejčastější praha_num
#    + spočítáme míru shody, abychom věděli, jestli je čtvrť jednoznačná
grp = (
    train.loc[train["borough_norm"].notna() & train["praha_num"].notna(), ["borough_norm", "praha_num"]]
    .groupby("borough_norm")["praha_num"]
)

mode_map = grp.agg(lambda s: s.mode().iloc[0]).to_dict()
counts = grp.value_counts().rename("count").reset_index()  # (borough_norm, praha_num, count)
totals = counts.groupby("borough_norm")["count"].sum().rename("total")
top = (
    counts.sort_values(["borough_norm", "count"], ascending=[True, False])
    .groupby("borough_norm")
    .head(1)
    .merge(totals, on="borough_norm")
)
top["confidence"] = top["count"] / top["total"]

# jen ty, kde je shoda dost vysoká (např. ≥ 0.85)
auto_map = {row["borough_norm"]: int(row["praha_num"]) for _, row in top.iterrows() if row["confidence"] >= 0.85}

# 4) finální slovník: ruční mapování má prioritu, pak datové
borough_to_praha = {**auto_map, **manual_map}  # manual overrides

# 5) imputace praha_num z borough_norm podle slovníku (JEN TRAIN teď)
mask_na = train["praha_num"].isna() & train["borough_norm"].notna()
train.loc[mask_na, "praha_num_imputed"] = train.loc[mask_na, "borough_norm"].map(borough_to_praha)
# kde se podařilo doplnit, přeneseme do praha_num
fill_mask = train["praha_num"].isna() & train["praha_num_imputed"].notna()
train.loc[fill_mask, "praha_num"] = train.loc[fill_mask, "praha_num_imputed"].astype("Int64")
train = train.drop(columns=["praha_num_imputed"])

# 6) report: kolik se podařilo doplnit a co zůstalo sporné
imputed_n = fill_mask.sum()
remaining_na = train["praha_num"].isna().sum()
print(f"Imputováno praha_num z borough: {imputed_n} řádků")
print(f"Zbývající praha_num NA: {remaining_na}")

# čtvrti, které byly nejednoznačné (confidence < 0.85) – můžeš se rozhodnout je přidat ručně
ambiguous = top[top["confidence"] < 0.85].sort_values("confidence")
ambiguous[["borough_norm", "praha_num", "count", "total", "confidence"]].head(20)

Imputováno praha_num z borough: 430 řádků
Zbývající praha_num NA: 206


Unnamed: 0,borough_norm,praha_num,count,total,confidence
50,nove mesto,1.0,107,175,0.611429
84,vinohrady,2.0,110,172,0.639535
4,bubenec,6.0,32,46,0.695652
90,zabehlice,10.0,91,127,0.716535
73,strizkov,9.0,42,57,0.736842
39,liben,8.0,109,145,0.751724


# Pipeline na zpracování dat

In [None]:
unique_words = [
   "originální", "originálně", "originálními", "originálním", "originálních",
   "zajímavá", "zajímavého", "zajímavé", "zajímavém",
   "zajímavým", "zajímavých", "zajímavě",
   "zajímavou", "zajímavý", "atypický", "atypicky", "neobvyklý",
   "nevšední", "unikátní", "unikátním", "unikátními", "šikovný"
 ]

In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold


# ===============================================================
# ----------- 1) CUSTOM TRANSFORMERS WITH YOUR LOGIC ----------
# ===============================================================

class BasicImputer(BaseEstimator, TransformerMixin):
    """
    garden_area, balcony_area, cellar_area → fillna(0)
    elevator → Yes/No + fillna, cast
    parking → fillna(0)
    sentinel imputation for POI columns
    """
    def __init__(self, nearest_cols, impute_val=9999):
        self.nearest_cols = nearest_cols
        self.impute_val = impute_val

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Simple 0-fill
        for col in ["garden_area", "balcony_area", "cellar_area"]:
            X[col] = X[col].fillna(0)

        # Elevator and parking
        X["elevator"] = (
            X["elevator"]
            .replace({"Yes": 1, "No": 0})
            .fillna(0)
            .astype(int)
        )
        X["parking"] = X["parking"].fillna(0).astype(int)

        # POI sentinel imputation
        for c in self.nearest_cols:
            X[c] = X[c].fillna(self.impute_val)
            X[c + "_exists"] = (X[c] != self.impute_val).astype(int) # FIX: Changed self.impute_int to self.impute_val

        return X


class FloorsTransformer(BaseEstimator, TransformerMixin):
    """
    Fix total_floors; add is_ground, is_topfloor, floor_ratio
    """
    def fit(self, X, y=None):
        self.med = X["total_floors"].median()
        return self

    def transform(self, X):
        X = X.copy()
        X["total_floors"] = X["total_floors"].fillna(self.med)
        X["total_floors"] = X[["total_floors", "floor"]].max(axis=1)

        X["is_ground"] = (X["floor"] <= 1).astype(int)
        X["is_topfloor"] = (X["floor"] == X["total_floors"]).astype(int)
        X["floor_ratio"] = X["floor"] / X["total_floors"]

        return X


class RoomsLayoutTransformer(BaseEstimator, TransformerMixin):
    """
    Extract number of rooms + is_kk
    """
    def fit(self, X, y=None):
        # median for NA rooms
        rooms = X["layout"].map(self.extract_rooms)
        self.med = rooms.median()
        return self

    def extract_rooms(self, x):
        m = re.match(r"\s*(\d+)\s*\+", str(x))
        return int(m.group(1)) if m else np.nan

    def transform(self, X):
        X = X.copy()
        X["rooms"] = X["layout"].map(self.extract_rooms)
        X["rooms"] = X["rooms"].fillna(self.med).astype(int)
        X["is_kk"] = X["layout"].str.contains("kk", case=False, na=False).astype(int)
        return X


class DateTransformer(BaseEstimator, TransformerMixin):
    """
    Convert first_seen, last_seen → datetime, compute listing_days
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["first_seen_dt"] = pd.to_datetime(X["first_seen"], errors="coerce")
        X["last_seen_dt"] = pd.to_datetime(X["last_seen"], errors="coerce")
        X["listing_days"] = (X["last_seen_dt"] - X["first_seen_dt"]).dt.days
        X["listing_days"] = X["listing_days"].clip(lower=0).fillna(0)
        return X


# helper functions for borough extraction
DASH = r"[\-\u2012-\u2015\u2212]"
p_praha_dash_praha_num = re.compile(r"Praha\s*-\s*Praha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)
p_num_bor = re.compile(r"Praha\s*(?P<num>\d{1,2})\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)
p_bor_only = re.compile(r"Praha\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)
p_num_only = re.compile(r"\bPraha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)

def normalize_addr(s: str) -> str:
    if not isinstance(s, str): return ""
    s = re.sub(DASH, "-", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_accents(text):
    if isinstance(text, str):
        return ''.join(c for c in unicodedata.normalize('NFKD', text)
                       if not unicodedata.combining(c))
    return text

def extract_from_address(addr: str):
    s = normalize_addr(addr)
    if not s:
        return (np.nan, np.nan)

    m = p_praha_dash_praha_num.search(s)
    if m: return int(m.group("num")), np.nan

    m = p_num_bor.search(s)
    if m: return int(m.group("num")), m.group("bor").strip()

    m = p_bor_only.search(s)
    if m: return np.nan, m.group("bor").strip()

    m = p_num_only.search(s)
    if m: return int(m.group("num")), np.nan

    m = re.search(r"-\s*([^,]+)$", s)
    if m: return np.nan, m.group(1).strip()

    return (np.nan, np.nan)


class BoroughTransformer(BaseEstimator, TransformerMixin):
    """
    Extract praha_num, borough_norm & map missing praha_num using the dictionary.
    """
    def __init__(self, borough_to_praha: dict):
        self.borough_to_praha = borough_to_praha

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        tmp = X["address"].apply(lambda s: pd.Series(extract_from_address(s)))
        X["praha_num"] = tmp.iloc[:, 0]
        X["borough"] = tmp.iloc[:, 1]

        X["borough_norm"] = X["borough"].apply(strip_accents).str.lower()

        mask = X["praha_num"].isna() & X["borough_norm"].notna()
        X.loc[mask, "praha_num"] = X.loc[mask, "borough_norm"].map(self.borough_to_praha)

        return X

class UniqueDescriptionTransformer(BaseEstimator, TransformerMixin):
    """
    Adds a 'unique_in_desc' column based on whether certain unique words are present in the 'text' column.
    """
    def __init__(self, unique_words):
        self.unique_words = unique_words
        self.combined_regex = None

    def fit(self, X, y=None):
        # Pre-compile the regex pattern during fit for efficiency
        # Use a non-capturing group (?:...) to avoid UserWarning about match groups with str.contains
        combined_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in self.unique_words) + r')\b'
        self.combined_regex = re.compile(combined_pattern, re.IGNORECASE)
        return self

    def transform(self, X):
        X = X.copy()
        if self.combined_regex is None:
            raise RuntimeError("Transformer has not been fitted. Call fit() before transform().")

        # Create the new boolean column 'unique_in_desc'
        # It's important to use .astype(str) as 'text' column might have NaNs
        X['unique_in_desc'] = X['text'].astype(str).str.contains(self.combined_regex, na=False)
        return X

# Pipeline na model

In [None]:
def make_full_pipeline(borough_to_praha, unique_words):
    nearest_cols = [
        "poi_doctors_nearest",
        "poi_leisure_time_nearest",
        "poi_school_kindergarten_nearest",
        "poi_transport_nearest",
        "poi_grocery_nearest",
        "poi_restaurant_nearest",
    ]

    preprocessing = Pipeline([
        ("basic", BasicImputer(nearest_cols)),
        ("floors", FloorsTransformer()),
        ("rooms", RoomsLayoutTransformer()),
        ("dates", DateTransformer()),
        ("boroughs", BoroughTransformer(borough_to_praha)),
        ("unique_desc", UniqueDescriptionTransformer(unique_words)), # Add the new transformer
    ])

    # after your custom steps, apply a ColumnTransformer
    numeric_cols = [
        "garden_area", "balcony_area", "cellar_area",
        "elevator", "parking", "is_ground", "is_topfloor",
        "floor_ratio", "rooms", "is_kk", "listing_days",
        "praha_num", "unique_in_desc"
    ] + nearest_cols + [c+"_exists" for c in nearest_cols]

    categorical_cols = ["borough_norm"]

    ct = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ])

    model = RandomForestRegressor(n_estimators=500, random_state=42)

    full_pipe = Pipeline([
        ("prep", preprocessing),
        ("ct", ct),
        ("model", model),
    ])

    return full_pipe

# Finální validace


In [None]:
from sklearn.metrics import make_scorer, mean_absolute_percentage_error

def mape_scorer(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

mape = make_scorer(mape_scorer, greater_is_better=False) # greater_is_better=False because lower MAPE is better

pipeline = make_full_pipeline(borough_to_praha, unique_words)

X = train.drop(columns=["price"])
y = train["price"]

# Perform cross-validation with MAPE
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_mape = cross_val_score(pipeline, X, y, cv=kf, scoring=mape)
cv_mape = -cv_scores_mape # Multiply by -1 because make_scorer returns negative scores for metrics where lower is better

print(f"Cross-validation MAPE scores: {cv_mape}")
print(f"Mean CV MAPE: {np.mean(cv_mape):.4f}")
print(f"Std CV MAPE: {np.std(cv_mape):.4f}")

# Fit the pipeline on the full training data and make predictions on the test set
pipeline.fit(X, y)
preds = pipeline.predict(test)

  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


Cross-validation MAPE scores: [0.16305206 0.16653931 0.16346487 0.17069403 0.17058291]
Mean CV MAPE: 0.1669
Std CV MAPE: 0.0033


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


These scores indicate that, on average, the model's predictions deviate by about 16.70% from the actual values, with a standard deviation of 0.33% across the folds. This suggests a relatively consistent performance across different subsets of the data.

```python
def make_full_pipeline(model_estimator, borough_to_praha, unique_words):
    nearest_cols = [
        "poi_doctors_nearest",
        "poi_leisure_time_nearest",
        "poi_school_kindergarten_nearest",
        "poi_transport_nearest",
        "poi_grocery_nearest",
        "poi_restaurant_nearest",
    ]

    preprocessing = Pipeline([
        ("basic", BasicImputer(nearest_cols)),
        ("floors", FloorsTransformer()),
        ("rooms", RoomsLayoutTransformer()),
        ("dates", DateTransformer()),
        ("boroughs", BoroughTransformer(borough_to_praha)),
        ("unique_desc", UniqueDescriptionTransformer(unique_words)), # Add the new transformer
    ])

    # after your custom steps, apply a ColumnTransformer
    numeric_cols = [
        "garden_area", "balcony_area", "cellar_area",
        "elevator", "parking", "is_ground", "is_topfloor",
        "floor_ratio", "rooms", "is_kk", "listing_days",
        "praha_num", "unique_in_desc"
    ] + nearest_cols + [c+"_exists" for c in nearest_cols]

    categorical_cols = ["borough_norm"]

    ct = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ])

    full_pipe = Pipeline([
        ("prep", preprocessing),
        ("ct", ct),
        ("model", model_estimator),  # Use the model_estimator passed as argument
    ])

    return full_pipe
```

In [None]:
def make_full_pipeline(model_estimator, borough_to_praha, unique_words):
    nearest_cols = [
        "poi_doctors_nearest",
        "poi_leisure_time_nearest",
        "poi_school_kindergarten_nearest",
        "poi_transport_nearest",
        "poi_grocery_nearest",
        "poi_restaurant_nearest",
    ]

    preprocessing = Pipeline([
        ("basic", BasicImputer(nearest_cols)),
        ("floors", FloorsTransformer()),
        ("rooms", RoomsLayoutTransformer()),
        ("dates", DateTransformer()),
        ("boroughs", BoroughTransformer(borough_to_praha)),
        ("unique_desc", UniqueDescriptionTransformer(unique_words)), # Add the new transformer
    ])

    # after your custom steps, apply a ColumnTransformer
    numeric_cols = [
        "garden_area", "balcony_area", "cellar_area",
        "elevator", "parking", "is_ground", "is_topfloor",
        "floor_ratio", "rooms", "is_kk", "listing_days",
        "praha_num", "unique_in_desc"
    ] + nearest_cols + [c+"_exists" for c in nearest_cols]

    categorical_cols = ["borough_norm"]

    ct = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ])

    full_pipe = Pipeline([
        ("prep", preprocessing),
        ("ct", ct),
        ("model", model_estimator), # Use the passed model_estimator
    ])

    return full_pipe


In [None]:
def make_full_pipeline(model_estimator, borough_to_praha, unique_words):
    nearest_cols = [
        "poi_doctors_nearest",
        "poi_leisure_time_nearest",
        "poi_school_kindergarten_nearest",
        "poi_transport_nearest",
        "poi_grocery_nearest",
        "poi_restaurant_nearest",
    ]

    preprocessing = Pipeline([
        ("basic", BasicImputer(nearest_cols)),
        ("floors", FloorsTransformer()),
        ("rooms", RoomsLayoutTransformer()),
        ("dates", DateTransformer()),
        ("boroughs", BoroughTransformer(borough_to_praha)),
        ("unique_desc", UniqueDescriptionTransformer(unique_words)), # Add the new transformer
    ])

    # after your custom steps, apply a ColumnTransformer
    numeric_cols = [
        "garden_area", "balcony_area", "cellar_area",
        "elevator", "parking", "is_ground", "is_topfloor",
        "floor_ratio", "rooms", "is_kk", "listing_days",
        "praha_num", "unique_in_desc"
    ] + nearest_cols + [c+"_exists" for c in nearest_cols]

    categorical_cols = ["borough_norm"]

    ct = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ])

    full_pipe = Pipeline([
        ("prep", preprocessing),
        ("ct", ct),
        ("model", model_estimator), # Use the passed model_estimator
    ])

    return full_pipe


## Select and Prepare New Models


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Instantiate models
# RandomForestRegressor is already used, but we'll re-instantiate for consistency.
model_rf = RandomForestRegressor(random_state=42)
model_gbr = GradientBoostingRegressor(random_state=42)
model_lr = LinearRegression()
model_xgb = XGBRegressor(random_state=42)
model_lgbm = LGBMRegressor(random_state=42)

print("Regression models instantiated successfully.")

Regression models instantiated successfully.


## Train and Evaluate Each Model


In [None]:
models = [
    ("RandomForestRegressor", model_rf),
    ("GradientBoostingRegressor", model_gbr),
    ("LinearRegression", model_lr),
    ("XGBRegressor", model_xgb),
    ("LGBMRegressor", model_lgbm),
]

results = {}

In [None]:
for name, model in models:
    print(f"\nEvaluating {name}...")

    # Create the full pipeline for the current model
    current_pipeline = make_full_pipeline(model, borough_to_praha, unique_words)

    # Perform cross-validation with MAPE
    cv_scores_mape = cross_val_score(current_pipeline, X, y, cv=kf, scoring=mape, error_score='raise')
    cv_mape = -cv_scores_mape # Multiply by -1 because make_scorer returns negative scores for metrics where lower is better

    mean_mape = np.mean(cv_mape)
    std_mape = np.std(cv_mape)

    results[name] = {"mean_mape": mean_mape, "std_mape": std_mape}

    print(f"  Mean CV MAPE: {mean_mape:.4f}")
    print(f"  Std CV MAPE: {std_mape:.4f}")

print("\n--- All Model Results ---")
for name, metrics in results.items():
    print(f"{name}: Mean MAPE = {metrics['mean_mape']:.4f}, Std MAPE = {metrics['std_mape']:.4f}")


Evaluating RandomForestRegressor...


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


  Mean CV MAPE: 0.1680
  Std CV MAPE: 0.0038

Evaluating GradientBoostingRegressor...


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


  Mean CV MAPE: 0.1743
  Std CV MAPE: 0.0036

Evaluating LinearRegression...


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


  Mean CV MAPE: 0.2006
  Std CV MAPE: 0.0049

Evaluating XGBRegressor...


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


  Mean CV MAPE: 0.1640
  Std CV MAPE: 0.0045

Evaluating LGBMRegressor...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1946
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9627962.017750


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9556175.422500


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9602417.782250


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1949
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9603492.210500


  .replace({"Yes": 1, "No": 0})
  .replace({"Yes": 1, "No": 0})


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1942
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9612858.139000


  .replace({"Yes": 1, "No": 0})


  Mean CV MAPE: 0.1603
  Std CV MAPE: 0.0031

--- All Model Results ---
RandomForestRegressor: Mean MAPE = 0.1680, Std MAPE = 0.0038
GradientBoostingRegressor: Mean MAPE = 0.1743, Std MAPE = 0.0036
LinearRegression: Mean MAPE = 0.2006, Std MAPE = 0.0049
XGBRegressor: Mean MAPE = 0.1640, Std MAPE = 0.0045
LGBMRegressor: Mean MAPE = 0.1603, Std MAPE = 0.0031




In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold


# ===============================================================
# ----------- 1) CUSTOM TRANSFORMERS WITH YOUR LOGIC ----------
# ===============================================================

class BasicImputer(BaseEstimator, TransformerMixin):
    """
    garden_area, balcony_area, cellar_area → fillna(0)
    elevator → Yes/No + fillna, cast
    parking → fillna(0)
    sentinel imputation for POI columns
    """
    def __init__(self, nearest_cols, impute_val=9999):
        self.nearest_cols = nearest_cols
        self.impute_val = impute_val

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Simple 0-fill
        for col in ["garden_area", "balcony_area", "cellar_area"]:
            X[col] = X[col].fillna(0)

        # Elevator and parking
        # Use .map() instead of .replace() for explicit mapping to avoid FutureWarning
        X["elevator"] = (
            X["elevator"]
            .map({"Yes": 1, "No": 0})
            .fillna(0)
            .astype(int)
        )
        X["parking"] = X["parking"].fillna(0).astype(int)

        # POI sentinel imputation
        for c in self.nearest_cols:
            X[c] = X[c].fillna(self.impute_val)
            X[c + "_exists"] = (X[c] != self.impute_val).astype(int)

        return X


class FloorsTransformer(BaseEstimator, TransformerMixin):
    """
    Fix total_floors; add is_ground, is_topfloor, floor_ratio
    """
    def fit(self, X, y=None):
        self.med = X["total_floors"].median()
        return self

    def transform(self, X):
        X = X.copy()
        X["total_floors"] = X["total_floors"].fillna(self.med)
        X["total_floors"] = X[["total_floors", "floor"]].max(axis=1)

        X["is_ground"] = (X["floor"] <= 1).astype(int)
        X["is_topfloor"] = (X["floor"] == X["total_floors"]).astype(int)
        X["floor_ratio"] = X["floor"] / X["total_floors"]

        return X


class RoomsLayoutTransformer(BaseEstimator, TransformerMixin):
    """
    Extract number of rooms + is_kk
    """
    def fit(self, X, y=None):
        # median for NA rooms
        rooms = X["layout"].map(self.extract_rooms)
        self.med = rooms.median()
        return self

    def extract_rooms(self, x):
        m = re.match(r"\s*(\d+)\s*\+", str(x))
        return int(m.group(1)) if m else np.nan

    def transform(self, X):
        X = X.copy()
        X["rooms"] = X["layout"].map(self.extract_rooms)
        X["rooms"] = X["rooms"].fillna(self.med).astype(int)
        X["is_kk"] = X["layout"].str.contains("kk", case=False, na=False).astype(int)
        return X


class DateTransformer(BaseEstimator, TransformerMixin):
    """
    Convert first_seen, last_seen → datetime, compute listing_days
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["first_seen_dt"] = pd.to_datetime(X["first_seen"], errors="coerce")
        X["last_seen_dt"] = pd.to_datetime(X["last_seen"], errors="coerce")
        X["listing_days"] = (X["last_seen_dt"] - X["first_seen_dt"]).dt.days
        X["listing_days"] = X["listing_days"].clip(lower=0).fillna(0)
        return X


# helper functions for borough extraction
DASH = r"[\-\u2012-\u2015\u2212]"
p_praha_dash_praha_num = re.compile(r"Praha\s*-\s*Praha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)
p_num_bor = re.compile(r"Praha\s*(?P<num>\d{1,2})\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)
p_bor_only = re.compile(r"Praha\s*-\s*(?P<bor>(?!Praha\b)[^,]+)", re.IGNORECASE)
p_num_only = re.compile(r"\bPraha\s*(?P<num>\d{1,2})\b", re.IGNORECASE)

def normalize_addr(s: str) -> str:
    if not isinstance(s, str): return ""
    s = re.sub(DASH, "-", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def strip_accents(text):
    if isinstance(text, str):
        return ''.join(c for c in unicodedata.normalize('NFKD', text)
                       if not unicodedata.combining(c))
    return text

def extract_from_address(addr: str):
    s = normalize_addr(addr)
    if not s:
        return (np.nan, np.nan)

    m = p_praha_dash_praha_num.search(s)
    if m: return int(m.group("num")), np.nan

    m = p_num_bor.search(s)
    if m: return int(m.group("num")), m.group("bor").strip()

    m = p_bor_only.search(s)
    if m: return np.nan, m.group("bor").strip()

    m = p_num_only.search(s)
    if m: return int(m.group("num")), np.nan

    m = re.search(r"-\s*([^,]+)$", s)
    if m: return np.nan, m.group(1).strip()

    return (np.nan, np.nan)


class BoroughTransformer(BaseEstimator, TransformerMixin):
    """
    Extract praha_num, borough_norm & map missing praha_num using the dictionary.
    """
    def __init__(self, borough_to_praha: dict):
        self.borough_to_praha = borough_to_praha

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        tmp = X["address"].apply(lambda s: pd.Series(extract_from_address(s)))
        X["praha_num"] = tmp.iloc[:, 0]
        X["borough"] = tmp.iloc[:, 1]

        X["borough_norm"] = X["borough"].apply(strip_accents).str.lower()

        mask = X["praha_num"].isna() & X["borough_norm"].notna()
        X.loc[mask, "praha_num"] = X.loc[mask, "borough_norm"].map(self.borough_to_praha)

        return X

class UniqueDescriptionTransformer(BaseEstimator, TransformerMixin):
    """
    Adds a 'unique_in_desc' column based on whether certain unique words are present in the 'text' column.
    """
    def __init__(self, unique_words):
        self.unique_words = unique_words
        self.combined_regex = None

    def fit(self, X, y=None):
        # Pre-compile the regex pattern during fit for efficiency
        # Use a non-capturing group (?:...) to avoid UserWarning about match groups with str.contains
        combined_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in self.unique_words) + r')\b'
        self.combined_regex = re.compile(combined_pattern, re.IGNORECASE)
        return self

    def transform(self, X):
        X = X.copy()
        if self.combined_regex is None:
            raise RuntimeError("Transformer has not been fitted. Call fit() before transform().")

        # Create the new boolean column 'unique_in_desc'
        # It's important to use .astype(str) as 'text' column might have NaNs
        X['unique_in_desc'] = X['text'].astype(str).str.contains(self.combined_regex, na=False)
        return X

In [None]:
for name, model in models:
    print(f"\nEvaluating {name}...")

    # Create the full pipeline for the current model
    current_pipeline = make_full_pipeline(model, borough_to_praha, unique_words)

    # Perform cross-validation with MAPE
    cv_scores_mape = cross_val_score(current_pipeline, X, y, cv=kf, scoring=mape, error_score='raise')
    cv_mape = -cv_scores_mape # Multiply by -1 because make_scorer returns negative scores for metrics where lower is better

    mean_mape = np.mean(cv_mape)
    std_mape = np.std(cv_mape)

    results[name] = {"mean_mape": mean_mape, "std_mape": std_mape}

    print(f"  Mean CV MAPE: {mean_mape:.4f}")
    print(f"  Std CV MAPE: {std_mape:.4f}")

print("\n--- All Model Results ---")
for name, metrics in results.items():
    print(f"{name}: Mean MAPE = {metrics['mean_mape']:.4f}, Std MAPE = {metrics['std_mape']:.4f}")


Evaluating RandomForestRegressor...




  Mean CV MAPE: 0.1680
  Std CV MAPE: 0.0038

Evaluating GradientBoostingRegressor...




  Mean CV MAPE: 0.1743
  Std CV MAPE: 0.0036

Evaluating LinearRegression...




  Mean CV MAPE: 0.2006
  Std CV MAPE: 0.0049

Evaluating XGBRegressor...




  Mean CV MAPE: 0.1640
  Std CV MAPE: 0.0045

Evaluating LGBMRegressor...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1946
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9627962.017750




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9556175.422500




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9602417.782250




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1949
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9603492.210500




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1942
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9612858.139000
  Mean CV MAPE: 0.1603
  Std CV MAPE: 0.0031

--- All Model Results ---
RandomForestRegressor: Mean MAPE = 0.1680, Std MAPE = 0.0038
GradientBoostingRegressor: Mean MAPE = 0.1743, Std MAPE = 0.0036
LinearRegression: Mean MAPE = 0.2006, Std MAPE = 0.0049
XGBRegressor: Mean MAPE = 0.1640, Std MAPE = 0.0045
LGBMRegressor: Mean MAPE = 0.1603, Std MAPE = 0.0031




**Reasoning**:
The previous output showed `FutureWarning` messages from `sklearn.pipeline` indicating that the pipeline instance was not fitted yet, even though `cross_val_score` handles fitting internally. To suppress these informational warnings and clean up the output, I will use Python's `warnings` module to filter them specifically for the `sklearn.pipeline` module.



In [None]:
import warnings

for name, model in models:
    print(f"\nEvaluating {name}...")

    # Create the full pipeline for the current model
    current_pipeline = make_full_pipeline(model, borough_to_praha, unique_words)

    # Perform cross-validation with MAPE, suppressing FutureWarnings from sklearn.pipeline
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        cv_scores_mape = cross_val_score(current_pipeline, X, y, cv=kf, scoring=mape, error_score='raise')
    cv_mape = -cv_scores_mape # Multiply by -1 because make_scorer returns negative scores for metrics where lower is better

    mean_mape = np.mean(cv_mape)
    std_mape = np.std(cv_mape)

    results[name] = {"mean_mape": mean_mape, "std_mape": std_mape}

    print(f"  Mean CV MAPE: {mean_mape:.4f}")
    print(f"  Std CV MAPE: {std_mape:.4f}")

print("\n--- All Model Results ---")
for name, metrics in results.items():
    print(f"{name}: Mean MAPE = {metrics['mean_mape']:.4f}, Std MAPE = {metrics['std_mape']:.4f}")


Evaluating RandomForestRegressor...
  Mean CV MAPE: 0.1680
  Std CV MAPE: 0.0038

Evaluating GradientBoostingRegressor...
  Mean CV MAPE: 0.1743
  Std CV MAPE: 0.0036

Evaluating LinearRegression...
  Mean CV MAPE: 0.2006
  Std CV MAPE: 0.0049

Evaluating XGBRegressor...
  Mean CV MAPE: 0.1640
  Std CV MAPE: 0.0045

Evaluating LGBMRegressor...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1946
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9627962.017750




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9556175.422500




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9602417.782250




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1949
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 73
[LightGBM] [Info] Start training from score 9603492.210500




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1942
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9612858.139000
  Mean CV MAPE: 0.1603
  Std CV MAPE: 0.0031

--- All Model Results ---
RandomForestRegressor: Mean MAPE = 0.1680, Std MAPE = 0.0038
GradientBoostingRegressor: Mean MAPE = 0.1743, Std MAPE = 0.0036
LinearRegression: Mean MAPE = 0.2006, Std MAPE = 0.0049
XGBRegressor: Mean MAPE = 0.1640, Std MAPE = 0.0045
LGBMRegressor: Mean MAPE = 0.1603, Std MAPE = 0.0031




In [None]:
import warnings

for name, model in models:
    print(f"\nEvaluating {name}...")

    # Create the full pipeline for the current model
    current_pipeline = make_full_pipeline(model, borough_to_praha, unique_words)

    # Perform cross-validation with MAPE, suppressing FutureWarnings and UserWarnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        warnings.simplefilter("ignore", UserWarning)
        cv_scores_mape = cross_val_score(current_pipeline, X, y, cv=kf, scoring=mape, error_score='raise')
    cv_mape = -cv_scores_mape # Multiply by -1 because make_scorer returns negative scores for metrics where lower is better

    mean_mape = np.mean(cv_mape)
    std_mape = np.std(cv_mape)

    results[name] = {"mean_mape": mean_mape, "std_mape": std_mape}

    print(f"  Mean CV MAPE: {mean_mape:.4f}")
    print(f"  Std CV MAPE: {std_mape:.4f}")

print("\n--- All Model Results ---")
for name, metrics in results.items():
    print(f"{name}: Mean MAPE = {metrics['mean_mape']:.4f}, Std MAPE = {metrics['std_mape']:.4f}")


Evaluating RandomForestRegressor...
  Mean CV MAPE: 0.1680
  Std CV MAPE: 0.0038

Evaluating GradientBoostingRegressor...
  Mean CV MAPE: 0.1743
  Std CV MAPE: 0.0036

Evaluating LinearRegression...
  Mean CV MAPE: 0.2006
  Std CV MAPE: 0.0049

Evaluating XGBRegressor...
  Mean CV MAPE: 0.1640
  Std CV MAPE: 0.0045

Evaluating LGBMRegressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1946
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 71
[LightGBM] [Info] Start training from score 9627962.017750
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1952
[LightGBM] [Info] Number of data points in the

## Compare Model Performance



In [None]:
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.index.name = 'Model'
results_df = results_df.reset_index()
results_df.columns = ['Model', 'Mean MAPE', 'Std MAPE']

# Sort by Mean MAPE in ascending order
results_df = results_df.sort_values(by='Mean MAPE')

print("\n--- Comparative Model Performance (Sorted by Mean MAPE) ---")
print(results_df)


--- Comparative Model Performance (Sorted by Mean MAPE) ---
                       Model  Mean MAPE  Std MAPE
4              LGBMRegressor   0.160347  0.003092
3               XGBRegressor   0.163993  0.004540
0      RandomForestRegressor   0.167998  0.003772
1  GradientBoostingRegressor   0.174311  0.003552
2           LinearRegression   0.200640  0.004851


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

print("Imported RandomizedSearchCV and scipy.stats distributions.")

Imported RandomizedSearchCV and scipy.stats distributions.


In [None]:
lgbm_base = LGBMRegressor(random_state=42, verbose=-1) # verbose=-1 to suppress LightGBM warnings during search

# Create the full pipeline for the base LGBM model
pipeline_lgbm = make_full_pipeline(lgbm_base, borough_to_praha, unique_words)

# Define the parameter distribution for LGBMRegressor
param_dist_lgbm = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.3),
    'model__num_leaves': randint(20, 60),
    'model__max_depth': randint(5, 15),
    'model__reg_alpha': uniform(0, 0.5),
    'model__reg_lambda': uniform(0, 0.5),
    'model__min_child_samples': randint(20, 50),
    'model__subsample': uniform(0.6, 1.0),
    'model__colsample_bytree': uniform(0.6, 1.0),
}

# Instantiate RandomizedSearchCV
lgbm_random_search = RandomizedSearchCV(
    estimator=pipeline_lgbm,
    param_distributions=param_dist_lgbm,
    n_iter=50, # Number of parameter settings that are sampled
    cv=kf,
    scoring=mape,
    random_state=42,
    verbose=1,
    n_jobs=-1 # Use all available cores
)

# Fit RandomizedSearchCV to the training data
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    warnings.simplefilter("ignore", UserWarning)
    lgbm_random_search.fit(X, y)

# Print the best parameters and best score
print("\nBest parameters found:", lgbm_random_search.best_params_)
print("Best MAPE score (negative is better before multiplying by -1):", -lgbm_random_search.best_score_)

# Store the best estimator
tuned_lgbm_model = lgbm_random_search.best_estimator_
print("Tuned LGBM model stored.")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


205 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
115 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/sklearn.py", line 13


Best parameters found: {'model__colsample_bytree': np.float64(0.8395618906669724), 'model__learning_rate': np.float64(0.05346846162736693), 'model__max_depth': 14, 'model__min_child_samples': 25, 'model__n_estimators': 515, 'model__num_leaves': 23, 'model__reg_alpha': np.float64(0.040426663166357624), 'model__reg_lambda': np.float64(0.18482722803070223), 'model__subsample': np.float64(0.8421599382774259)}
Best MAPE score (negative is better before multiplying by -1): 0.1593651923023014
Tuned LGBM model stored.


In [None]:
lgbm_base = LGBMRegressor(random_state=42, verbose=-1) # verbose=-1 to suppress LightGBM warnings during search

# Create the full pipeline for the base LGBM model
pipeline_lgbm = make_full_pipeline(lgbm_base, borough_to_praha, unique_words)

# Define the parameter distribution for LGBMRegressor
param_dist_lgbm = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.3),
    'model__num_leaves': randint(20, 60),
    'model__max_depth': randint(5, 15),
    'model__reg_alpha': uniform(0, 0.5),
    'model__reg_lambda': uniform(0, 0.5),
    'model__min_child_samples': randint(20, 50),
    'model__subsample': uniform(0.6, 0.4), # Corrected range: values from 0.6 to 0.6 + 0.4 = 1.0
    'model__colsample_bytree': uniform(0.6, 0.4), # Corrected range: values from 0.6 to 0.6 + 0.4 = 1.0
}

# Instantiate RandomizedSearchCV
lgbm_random_search = RandomizedSearchCV(
    estimator=pipeline_lgbm,
    param_distributions=param_dist_lgbm,
    n_iter=50, # Number of parameter settings that are sampled
    cv=kf,
    scoring=mape,
    random_state=42,
    verbose=1,
    n_jobs=-1 # Use all available cores
)

# Fit RandomizedSearchCV to the training data
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    warnings.simplefilter("ignore", UserWarning)
    lgbm_random_search.fit(X, y)

# Print the best parameters and best score
print("\nBest parameters found:", lgbm_random_search.best_params_)
print("Best MAPE score (negative is better before multiplying by -1):", -lgbm_random_search.best_score_)

# Store the best estimator
tuned_lgbm_model = lgbm_random_search.best_estimator_
print("Tuned LGBM model stored.")

Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best parameters found: {'model__colsample_bytree': np.float64(0.695824756266789), 'model__learning_rate': np.float64(0.05346846162736693), 'model__max_depth': 14, 'model__min_child_samples': 25, 'model__n_estimators': 515, 'model__num_leaves': 23, 'model__reg_alpha': np.float64(0.040426663166357624), 'model__reg_lambda': np.float64(0.18482722803070223), 'model__subsample': np.float64(0.6968639753109703)}
Best MAPE score (negative is better before multiplying by -1): 0.15919373182184965
Tuned LGBM model stored.


In [87]:
from sklearn.neural_network import MLPRegressor

# Instantiate MLPRegressor
# Using some reasonable default parameters for a first pass
# random_state is important for reproducibility
mlp_base = MLPRegressor(random_state=42, max_iter=500, learning_rate_init=0.001)

# Create the full pipeline for the MLPRegressor
pipeline_mlp = make_full_pipeline(mlp_base, borough_to_praha, unique_words)

print("\nEvaluating MLPRegressor...")

# Perform cross-validation with MAPE, suppressing warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    warnings.simplefilter("ignore", UserWarning)
    cv_scores_mape_mlp = cross_val_score(pipeline_mlp, X, y, cv=kf, scoring=mape, error_score='raise')
cv_mape_mlp = -cv_scores_mape_mlp # Multiply by -1 for positive MAPE

mean_mape_mlp = np.mean(cv_mape_mlp)
std_mape_mlp = np.std(cv_mape_mlp)

# Store the results
results["MLPRegressor"] = {"mean_mape": mean_mape_mlp, "std_mape": std_mape_mlp}

print(f"  Mean CV MAPE for MLPRegressor: {mean_mape_mlp:.4f}")
print(f"  Std CV MAPE for MLPRegressor: {std_mape_mlp:.4f}")

# Add the tuned LGBM model to results for comparison
# Note: tuned_lgbm_model is a pipeline, so we need to extract its model's performance
# The best_score_ from RandomizedSearchCV already gives the CV performance for the tuned model.
results["LGBMRegressor (Tuned)"] = {"mean_mape": -lgbm_random_search.best_score_, "std_mape": np.nan} # Std dev not directly available from best_score_ but can be computed from cv_results_ if needed.

print("\n--- Updated All Model Results ---")
for name, metrics in results.items():
    print(f"{name}: Mean MAPE = {metrics['mean_mape']:.4f}, Std MAPE = {metrics['std_mape']:.4f}")


Evaluating MLPRegressor...
  Mean CV MAPE for MLPRegressor: 0.7292
  Std CV MAPE for MLPRegressor: 0.0115

--- Updated All Model Results ---
RandomForestRegressor: Mean MAPE = 0.1680, Std MAPE = 0.0038
GradientBoostingRegressor: Mean MAPE = 0.1743, Std MAPE = 0.0036
LinearRegression: Mean MAPE = 0.2006, Std MAPE = 0.0049
XGBRegressor: Mean MAPE = 0.1640, Std MAPE = 0.0045
LGBMRegressor: Mean MAPE = 0.1603, Std MAPE = 0.0031
MLPRegressor: Mean MAPE = 0.7292, Std MAPE = 0.0115
LGBMRegressor (Tuned): Mean MAPE = 0.1592, Std MAPE = nan


In [88]:
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.index.name = 'Model'
results_df = results_df.reset_index()
results_df.columns = ['Model', 'Mean MAPE', 'Std MAPE']

# Sort by Mean MAPE in ascending order
results_df = results_df.sort_values(by='Mean MAPE')

print("\n--- Comparative Model Performance (Sorted by Mean MAPE) ---")
print(results_df)


--- Comparative Model Performance (Sorted by Mean MAPE) ---
                       Model  Mean MAPE  Std MAPE
6      LGBMRegressor (Tuned)   0.159194       NaN
4              LGBMRegressor   0.160347  0.003092
3               XGBRegressor   0.163993  0.004540
0      RandomForestRegressor   0.167998  0.003772
1  GradientBoostingRegressor   0.174311  0.003552
2           LinearRegression   0.200640  0.004851
5               MLPRegressor   0.729189  0.011485


# Prediction for test data


In [89]:
import pandas as pd

# Make predictions on the test set using the final tuned LGBM model
final_predictions = tuned_lgbm_model.predict(test)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({
    'id': test['id'],
    'predicted_price': final_predictions
})

# Display the predictions
display(predictions_df.head())



Unnamed: 0,id,predicted_price
0,8795,6192617.0
1,6516,10082950.0
2,4714,6542120.0
3,8423,9816702.0
4,5361,9530782.0
