In [57]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings("ignore")

# =======================
# DATA PREPROCESSING
# =======================

In [71]:
def preprocess_data(df):
    # Copy dataframe to avoid modifying original
    df_clean = df.copy()

    # 1. Filter data
    df_clean = df_clean[df_clean["Waluta"] != "EURO"]
    df_clean = df_clean[~df_clean["Rodzaj_paliwa"].isin(["Electric", "LPG"])]

    # 2. Remove unnecessary columns
    cols_to_drop = [
        "Data_pierwszej_rejestracji",
        "Data_publikacji_oferty",
        "Wyposazenie",
        "Stan",
        "Lokalizacja_oferty",
        "Waluta",
        "Marka_pojazdu",
        "Model_pojazdu",
        "Generacja_pojazdu",
        "Pierwszy_wlasciciel",
        "Wersja_pojazdu",
        "Kraj_pochodzenia",
    ]
    df_clean = df_clean.drop(
        columns=[col for col in cols_to_drop if col in df_clean.columns]
    )

    # 3. Feature engineering
    current_year = pd.to_datetime("today").year
    df_clean["Wiek"] = current_year - df_clean["Rok_produkcji"]
    df_clean["Usage_Rate"] = df_clean["Przebieg_km"] / df_clean["Wiek"]
    df_clean = df_clean.drop(columns=["Przebieg_km", "Wiek", "Rok_produkcji"])

    # 4. Handle missing values
    df_clean = df_clean.fillna(method="ffill").dropna()

    return df_clean


# Load and preprocess data
train_df = pd.read_csv("data/sales_ads_train.csv")
test_df = pd.read_csv("data/sales_ads_test.csv")

# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# =======================
# FEATURE ENGINEERING
# =======================

In [72]:
# Separate features and target
X_train = train_df.drop(columns=["Cena"])
y_train = np.log1p(train_df["Cena"])  # Log transformation

X_test = test_df.drop(columns=["Cena"], errors="ignore")
if "Cena" in test_df.columns:
    y_test = np.log1p(test_df["Cena"])
else:
    y_test = None

# Identify feature types
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()

# Frequency encoding for high cardinality features
high_card_cols = [col for col in cat_cols if X_train[col].nunique() > 20]
freq_maps = {}

for col in high_card_cols:
    freq_map = X_train[col].value_counts(normalize=True)
    freq_maps[col] = freq_map
    X_train[col + "_freq"] = X_train[col].map(freq_map).fillna(0)
    X_train = X_train.drop(col, axis=1)

    # Apply same mapping to test data
    X_test[col + "_freq"] = X_test[col].map(freq_map).fillna(0)
    X_test = X_test.drop(col, axis=1)

# Update categorical columns after frequency encoding
cat_cols = list(set(cat_cols) - set(high_card_cols))

# One-hot encoding for remaining categorical features
ohe = OneHotEncoder(sparse_output=True, handle_unknown="ignore")
if cat_cols:
    ohe.fit(X_train[cat_cols])
    train_ohe = ohe.transform(X_train[cat_cols])
    test_ohe = ohe.transform(X_test[cat_cols])
else:
    train_ohe = csr_matrix((len(X_train), 0))
    test_ohe = csr_matrix((len(X_test), 0))

# Combine numerical and encoded features
X_train_final = hstack([csr_matrix(X_train[num_cols].values), train_ohe])
X_test_final = hstack([csr_matrix(X_test[num_cols].values), test_ohe])

# =======================
# MODEL TRAINING
# =======================

In [73]:
# Split validation set
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_final, y_train, test_size=0.2, random_state=42
)

# XGBoost hyperparameter tuning
xgb = XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=-1)
xgb_params = {
    "n_estimators": [500, 800, 1000],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8],
    "gamma": [0, 0.1, 0.3],
}

xgb_search = RandomizedSearchCV(
    xgb,
    xgb_params,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=3,
    random_state=42,
    n_jobs=-1,
)
xgb_search.fit(X_tr, y_tr)
best_xgb = xgb_search.best_estimator_

# Random Forest training
rf = RandomForestRegressor(
    n_estimators=300, max_depth=15, min_samples_split=5, random_state=42, n_jobs=-1
)
rf.fit(X_tr, y_tr)

# =======================
# MODEL EVALUATION
# =======================

In [74]:
# Modified evaluation function
def evaluate_model(model, X, y_true_log):
    y_pred_log = model.predict(X)
    y_true = np.expm1(y_true_log)  # Convert back to original scale
    y_pred = np.expm1(y_pred_log)  # Convert back to original scale

    rmse_log = np.sqrt(mean_squared_error(y_true_log, y_pred_log))
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    return rmse_log, rmse, r2, y_pred


# Updated evaluation section
xgb_rmse_log, xgb_rmse, xgb_r2, xgb_pred = evaluate_model(best_xgb, X_val, y_val)
rf_rmse_log, rf_rmse, rf_r2, rf_pred = evaluate_model(rf, X_val, y_val)

# Ensemble evaluation
ensemble_pred = xgb_pred * 0.6 + rf_pred * 0.4
ensemble_rmse = np.sqrt(mean_squared_error(np.expm1(y_val), ensemble_pred))
ensemble_r2 = r2_score(np.expm1(y_val), ensemble_pred)

print("\nValidation Results:")
print(f"XGBoost RMSE (log): {xgb_rmse_log:.4f} | XGBoost RMSE (PLN): {xgb_rmse:,.2f}")
print(f"Random Forest RMSE (log): {rf_rmse_log:.4f} | RF RMSE (PLN): {rf_rmse:,.2f}")
print(f"Ensemble RMSE (PLN): {ensemble_rmse:,.2f}")


Validation Results:
XGBoost RMSE (log): 0.4522 | XGBoost RMSE (PLN): 41,629.75
Random Forest RMSE (log): 0.4623 | RF RMSE (PLN): 41,149.36
Ensemble RMSE (PLN): 40,548.19
