# Price Prediction Modeling

In this notebook, we build and evaluate machine learning models to predict property prices in Portugal based on location, area, and typology.

In [ ]:
import os
import pandas as pd
import numpy as np
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv

# Preprocessing & Metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import category_encoders as ce

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Load environment variables from ../.env; the DB_* values are read later
# through os.getenv when the database connection is opened.
load_dotenv("../.env")
# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

## 1. Data Loading & Initial Cleaning

In [ ]:
def get_db_connection():
    """Open a database connection configured from environment variables.

    Reads DB_HOST (falling back to 'localhost'), DB_NAME, DB_USER and
    DB_PASSWORD from the environment and passes them to psycopg2.connect.

    Returns:
        The connection object returned by psycopg2.connect. The caller is
        responsible for closing it.
    """
    connection_settings = {
        'host': os.getenv('DB_HOST', 'localhost'),
        'database': os.getenv('DB_NAME'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
    }
    return psycopg2.connect(**connection_settings)

# Load only the columns used for modeling. The connection is closed in a
# `finally` clause so it is released even if the query raises.
conn = get_db_connection()
try:
    query = "SELECT price, distrito, concelho, freguesia, area_m2, room_count FROM properties"
    df = pd.read_sql(query, conn)
finally:
    conn.close()

# Apply basic filters identified in EDA: keep rows with strictly positive
# price and area. Rows where either value is missing are dropped as well,
# because a NaN comparison evaluates to False.
df = df[(df['price'] > 0) & (df['area_m2'] > 0)]

def remove_outliers(df, column, factor=1.5):
    """Drop rows whose `column` value lies outside the IQR fences.

    The fences are Q1 - factor * IQR and Q3 + factor * IQR, where Q1 and Q3
    are the 25th and 75th percentiles of `column` and IQR = Q3 - Q1. Both
    fences are inclusive.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of the numeric column to inspect.
        factor: Multiple of the IQR used to place the fences. Defaults to
            1.5, the value that was previously hard-coded.

    Returns:
        A DataFrame with only the rows whose value falls within the fences.
        Rows where the value is NaN are dropped too, since NaN never
        satisfies the range check.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    # Series.between is inclusive on both ends by default (>= and <=).
    return df[values.between(q1 - margin, q3 + margin)]

# Filter one column at a time: price first, then area on the rows that
# survived the price filter (so the area fences come from the reduced data).
df_clean = df
for outlier_column in ('price', 'area_m2'):
    df_clean = remove_outliers(df_clean, outlier_column)

print(f"Dataset size after cleaning: {len(df_clean)}")

## 2. Train-Validation-Test Split

We split the data into 70% training, 15% validation (for comparing models), and 15% testing (final evaluation).

In [ ]:
# Separate the target column from the predictors.
y = df_clean['price']
X = df_clean.drop(columns='price')

# First split: hold out 15% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Second split: 0.1765 of the remaining 85% is roughly 15% of the full data,
# which becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=42
)

print(f"Train size: {len(X_train)}")
print(f"Val size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

## 3. Preprocessing Pipeline

- **Numeric**: Impute missing values with the median, then standardize `area_m2` and `room_count`.
- **High Cardinality Cat**: `TargetEncoder` for `freguesia` and `concelho`.
- **Low Cardinality Cat**: `OneHotEncoder` for `distrito`.

In [ ]:
# Column groups, one per preprocessing strategy.
numeric_features = ['area_m2', 'room_count']
target_encoded_features = ['freguesia', 'concelho']
onehot_features = ['distrito']

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Transformers are listed as numeric, target-encoded, then one-hot.
# handle_unknown='ignore' lets the one-hot encoder accept categories at
# predict time that were not present when it was fitted.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('target', ce.TargetEncoder(), target_encoded_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features),
])

## 4. Model Benchmarking (Default Params)

Evaluating Linear Regression, Random Forest, and XGBoost with default parameters.

In [ ]:
# Candidate regressors with library-default hyperparameters; the two
# tree-based models are seeded for reproducibility.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
}

results = []

for name, model in models.items():
    # Fit preprocessing + model on the training split, predict on validation.
    clf = Pipeline([('preprocessor', preprocessor), ('regressor', model)])
    y_pred = clf.fit(X_train, y_train).predict(X_val)

    # Validation metrics for this candidate.
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)

    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R2": r2})
    print(f"{name} - R2: {r2:.4f}")

# Last expression of the cell: rendered as the comparison table.
pd.DataFrame(results)

## 5. Hyperparameter Tuning

We use `RandomizedSearchCV` for efficiency given the large parameter space.

In [ ]:
# Random Forest Tuning
# Search space sampled by RandomizedSearchCV. The `regressor__` prefix routes
# each parameter to the pipeline step named 'regressor'.
rf_param_grid = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

rf_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', RandomForestRegressor(random_state=42))])

# 15 sampled candidates, each scored with 3-fold cross-validation on the
# training split; the best candidate is refit on all of X_train.
rf_search = RandomizedSearchCV(rf_pipe, rf_param_grid, n_iter=15, cv=3, scoring='r2', n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

print(f"Best RF Params: {rf_search.best_params_}")
# best_score_ is the mean cross-validated R2 over the training folds, not a
# score on the held-out validation split, so it is labelled as CV. The
# validation R2 of the refit best estimator is reported separately.
print(f"Best RF CV R2: {rf_search.best_score_:.4f}")
print(f"Best RF Val R2: {rf_search.score(X_val, y_val):.4f}")

In [ ]:
# XGBoost Tuning
# Search space sampled by RandomizedSearchCV. The `regressor__` prefix routes
# each parameter to the pipeline step named 'regressor'.
xgb_param_grid = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__max_depth': [3, 6, 10],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 1.0]
}

xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', XGBRegressor(random_state=42))])

# 15 sampled candidates, each scored with 3-fold cross-validation on the
# training split; the best candidate is refit on all of X_train.
xgb_search = RandomizedSearchCV(xgb_pipe, xgb_param_grid, n_iter=15, cv=3, scoring='r2', n_jobs=-1, random_state=42)
xgb_search.fit(X_train, y_train)

print(f"Best XGB Params: {xgb_search.best_params_}")
# best_score_ is the mean cross-validated R2 over the training folds, not a
# score on the held-out validation split, so it is labelled as CV. The
# validation R2 of the refit best estimator is reported separately.
print(f"Best XGB CV R2: {xgb_search.best_score_:.4f}")
print(f"Best XGB Val R2: {xgb_search.score(X_val, y_val):.4f}")

## 6. Final Evaluation & Feature Importance

Testing the overall best model on the unseen Test set.

In [ ]:
# Pick the tuned model with the higher mean cross-validated R2.
# A tie falls through to XGBoost.
best_rf_score = rf_search.best_score_
best_xgb_score = xgb_search.best_score_

if best_rf_score > best_xgb_score:
    best_model = rf_search.best_estimator_
    print("Best model found: Random Forest")
else:
    best_model = xgb_search.best_estimator_
    print("Best model found: XGBoost")

# The test split is only touched here, after model selection.
y_test_pred = best_model.predict(X_test)

print("--- Final Test Set Metrics ---")
print(f"MAE:  {mean_absolute_error(y_test, y_test_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.2f}")
print(f"R2:   {r2_score(y_test, y_test_pred):.4f}")

# Feature Importance
importances = best_model.named_steps['regressor'].feature_importances_
# Look the fitted one-hot encoder up by its transformer name instead of by
# position, so the lookup keeps working if the transformer list is reordered.
fitted_preprocessor = best_model.named_steps['preprocessor']
onehot_encoder = fitted_preprocessor.named_transformers_['onehot']
cat_features = list(onehot_encoder.get_feature_names_out())
# The names must follow the transformer order (num, target, onehot).
# NOTE(review): this assumes the target encoder emits exactly one column per
# input column; pd.Series below raises if the lengths do not match.
feature_names = numeric_features + target_encoded_features + cat_features

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(15)

plt.figure(figsize=(10, 8))
feat_imp.plot(kind='barh', color='teal')
plt.title('Top 15 Feature Importances (Best Model)')
# barh draws the first entry at the bottom; invert so the largest is on top.
plt.gca().invert_yaxis()
plt.show()