In [1]:
import pandas as pd
import joblib
import logging
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    median_absolute_error,
    explained_variance_score,
    mean_absolute_percentage_error
)

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 1. Data Loading
try:
    df = pd.read_csv('apartments_sqm.csv')
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("File 'apartments_sqm.csv' not found.")
    raise
except Exception as e:
    logging.error(f"Error loading data: {e}")
    raise

# 2. Data Preprocessing
# Define feature categories
cat_features = ['state_building', 'zip_code', 'province', "heating_type"]
num_features = ["total_area_sqm", "construction_year", "nbr_bedrooms", "terrace_sqm"]
dummy_features = ["fl_furnished", "fl_terrace", "fl_double_glazing"]

# Separate features and target
X = df[num_features + dummy_features + cat_features]
y = np.log1p(df['price'])  # Log transformation of the target variable

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=535
)
logging.info("Data split into training and testing sets.")

# Handle rare categories
for cat in cat_features:
    top_categories = X_train[cat].value_counts().nlargest(50).index
    X_train[cat] = X_train[cat].apply(lambda x: str(x) if x in top_categories else 'Other')
    X_test[cat] = X_test[cat].apply(lambda x: str(x) if x in top_categories else 'Other')
    logging.info(f"Handled rare categories in '{cat}'.")

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), cat_features),
        ('dummy', 'passthrough', dummy_features)
    ]
)

# **Create a full pipeline with the preprocessor and model**
model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    eval_metric='mae',
    n_estimators=200,
    learning_rate=0.2,
    max_depth=6,
    random_state=535,
    reg_alpha=1
)

# Combine preprocessor and model into a pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the pipeline
model_pipeline.fit(X_train, y_train)
logging.info("Model training completed.")

# 3. Model Evaluation
y_pred_test = model_pipeline.predict(X_test)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
median_ae = median_absolute_error(y_test, y_pred_test)
explained_variance = explained_variance_score(y_test, y_pred_test)
mape = mean_absolute_percentage_error(y_test, y_pred_test)

# Log evaluation metrics
logging.info(f"R-squared (Test): {r2:.4f}")
logging.info(f"Mean Absolute Error (MAE): {mae:.2f}")
logging.info(f"Median Absolute Error: {median_ae:.2f}")
logging.info(f"Explained Variance Score: {explained_variance:.4f}")
logging.info(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# 4. Saving the Model Pipeline
joblib.dump(model_pipeline, 'apartments_xgb_model_log.joblib')
logging.info("Model pipeline saved as 'apartments_xgb_model_log.joblib'.")


2024-11-13 15:11:05,026 - INFO - Data loaded successfully.
2024-11-13 15:11:05,041 - INFO - Data split into training and testing sets.
2024-11-13 15:11:05,053 - INFO - Handled rare categories in 'state_building'.
2024-11-13 15:11:05,078 - INFO - Handled rare categories in 'zip_code'.
2024-11-13 15:11:05,091 - INFO - Handled rare categories in 'province'.
2024-11-13 15:11:05,104 - INFO - Handled rare categories in 'heating_type'.
2024-11-13 15:11:06,368 - INFO - Model training completed.
2024-11-13 15:11:06,424 - INFO - R-squared (Test): 0.7303
2024-11-13 15:11:06,426 - INFO - Mean Absolute Error (MAE): 0.13
2024-11-13 15:11:06,427 - INFO - Median Absolute Error: 0.10
2024-11-13 15:11:06,429 - INFO - Explained Variance Score: 0.7303
2024-11-13 15:11:06,430 - INFO - Mean Absolute Percentage Error (MAPE): 0.01%
2024-11-13 15:11:06,452 - INFO - Model pipeline saved as 'apartments_xgb_model_log.joblib'.
