In [1]:
# Import necessary libraries
import pandas as pd
import os
import sys
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer 

# Make the parent of the current working directory importable; the `utils`
# package imported below is expected to live there.
# The path is normalized to an absolute path and only appended when it is not
# already present, so re-running this cell does not stack duplicate
# `<cwd>/..` entries onto sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our custom data loading and model utility functions
from utils.data_loader import load_and_clean_data
from utils.model_utils import prepare_features, evaluate_model, train_and_evaluate_model

In [2]:
# Load the cleaned Munich dataset
df_munich = load_and_clean_data('munich')
print("\nMunich dataset loaded and ready for modeling.")

# Drop the raw date columns that are not used as model inputs.
# Reassigning (instead of inplace=True) keeps the cell idempotent, and
# errors='ignore' tolerates columns that are already absent.
df_munich = df_munich.drop(
    columns=['host_since', 'calendar_last_scraped', 'first_review', 'last_review'],
    errors='ignore',
)

# Prepare features (X) and target (y)
X, y = prepare_features(df_munich, target_column='price')

# Split the data into 80% training and 20% testing sets BEFORE imputing.
# Fitting the imputer on the full feature matrix would let statistics from the
# test rows leak into the training features and bias the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values (NaNs): learn the column means from the training rows
# only, then apply those same means everywhere.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_raw)


def _impute_frame(frame):
    """Return `frame` with NaNs filled by the fitted imputer, keeping labels.

    The original index and column labels are preserved so the features stay
    aligned with the target `y`, which keeps the index of the source frame.
    NOTE(review): SimpleImputer by default drops columns that were entirely
    NaN during fit, which would make `columns=` mismatch — confirm no such
    column exists in the training split.
    """
    return pd.DataFrame(
        imputer.transform(frame), columns=frame.columns, index=frame.index
    )


X_train = _impute_frame(X_train_raw)
X_test = _impute_frame(X_test_raw)
# Keep the full matrix NaN-free as well (filled with the training means) in
# case later cells use X directly.
X = _impute_frame(X)

print("\nMissing values in features (X) have been filled with the mean.")

print("\nData has been split into training and testing sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Loading cleaned data for Munich from processed directory...

Munich dataset loaded and ready for modeling.
Categorical features have been one-hot encoded.
Shape of features (X) after encoding: (4687, 4871)

Missing values in features (X) have been filled with the mean.

Data has been split into training and testing sets.
Training set shape: (3749, 4871)
Testing set shape: (938, 4871)


In [3]:
# Random Forest regressor: 100 trees, fixed seed for reproducibility,
# and all available CPU cores used for fitting.
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Fit on the training split and score on the held-out split via the shared
# helper, which hands back the predictions together with RMSE and R².
munich_rf_results = train_and_evaluate_model(
    random_forest_model,
    X_train,
    X_test,
    y_train,
    y_test,
)
y_pred_rf_munich, rmse_munich, r2_munich = munich_rf_results


Training RandomForestRegressor model...
Training complete.

--- Model Evaluation ---
Root Mean Squared Error (RMSE): 103.06
R-squared (R²): 0.47
