In [1]:
# Import necessary libraries
import pandas as pd
import os
import sys
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer 

# Make the parent of the current working directory importable; it is expected
# to contain the `utils` package imported below. The path is normalised to an
# absolute form and appended only when absent, so re-running this cell does not
# accumulate duplicate sys.path entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our custom data loading and model utility functions
from utils.data_loader import load_and_clean_data
from utils.model_utils import prepare_features, evaluate_model

In [2]:
# Load the cleaned Munich dataset
df_munich = load_and_clean_data('munich')
print("\nMunich dataset loaded and ready for modeling.")

# Drop columns that are not suitable for our baseline model.
# errors='ignore' keeps the cell working when some of these columns are already
# absent. drop() is used without inplace=True so the object returned by the
# loader is not mutated; df_munich is simply rebound to the reduced frame.
baseline_excluded_columns = ['host_since', 'calendar_last_scraped', 'first_review', 'last_review']
df_munich = df_munich.drop(columns=baseline_excluded_columns, errors='ignore')

# Prepare features (X) and target (y)
# NOTE(review): the recorded run reports 4871 encoded feature columns for only
# 4687 rows, which suggests high-cardinality columns are being one-hot encoded.
# Confirm which columns reach prepare_features before trusting the baseline.
X, y = prepare_features(df_munich, target_column='price')

Loading cleaned data for Munich from processed directory...

Munich dataset loaded and ready for modeling.
Categorical features have been one-hot encoded.
Shape of features (X) after encoding: (4687, 4871)


In [3]:
# Handle missing values (NaNs) before training the model.
print("\nChecking for missing values in features (X)...")
# Count NaNs per column once and reuse the result for the report.
missing_counts = X.isnull().sum()
print(missing_counts[missing_counts > 0])

# Replace every NaN with the mean of its column.
# fit_transform returns a bare NumPy array, so the original row index is passed
# back explicitly. Without it X would receive a fresh 0..n-1 index and could
# silently misalign with y in any later label-based operation.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test-set values contribute to the fill means. Consider fitting on the
# training rows only.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), index=X.index, columns=X.columns)

print("\nMissing values in features (X) have been filled with the mean.")


Checking for missing values in features (X)...
host_listings_count            185
host_total_listings_count      185
bathrooms                        3
bedrooms                         5
beds                             2
minimum_minimum_nights           2
maximum_minimum_nights           2
minimum_maximum_nights           2
maximum_maximum_nights           2
review_scores_rating           835
review_scores_accuracy         835
review_scores_cleanliness      835
review_scores_checkin          835
review_scores_communication    835
review_scores_location         835
review_scores_value            835
reviews_per_month              835
dtype: int64

Missing values in features (X) have been filled with the mean.


In [4]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes as a quick sanity check on the split.
print("\nData has been split into training and testing sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Data has been split into training and testing sets.
Training set shape: (3749, 4871)
Testing set shape: (938, 4871)


In [5]:
# Fit a Linear Regression baseline on the training split.
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train)

print("\nLinear Regression model training complete.")

# Predict the target for the held-out test rows
y_pred_linear = linear_model.predict(X_test)


Linear Regression model training complete.


In [6]:
# Evaluate the model: pass the test-set targets and the linear model's
# predictions to the project's evaluate_model helper (utils.model_utils).
# NOTE(review): the recorded run reports R² of about 0 (RMSE 142.19), i.e. no
# better than always predicting the mean target. Treat this baseline as a
# warning sign about the feature set rather than a usable model.
evaluate_model(y_test, y_pred_linear)


--- Model Evaluation ---
Root Mean Squared Error (RMSE): 142.19
R-squared (R²): -0.00
