In [1]:
# Import necessary libraries
import pandas as pd
import os
import sys
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor # Newly added model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer 

# Append the parent of the current working directory to the module search path.
# This must run before the `utils` imports below; presumably the `utils` package
# lives in that parent directory (i.e. the kernel is started from a subfolder of
# the project root) -- TODO confirm, since os.getcwd() depends on how the kernel was launched.
sys.path.append(os.path.join(os.getcwd(), '..'))

# Import our custom data loading and model utility functions
from utils.data_loader import load_and_clean_data
from utils.model_utils import prepare_features, evaluate_model

In [2]:
# Load the cleaned Berlin dataset
df_berlin = load_and_clean_data('berlin')
print("\nBerlin dataset loaded and ready for modeling.")

# Drop columns that are not suitable for our baseline model.
# errors='ignore' skips any listed column that is absent, so re-running this cell is safe.
df_berlin.drop(columns=['host_since', 'calendar_last_scraped', 'first_review', 'last_review'], errors='ignore', inplace=True)

# Prepare features (X) and target (y)
X, y = prepare_features(df_berlin, target_column='price')

# Split the data into 80% training and 20% testing sets BEFORE imputing.
# Fitting the imputer on the full matrix would let test-set values influence the
# fill-in means (data leakage) and make the reported test metrics optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values (NaNs): learn the per-column means from the training rows only
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)

# With its default settings SimpleImputer drops any column that had no observed
# value during fit (its learned statistic is NaN). Keep only the surviving column
# names so the rebuilt DataFrames cannot hit a column-count mismatch.
kept_columns = X.columns[~np.isnan(imputer.statistics_)]

# Rebuild DataFrames with the original row labels so features stay aligned with y
X_train = pd.DataFrame(imputer.transform(X_train), columns=kept_columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=kept_columns, index=X_test.index)

# Keep the full matrix NaN-free as well (filled with the training means) for any
# later cell that still uses X directly
X = pd.DataFrame(imputer.transform(X), columns=kept_columns, index=X.index)

print("\nMissing values in features (X) have been filled with the mean.")

print("\nData has been split into training and testing sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Loading cleaned data for Berlin from processed directory...

Berlin dataset loaded and ready for modeling.
Categorical features have been one-hot encoded.
Shape of features (X) after encoding: (9135, 8936)

Missing values in features (X) have been filled with the mean.

Data has been split into training and testing sets.
Training set shape: (7308, 8936)
Testing set shape: (1827, 8936)


In [3]:
# Build and fit a Random Forest Regressor on the training split.
#   n_estimators=100 -> number of trees in the forest
#   n_jobs=-1        -> train the trees in parallel on all available cores
#   random_state=42  -> fixed seed so repeated runs build the same forest
# fit() returns the fitted estimator, so construction and training are chained.
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

print("\nRandom Forest model training complete.")

# Predict on the held-out test split, then hand the true and predicted values
# to the shared evaluation helper
y_pred_rf = random_forest_model.predict(X_test)
evaluate_model(y_test, y_pred_rf)


Random Forest model training complete.

--- Model Evaluation ---
Root Mean Squared Error (RMSE): 48.90
R-squared (R²): 0.73
