# **Black-box Models**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('./filtered_df.csv')

## Remove Outliers

In [6]:
def remove_outliers(df, numerical_columns):
    Q1 = df[numerical_columns].quantile(0.25)
    Q3 = df[numerical_columns].quantile(0.75)
    IQR = Q3 - Q1
    
    # Filter out outliers by removing rows that have any value outside of the IQR range
    df_cleaned = df[~((df[numerical_columns] < (Q1 - 1.5 * IQR)) | (df[numerical_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_cleaned

In [10]:
numerical_columns = [ 'Parking','TotalArea', 'LivingArea', 'NumberOfBathrooms','Price']
# Remove outliers
df_cleaned_remove_outliers = remove_outliers(df, numerical_columns)

## **Black-box Model** : Random Forest Regressor

In [11]:
X = df.drop(["Price"],axis=1)
y = df['Price']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(max_depth=30,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=300, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Feature Importance Visualization
importances = rf.feature_importances_
feature_names = X.columns

# Plotting feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances, color='skyblue')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance in Random Forest Regressor')
plt.show()

ValueError: could not convert string to float: 'Lisboa'