# **Black-box Models**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('./filtered_df.csv')

## Remove Outliers

In [6]:
def remove_outliers(df, numerical_columns):
    Q1 = df[numerical_columns].quantile(0.25)
    Q3 = df[numerical_columns].quantile(0.75)
    IQR = Q3 - Q1
    
    # Filter out outliers by removing rows that have any value outside of the IQR range
    df_cleaned = df[~((df[numerical_columns] < (Q1 - 1.5 * IQR)) | (df[numerical_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df_cleaned

In [10]:
numerical_columns = [ 'Parking','TotalArea', 'LivingArea', 'NumberOfBathrooms','Price']
# Remove outliers
df_cleaned_remove_outliers = remove_outliers(df, numerical_columns)

## **Black-box Model** : Random Forest Regressor

In [11]:
X = df.drop(["Price"],axis=1)
y = df['Price']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(max_depth=30,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=300, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Feature Importance Visualization
importances = rf.feature_importances_
feature_names = X.columns

# Plotting feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances, color='skyblue')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance in Random Forest Regressor')
plt.show()

ValueError: could not convert string to float: 'Lisboa'

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100, 200,300],        # Number of trees in the forest
    'max_depth': [5, 10, 20, 30],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider when looking for the best split
}

# Setting up the GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Optimization metric
    n_jobs=-1,  # Use all available cores
    verbose=2  # Output progress
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)

# Metrics for the best model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Model Mean Squared Error:", mse)
print("Best Model R2 Score:", r2)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 

## **Black-box Model** : Neural Network Model

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Improved Feedforward Neural Network Model
model = Sequential([
    # First dense layer with 128 units and ReLU activation
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.2),
    
    # Second dense layer with 64 units and ReLU activation
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    
    # Third dense layer with 32 units and ReLU activation
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    
    # Fully connected layer to combine features
    Dense(16, activation='relu'),
    Dropout(0.2),
    
    # Output layer: predicting house price (regression task)
    Dense(1)  # No activation function here for regression
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50)

y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

ModuleNotFoundError: No module named 'tensorflow'