In [4]:
import os
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# The project package is imported by its top-level name, so the directory two
# levels above the current working directory has to be on the import path.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from housing_price_prediction.scripts.preprocess_data import load_and_preprocess_data

# Unpack the four splits handed back by the project's preprocessing helper.
X_train, X_test, y_train, y_test = load_and_preprocess_data()

# Baseline random forest: 100 trees, seeded so repeated runs give the same fit.
# `fit` returns the estimator itself, so construction and training are chained.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for both splits from the single fitted model.
y_train_pred_rf = random_forest_model.predict(X_train)
y_test_pred_rf = random_forest_model.predict(X_test)

# MSE and R-squared on the training split, then on the held-out split; the gap
# between the two is what shows how much the forest overfits.
train_mse_rf, train_r2_rf = (
    mean_squared_error(y_train, y_train_pred_rf),
    r2_score(y_train, y_train_pred_rf),
)
test_mse_rf, test_r2_rf = (
    mean_squared_error(y_test, y_test_pred_rf),
    r2_score(y_test, y_test_pred_rf),
)

print(f"Random Forest - Training MSE: {train_mse_rf:.2f}, Training R-squared: {train_r2_rf:.2f}")
print(f"Random Forest - Test MSE: {test_mse_rf:.2f}, Test R-squared: {test_r2_rf:.2f}")

Random Forest - Training MSE: 154529367020.07, Training R-squared: 0.95
Random Forest - Test MSE: 1961585044320.34, Test R-squared: 0.61


In [6]:
import os
import sys

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Put the directory two levels above the working directory on the import path
# so the project package below resolves by its top-level name.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from housing_price_prediction.scripts.preprocess_data import load_and_preprocess_data

# Reload the splits so this cell does not depend on an earlier cell's variables.
X_train, X_test, y_train, y_test = load_and_preprocess_data()

# Gradient boosting baseline with its hyperparameters spelled out one per line.
gbm_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbm_model.fit(X_train, y_train)

# Training-split predictions and scores.
y_train_pred_gbm = gbm_model.predict(X_train)
train_r2_gbm = r2_score(y_train, y_train_pred_gbm)
train_mse_gbm = mean_squared_error(y_train, y_train_pred_gbm)

# Held-out predictions and scores.
y_test_pred_gbm = gbm_model.predict(X_test)
test_r2_gbm = r2_score(y_test, y_test_pred_gbm)
test_mse_gbm = mean_squared_error(y_test, y_test_pred_gbm)

print(f"GBM - Training MSE: {train_mse_gbm:.2f}, Training R-squared: {train_r2_gbm:.2f}")
print(f"GBM - Test MSE: {test_mse_gbm:.2f}, Test R-squared: {test_r2_gbm:.2f}")

GBM - Training MSE: 424108142514.51, Training R-squared: 0.86
GBM - Test MSE: 1693306118911.06, Test R-squared: 0.66


In [7]:
import os
import sys

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# The project package lives two directories above the working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from housing_price_prediction.scripts.preprocess_data import load_and_preprocess_data

X_train, X_test, y_train, y_test = load_and_preprocess_data()

# Search space: two candidate values for each of four hyperparameters, i.e.
# 16 combinations, each cross-validated on 3 folds of the training split.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1],
    'min_samples_split': [2, 4],
}

# Seeded base estimator; the grid search clones it for every candidate.
gbm = GradientBoostingRegressor(random_state=42)

# Candidates are ranked by mean R-squared across the folds. Only the training
# split is passed in, so the test split stays untouched by model selection.
grid_search = GridSearchCV(gbm, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.60


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
# The metric functions are used below; importing them here keeps the cell
# runnable on a fresh kernel instead of relying on an earlier cell's imports.
from sklearn.metrics import mean_squared_error, r2_score
import sys
import os

# Make the project package importable: it sits two directories above the
# current working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))


from housing_price_prediction.scripts.preprocess_data import load_and_preprocess_data
X_train, X_test, y_train, y_test = load_and_preprocess_data()

# Initialize the GBM model with the best parameters.
# These values are copied by hand from the grid search's printed
# "Best parameters" output; update them if the search is re-run with a
# different grid or different data.
gbm_best = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=3,
    min_samples_split=2,
    n_estimators=100,
    random_state=42
)

# Fit the model on the training data
gbm_best.fit(X_train, y_train)

# Predict and evaluate the model on the held-out test set
y_test_pred = gbm_best.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"GBM with Best Parameters - Test MSE: {test_mse:.2f}, Test R-squared: {test_r2:.2f}")

GBM with Best Parameters - Test MSE: 1765664723059.14, Test R-squared: 0.65


In [11]:
import numpy as np
import sys
import os

# The estimator and metric functions are used below; importing them here keeps
# the cell runnable on a fresh kernel instead of relying on names left behind
# by earlier cells.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Make the project package importable: it sits two directories above the
# current working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from housing_price_prediction.scripts.preprocess_data import load_and_preprocess_data

# Load the processed data
X_train, X_test, y_train, y_test = load_and_preprocess_data()
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Applying log transformation to the target.
# NOTE(review): np.log needs strictly positive targets; a zero value would
# become -inf and break the fit — confirm the preprocessing guarantees this.
y_train_log = np.log(y_train)
# Not used by the evaluation below, which scores on the original scale.
y_test_log = np.log(y_test)

# Fit the model on the log-transformed target
model.fit(X_train, y_train_log)

# Predict on the test set (predictions come out in log space)
y_test_pred_log = model.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)  # Transform back to the original scale

# Calculate MSE and R-squared on the original scale so the numbers are
# comparable with the models trained on the untransformed target
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Test MSE: {test_mse:.2f}, Test R-squared: {test_r2:.2f}")

Test MSE: 2043498756605.84, Test R-squared: 0.60
