In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

#START: Liz Choi

In [8]:
# Load the pre-cleaned splits produced by the upstream processing step.
# Paths are relative to the notebook's working directory.
train = pd.read_csv('../processed_data/train_cleaned.csv')
test = pd.read_csv('../processed_data/test_cleaned.csv')

Target Transformation
Dependent Variable: LogClosePrice
Natural log applied to ClosePrice
Reduces right skew
Improves model stability
Helps satisfy regression assumptions

In [9]:
# The natural log is only defined for strictly positive prices. Fail loudly
# here instead of letting -inf / NaN targets leak into model fitting and
# scoring further down (a NaN price also fails this check).
if not ((train['ClosePrice'] > 0).all() and (test['ClosePrice'] > 0).all()):
    raise ValueError(
        "ClosePrice must be strictly positive in both train and test "
        "before applying the log transform"
    )

# Add the log-transformed target as a new column on each split.
train['LogClosePrice'] = np.log(train['ClosePrice'])
test['LogClosePrice'] = np.log(test['ClosePrice'])

Feature Engineering
Predictors
All variables except:
ClosePrice
LogClosePrice
Encoding
Categorical variables → One-hot encoding
drop_first=True prevents multicollinearity
Alignment
Test columns reindexed to match training columns.

In [10]:
# Every column except the raw and log-transformed targets is a predictor.
features = [c for c in train.columns if c not in ['ClosePrice', 'LogClosePrice']]

# One-hot encode the training predictors; drop_first removes one reference
# level per categorical column. The resulting columns define the model schema.
X_train = pd.get_dummies(train[features], drop_first=True)
y_train = train['LogClosePrice']

# Encode test WITHOUT drop_first and then project onto the training columns.
# get_dummies picks the level to drop from the data it is given, so applying
# drop_first to each split independently can drop a different level in test
# than in train. When test lacks train's first level, a level train kept gets
# dropped in test and is then zero-filled by the reindex, silently relabelling
# those rows as the training reference level. Reindexing the full set of test
# dummies discards the training reference level and any unseen levels, and
# zero-fills levels that are absent from test.
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)
y_test = test['LogClosePrice']

Random Forest Model Training
Model: RandomForestRegressor
Parameters:
500 trees
√(features) sampled at each split
Fixed random seed
Parallel processing enabled
Model learns non-linear relationships and feature interactions.

In [8]:
# Hyperparameters for the log-target forest: 500 trees, 'sqrt' feature
# sampling, a fixed seed, and n_jobs=-1 for parallel fitting.
rf_params = {
    'n_estimators': 500,
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)

# Fit on the log-price target; left as the last expression so the cell
# still displays the fitted estimator.
rf_model.fit(X_train, y_train)

R² Evaluation (Log Scale)
Measures variance explained in:
Training set
Test set
Higher R² → stronger predictive power.

In [10]:
# --- Training split: predict in log space and score against the log target ---
y_train_pred_log = rf_model.predict(X_train)
train_r2_log = r2_score(y_true=y_train, y_pred=y_train_pred_log)

# --- Test split: same procedure on held-out data ---
y_test_pred_log = rf_model.predict(X_test)
test_r2_log = r2_score(y_true=y_test, y_pred=y_test_pred_log)

print(f"Random Forest Train R^2 (log scale): {train_r2_log:.4f}")
print(f"Random Forest Test R^2 (log scale): {test_r2_log:.4f}")

Random Forest Train R^2 (log scale): 0.9821
Random Forest Test R^2 (log scale): 0.8659


Predictions converted back to dollars.
This reflects real-world explanatory power on actual house prices.

In [12]:
# Invert the natural-log transform so both targets and predictions are
# expressed in dollars again.
y_train_orig, y_train_pred_orig = np.exp(y_train), np.exp(y_train_pred_log)
y_test_orig, y_test_pred_orig = np.exp(y_test), np.exp(y_test_pred_log)

# R^2 of the back-transformed predictions against the dollar-scale targets.
train_r2_orig = r2_score(y_true=y_train_orig, y_pred=y_train_pred_orig)
test_r2_orig = r2_score(y_true=y_test_orig, y_pred=y_test_pred_orig)

print(f"Random Forest Train R^2 (original dollars): {train_r2_orig:.4f}")
print(f"Random Forest Test R^2 (original dollars): {test_r2_orig:.4f}")

Random Forest Train R^2 (original dollars): 0.9534
Random Forest Test R^2 (original dollars): 0.7579


Percentage-Based Error Metrics
MAPE → Average percentage error
MdAPE → Median percentage error
MdAPE is more robust to extreme price values.

In [23]:
def percentage_error(y_true, y_pred):
    """Signed percentage error of each prediction relative to its true value.

    Computes ``(y_true - y_pred) / y_true * 100`` element-wise, so a positive
    value means the prediction was below the true value.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must have the same shape as ``y_true``.

    Returns
    -------
    numpy.ndarray
        Percentage errors for the entries whose true value is non-zero.
        Entries with ``y_true == 0`` are dropped because the ratio is
        undefined there.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check a (n,) vs (n, 1) pair would broadcast into an (n, n)
    # matrix of meaningless cross-errors instead of failing.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    mask = y_true != 0
    return (y_true[mask] - y_pred[mask]) / y_true[mask] * 100

def mean_abs_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Delegates to ``percentage_error`` for the signed per-row errors, takes
    their absolute values and averages them.
    """
    abs_pct_errors = np.abs(percentage_error(y_true, y_pred))
    return np.mean(abs_pct_errors)

def median_abs_percentage_error(y_true, y_pred):
    """Return the median absolute percentage error (MdAPE), in percent.

    Delegates to ``percentage_error`` for the signed per-row errors, takes
    their absolute values and returns the median.
    """
    abs_pct_errors = np.abs(percentage_error(y_true, y_pred))
    return np.median(abs_pct_errors)

Prediction Error Summary
MAPE → Average percent deviation
MdAPE → Typical percent deviation
Lower values indicate better accuracy.

In [16]:
# Apply both percentage-error metrics to each split, using the dollar-scale
# targets and predictions.
train_mape, train_mdape = (
    metric(y_train_orig, y_train_pred_orig)
    for metric in (mean_abs_percentage_error, median_abs_percentage_error)
)
test_mape, test_mdape = (
    metric(y_test_orig, y_test_pred_orig)
    for metric in (mean_abs_percentage_error, median_abs_percentage_error)
)

print(f"Train MAPE (%): {train_mape:.2f}, MdAPE (%): {train_mdape:.2f}")
print(f"Test MAPE (%): {test_mape:.2f}, MdAPE (%): {test_mdape:.2f}")

Train MAPE (%): 5.72, MdAPE (%): 3.83
Test MAPE (%): 15.98, MdAPE (%): 10.43


Model Results Table
This table summarizes:
Log-scale performance
Dollar-scale performance
Percentage-based error metrics
Used for comparison against baseline Linear Regression.

In [18]:
# Single summary record for the log-target forest; the dict keys become the
# column headers of the one-row results table.
rf_summary_row = {
    'Model': 'Random Forest Regressor',
    'Dependent Variable': 'LogClosePrice',
    'Feature(s)': 'All features (one-hot encoded)',
    'Train R^2 (log scale)': train_r2_log,
    'Test R^2 (log scale)': test_r2_log,
    'Train R^2 (original $)': train_r2_orig,
    'Test R^2 (original $)': test_r2_orig,
    'Train MAPE (%)': train_mape,
    'Test MAPE (%)': test_mape,
    'Train MdAPE (%)': train_mdape,
    'Test MdAPE (%)': test_mdape,
}
rf_results = pd.DataFrame([rf_summary_row])

rf_results

Unnamed: 0,Model,Dependent Variable,Feature(s),Train R^2 (log scale),Test R^2 (log scale),Train R^2 (original $),Test R^2 (original $),Train MAPE (%),Test MAPE (%),Train MdAPE (%),Test MdAPE (%)
0,Random Forest Regressor,LogClosePrice,All features (one-hot encoded),0.982088,0.865883,0.95344,0.757937,5.719985,15.979177,3.825515,10.428067


#END: Liz Choi

#START: Vivian Lin

In [17]:
# One-hot encode the training predictors; drop_first removes one reference
# level per categorical column. The resulting columns define the model schema.
X_train = pd.get_dummies(train[features], drop_first=True)
y_train_log = train['LogClosePrice']
y_train_obs = train['ClosePrice']

# Encode test WITHOUT drop_first and then project onto the training columns.
# Applying drop_first to each split independently lets get_dummies drop a
# different level in test than in train whenever test lacks train's first
# level; the reindex would then zero-fill a level train kept and silently
# relabel those rows as the training reference level.
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)
y_test_log = test['LogClosePrice']
y_test_obs = test['ClosePrice']

### Using ClosePrice as Dependent Variable

In [19]:
# 5 MINS TO RUN
rf_model_obs = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    max_features='sqrt'
)
rf_model_obs.fit(X_train, y_train_obs)

In [25]:
# Training Evaluations
y_train_obs_pred = rf_model_obs.predict(X_train)

train_r2_score = r2_score(y_train_obs, y_train_obs_pred) #R^2 Score
print(f'Train R^2: {train_r2_score}')

train_mape_obs = mean_abs_percentage_error(y_train_obs, y_train_obs_pred) #MAPE
print(f'Train MAPE (%): {train_mape_obs}')

train_mdape_obs = median_abs_percentage_error(y_train_obs, y_train_obs_pred) #MDAPE
print(f"Train MDAPE (%): {train_mdape_obs}")

Train R^2: 0.9756420311472179
Train MAPE (%): 6.700209268605046
Train MDAPE (%): 4.083744690781802


In [26]:
# Test Evaluations
y_test_obs_pred = rf_model_obs.predict(X_test)

test_r2_score = r2_score(y_test_obs, y_test_obs_pred) #R^2 Score
print(f'Test R^2: {test_r2_score}')

test_mape_obs = mean_abs_percentage_error(y_test_obs, y_test_obs_pred) #MAPE
print(f'Test MAPE: {test_mape_obs}')

test_mdape_obs = median_abs_percentage_error(y_test_obs, y_test_obs_pred) #MDAPE
print(f'Test MDAPE: {test_mdape_obs}')

Test R^2: 0.8120750057132997
Test MAPE: 18.377775859845272
Test MDAPE: 11.097775694956958


In [27]:
# Single summary record for the forest trained directly on ClosePrice; the
# dict keys become the column headers of the one-row results table.
rf_obs_summary_row = {
    'Model': 'Random Forest Regressor',
    'Dependent Variable': 'ClosePrice',
    'Feature(s)': 'All features (one-hot encoded)',
    'Train R^2 (original $)': train_r2_score,
    'Test R^2 (original $)': test_r2_score,
    'Train MAPE (%)': train_mape_obs,
    'Test MAPE (%)': test_mape_obs,
    'Train MdAPE (%)': train_mdape_obs,
    'Test MdAPE (%)': test_mdape_obs,
}
rf_results_obs = pd.DataFrame([rf_obs_summary_row])

rf_results_obs

Unnamed: 0,Model,Dependent Variable,Feature(s),Train R^2 (original $),Test R^2 (original $),Train MAPE (%),Test MAPE (%),Train MdAPE (%),Test MdAPE (%)
0,Random Forest Regressor,ClosePrice,All features (one-hot encoded),0.975642,0.812075,6.700209,18.377776,4.083745,11.097776


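| Metric | Baseline Train | Baseline Test | RF Train | RF Test |
|--------|-------|------|------|------|
| **R² (log scale)** | 0.762 | 0.727 | 0.982088 | 0.865883 |
| **R² (original $)** | 0.632 | 0.590 | 0.975642 | 0.812075 |
| **MAPE (%)** | 22.16 | 23.90 | 5.719985 | 15.979177 |
| **MdAPE (%)** | 15.91 | 17.08 | 3.825515 | 10.428067 |

*Note: the RF values in the R² (original $) row come from the forest trained directly on `ClosePrice`. The other RF rows come from the forest trained on `LogClosePrice`, whose back-transformed R² (original $) is 0.953440 (train) / 0.757937 (test).*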

## Interpretation

- **R² (log scale):** Indicates how well the model explains variance in the log-transformed target.  
  - Train: ~98% of variance explained.  
  - Test: ~86% of variance explained, showing good generalization.

- **R² (original $):** Explains variance in actual house prices (dollars).  
  - Train: ~97.5% of variance explained.  
  - Test: ~81% of variance explained.

- **MAPE (Mean Absolute Percentage Error):** Measures the average percentage error of predictions.  
  - Train: ~5.7% deviation on average.  
  - Test: ~16% deviation on average.  
  - Sensitive to outliers; a MAPE well above the MdAPE (as seen here) indicates that a minority of homes have very large percentage errors.

- **MdAPE (Median Absolute Percentage Error):** Median percentage error, more robust to extreme prices.  
  - Train: ~3.8% typical deviation.  
  - Test: ~10% typical deviation.  
  - Gives a better sense of the “typical” and realistic prediction error.

The model learned from the training data and generalized fairly well to the test set, giving a test $R^2$ of 0.87 on the log scale, but it shows signs of slight overfitting, as indicated by the train–test gap of about 0.12 (0.982 vs. 0.866). The Random Forest regressor performs significantly better than the baseline linear regression models.

#END: Vivian Lin