Importing all the packages needed

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Housing Dataset

Reading in the file

In [7]:
# Location of the housing CSV. Set the HOUSING_DATA_PATH environment variable to
# point at a copy of the file on another machine; when it is not set, the
# original hard-coded path is used, so existing runs behave as before.
HOUSING_DATA_PATH = os.environ.get(
    "HOUSING_DATA_PATH",
    "/Users/rox/Desktop/Team12_STA160_Website/housingvars.csv",
)
housingdata = pd.read_csv(HOUSING_DATA_PATH)

Filtering to the seven counties of interest and splitting each county's data into training/test sets (80% / 20%)

In [8]:
# Counties that get their own train/test split.
counties = [
    "Los Angeles",
    "Fresno",
    "San Diego",
    "Sacramento",
    "San Francisco",
    "Santa Clara",
    "Alameda",
]

# Keep only rows from those counties, then discard City/State when they exist.
housingdata = (
    housingdata.loc[housingdata["County"].isin(counties)]
    .copy()
    .drop(columns=['City', 'State'], errors='ignore')
)

for county in counties:
    # Rows for this county only.
    county_data = housingdata.loc[housingdata['County'] == county]

    # 80/20 split with a fixed seed. Both halves are full row subsets of
    # county_data, so they still contain every column.
    X_train, X_test = train_test_split(county_data, test_size=0.2, random_state=42)

    # Publish each half as a module-level variable, e.g. "Los_Angeles_train".
    key = county.replace(' ', '_')
    train_var_name, test_var_name = f"{key}_train", f"{key}_test"
    globals()[train_var_name] = X_train.reset_index(drop=True)
    globals()[test_var_name] = X_test.reset_index(drop=True)

# --------------------------
# Row counts per county
# --------------------------
print(f"{'County':<15} {'Train Rows':<12} {'Test Rows':<12}")
print("-" * 40)

for county in counties:
    key = county.replace(' ', '_')
    train_rows = len(globals()[f"{key}_train"])
    test_rows = len(globals()[f"{key}_test"])
    print(f"{county:<15} {train_rows:<12} {test_rows:<12}")

County          Train Rows   Test Rows   
----------------------------------------
Los Angeles     1260         315         
Fresno          816          205         
San Diego       807          202         
Sacramento      799          200         
San Francisco   679          170         
Santa Clara     637          160         
Alameda         546          137         


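Cleaning the data: for San Francisco we drop the `Population` column and fill missing numeric values with 0; for the other counties we drop rows with missing values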

In [9]:
# --------------------------
# San Francisco: drop the problematic 'Population' column and zero-fill numeric NAs
# --------------------------
# errors='ignore' keeps this cell re-runnable: after the first run the stored
# San Francisco frames no longer have 'Population', and a plain drop() would
# raise KeyError on the second run.
sf_train = globals()['San_Francisco_train'].drop(columns=['Population'], errors='ignore')
sf_test  = globals()['San_Francisco_test'].drop(columns=['Population'], errors='ignore')

# Fill NAs only in numeric columns; non-numeric columns are left as they are.
# NOTE(review): this covers every numeric column, so a missing 'Price' (if any
# exist) would become 0 and be used as a target - confirm that is intended.
numeric_cols_train = sf_train.select_dtypes(include='number').columns
numeric_cols_test  = sf_test.select_dtypes(include='number').columns

sf_train[numeric_cols_train] = sf_train[numeric_cols_train].fillna(0)
sf_test[numeric_cols_test]   = sf_test[numeric_cols_test].fillna(0)

# Update global variables
globals()['San_Francisco_train'] = sf_train
globals()['San_Francisco_test']  = sf_test

# --------------------------
# Handle other counties: drop every row that has any missing value
# --------------------------
# Derived from `counties` so the two lists cannot drift apart.
other_counties = [name.replace(' ', '_') for name in counties if name != "San Francisco"]

for county in other_counties:
    globals()[f"{county}_train"] = globals()[f"{county}_train"].dropna()
    globals()[f"{county}_test"]  = globals()[f"{county}_test"].dropna()

# Row counts after cleaning
print(f"{'County':<15} {'Train Rows':<12} {'Test Rows':<12}")
print("-" * 40)

for county in counties:
    train_rows = globals()[f"{county.replace(' ', '_')}_train"].shape[0]
    test_rows  = globals()[f"{county.replace(' ', '_')}_test"].shape[0]
    print(f"{county:<15} {train_rows:<12} {test_rows:<12}")

County          Train Rows   Test Rows   
----------------------------------------
Los Angeles     1111         291         
Fresno          710          177         
San Diego       675          167         
Sacramento      676          167         
San Francisco   679          170         
Santa Clara     632          157         
Alameda         545          136         


In [10]:
def evaluate_county_models(model, counties, globals_dict,
                           feature_func=None, log_target=False, winsor_q=0.99,
                           winsorize_test=True):
    """Fit one clone of `model` per county and report test-set RMSE and R2.

    For each county the frames ``"<County_Name>_train"`` and
    ``"<County_Name>_test"`` (spaces replaced by underscores) are looked up in
    `globals_dict`. Features are the float64/int64 columns of the training
    frame other than 'Price' and 'County'; the target is 'Price'. One table
    row is printed per county.

    Parameters
    ----------
    model : estimator
        Unfitted estimator; it is passed through `clone` for every county, so
        the instance given here is never fitted.
    counties : iterable of str
        County names as they should appear in the printed table.
    globals_dict : mapping
        Maps ``"<key>_train"`` / ``"<key>_test"`` to DataFrames.
    feature_func : callable, optional
        Applied to the training and test feature frames separately before
        fitting. Its result must still expose ``.columns`` when `model` is a
        RandomForestRegressor.
    log_target : bool, default False
        When True, the training target is clipped from above at its `winsor_q`
        quantile and transformed with ``log1p``; predictions are mapped back
        with ``expm1`` so metrics are always on the price scale.
    winsor_q : float, default 0.99
        Upper quantile used for clipping when `log_target` is True.
    winsorize_test : bool, default True
        Only used when `log_target` is True. True (the historical behaviour)
        also clips the test target at the test set's own `winsor_q` quantile
        before scoring, which hides errors on the most expensive homes. Pass
        False to score against the untouched test prices.

    Returns
    -------
    (dict, dict)
        ``county_models`` maps county -> fitted estimator, and
        ``county_metrics`` maps county -> ``{'RMSE', 'R2', 'Extra'}``.

    Raises
    ------
    KeyError
        If a train/test frame is missing from `globals_dict`, or a frame has
        no 'Price' column.
    """
    county_models = {}
    county_metrics = {}

    print(f"{'County':<15} {'RMSE':<12} {'R2':<10} {'Extra':<10}")
    print("-"*55)

    for county in counties:
        county_key = county.replace(" ", "_")
        train_df = globals_dict[f"{county_key}_train"].copy()
        test_df  = globals_dict[f"{county_key}_test"].copy()

        X_train = train_df.drop(columns=['Price', 'County'], errors='ignore')\
                          .select_dtypes(include=['float64','int64'])
        # Explicit copy so a feature_func that assigns columns does not operate
        # on a slice of test_df.
        X_test = test_df[X_train.columns].copy()

        # Apply optional feature engineering
        if feature_func:
            X_train = feature_func(X_train)
            X_test  = feature_func(X_test)

        y_train = train_df['Price'].values
        y_test  = test_df['Price'].values

        # Winsorize (upper clip) + log-transform the training target if requested.
        if log_target:
            y_train = np.log1p(np.clip(y_train, None, np.quantile(y_train, winsor_q)))
            # The test target stays on the price scale; it is only clipped when
            # the caller keeps the historical winsorized evaluation.
            if winsorize_test:
                y_test = np.clip(y_test, None, np.quantile(y_test, winsor_q))

        # Clone and fit model
        county_model = clone(model)
        county_model.fit(X_train, y_train)
        y_pred = county_model.predict(X_test)

        # The model was fitted on log1p(price), so map predictions back to prices.
        if log_target:
            y_pred = np.expm1(y_pred)

        rmse = root_mean_squared_error(y_test, y_pred)
        r2   = r2_score(y_test, y_pred)

        # Extra info for RF
        extra_info = ""
        if isinstance(county_model, RandomForestRegressor):
            oob_r2 = getattr(county_model, 'oob_score_', None)
            # Compare against None: an OOB score of exactly 0.0 is a real value.
            extra_info = f"OOB={oob_r2:.4f}" if oob_r2 is not None else "N/A"
            # Top 5 features
            if hasattr(county_model, 'feature_importances_'):
                fi = pd.Series(county_model.feature_importances_, index=X_train.columns)\
                        .sort_values(ascending=False)
                top5 = ", ".join(fi.head(5).index.tolist())
                extra_info += f" | top: {top5}"

        county_models[county] = county_model
        county_metrics[county] = {'RMSE': rmse, 'R2': r2, 'Extra': extra_info}

        print(f"{county:<15} {rmse:<12.2f} {r2:<10.4f} {extra_info:<10}")

    return county_models, county_metrics

**Best-performing Random Forest model:**

In [11]:
# Base estimator: 200 trees, out-of-bag scoring enabled, fixed seed.
# evaluate_county_models clones it for every county.
rf_model = RandomForestRegressor(
    n_estimators=200,
    oob_score=True,
    random_state=42,
)

# One model per county, trained on the clipped, log-transformed price target
# with no extra feature engineering.
county_rf_models, county_rf_metrics = evaluate_county_models(
    model=rf_model,
    counties=counties,
    globals_dict=globals(),
    feature_func=None,
    log_target=True,
)

County          RMSE         R2         Extra     
-------------------------------------------------------
Los Angeles     504704.68    0.8613     OOB=0.8044 | top: Living Space, Longitude, Latitude, City Average Household Income, Beds
Fresno          117050.50    0.6061     OOB=0.6351 | top: Living Space, Latitude, Beds, Longitude, Baths
San Diego       470626.17    0.7593     OOB=0.8181 | top: Living Space, Longitude, Latitude, Beds, Baths
Sacramento      149599.82    0.6830     OOB=0.5807 | top: Living Space, Latitude, Longitude, Beds, Baths
San Francisco   618900.33    0.7057     OOB=0.7126 | top: Living Space, Latitude, Baths, Longitude, Beds
Santa Clara     357813.97    0.6728     OOB=0.6512 | top: Living Space, Longitude, Latitude, Beds, Baths
Alameda         238611.22    0.6799     OOB=0.7594 | top: Living Space, Latitude, Longitude, Beds, Baths


The XGBoost and Random Forest models perform similarly on the housing data. Because the Random Forest model predicts more accurately in expensive markets such as Los Angeles and San Francisco, we will use it as our final model.