In [75]:
import pandas as pd
import numpy as np

# Display options (purely for readability)
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)


In [76]:
# Load the full dataset (uses all 300k+ rows)
df = pd.read_csv("../data/realtor-data.csv")

In [77]:
df.info()        # column types & null counts
df.describe(include=[np.number])
df.isna().mean().sort_values(ascending=False).head(10)  # top missing columns (fraction)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   brokered_by     float64
 1   status          object 
 2   price           float64
 3   bed             float64
 4   bath            float64
 5   acre_lot        float64
 6   street          float64
 7   city            object 
 8   state           object 
 9   zip_code        float64
 10  house_size      float64
 11  prev_sold_date  object 
dtypes: float64(8), object(4)
memory usage: 203.8+ MB


prev_sold_date    0.329816
house_size        0.255340
bath              0.229867
bed               0.216188
acre_lot          0.146241
street            0.004881
brokered_by       0.002036
price             0.000692
city              0.000632
zip_code          0.000134
dtype: float64

In [78]:
# keep only needed columns
keep = ["price", "bed", "bath", "house_size", "acre_lot", "zip_code", "state"]
df = df[keep].copy()

# coerce numerics
for col in ["price", "bed", "bath", "house_size", "acre_lot"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# drop rows missing target/core features
df = df.dropna(subset=["price", "house_size"])

# fill small gaps
df["bed"] = df["bed"].fillna(df["bed"].median())
df["bath"] = df["bath"].fillna(df["bath"].median())
df["acre_lot"] = df["acre_lot"].fillna(df["acre_lot"].median())

# normalize ZIP to digits-as-string
df["zip_code"] = df["zip_code"].astype(str).str.extract(r"(\d+)")
df["zip_code"] = df["zip_code"].fillna("unknown")

df[df["state"] == "California"]["zip_code"].nunique()

# ✅ filter to California only
df = df[df["state"] == "California"].copy()

# basic sanity filters to remove obvious junk
df = df[(df["price"] > 1000) & (df["house_size"] > 200)]

df.shape, df.head()


((197534, 7),
                 price  bed  bath  house_size  acre_lot zip_code       state
 160666   3.000000e+06  7.0   6.0      6889.0      0.21        0  California
 221994   2.147484e+09  2.0   2.0       885.0      0.12  unknown  California
 1208977  1.999000e+05  3.0   1.0      1014.0      0.18    92225  California
 1208998  1.729990e+05  3.0   2.0      1132.0      0.16    92225  California
 1209109  7.990000e+04  4.0   2.0      1272.0      0.16    92225  California)

In [79]:
# one-hot encode ZIP only (state is constant now)
df_enc = pd.get_dummies(df, columns=["zip_code"], drop_first=True)

# features (X) and target (y)
X = df_enc.drop(columns=["price", "state"])   # state is constant 'California'
y = df_enc["price"]

X.shape


(197534, 1667)

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

X_train.shape, X_test.shape


((158027, 1667), (39507, 1667))

In [81]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score

model = Ridge(alpha=1.0)  # try 0.1, 1.0, 10.0 for tuning
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("MAE: ${:,.0f}".format(mean_absolute_error(y_test, y_pred)))
print("R²:", round(r2_score(y_test, y_pred), 4))


MAE: $389,607
R²: 0.4577


In [82]:
def predict_price_zip(bed, bath, sqft, acre_lot, zip_code, model=model):
    # base numeric features row
    row = pd.DataFrame([{
        "bed": bed, "bath": bath, "house_size": sqft, "acre_lot": acre_lot
    }])

    # align columns with training X
    row = row.reindex(columns=X.columns, fill_value=0)

    # flip the correct ZIP one-hot flag if it exists
    col = f"zip_code_{zip_code}"
    if col in row.columns:
        row[col] = 1
    else:
        # fallback: no exact ZIP in training → leave all ZIP dummies at 0
        # (model will behave like "average ZIP" within CA)
        pass

    return float(model.predict(row)[0])

# example: San Ramon 94582 (change as needed)
pred = predict_price_zip(bed=4, bath=2, sqft=2513, acre_lot=0.15, zip_code="94583")
print(f"Predicted price (94582): ${pred:,.0f}")


Predicted price (94582): $1,566,561


In [83]:
def renovation_roi_zip(bed, bath, sqft, acre_lot, zip_code, add_sqft=400, capex_per_sqft=300):
    base = predict_price_zip(bed, bath, sqft, acre_lot, zip_code)
    after = predict_price_zip(bed, bath, sqft + add_sqft, acre_lot, zip_code)
    capex = add_sqft * capex_per_sqft
    roi_gain = after - base
    roi_pct = (roi_gain - capex) / capex if capex > 0 else np.nan
    return {
        "base_price": base,
        "after_price": after,
        "value_uplift": roi_gain,
        "capex": capex,
        "roi_percent": roi_pct
    }

roi = renovation_roi_zip(3, 2, 1450, 0.10, "94582", add_sqft=400, capex_per_sqft=350)
roi


{'base_price': 1065121.9166157863,
 'after_price': 1279873.4704744015,
 'value_uplift': 214751.55385861523,
 'capex': 140000,
 'roi_percent': 0.5339396704186802}

In [84]:
import joblib
joblib.dump(model, "../ml/model_ca_zip_ridge.joblib")
# joblib.dump(gbr, "../ml/model_ca_zip_hgbr.joblib")  # if you trained HGBR

['../ml/model_ca_zip_ridge.joblib']