In [1]:
import pandas as pd
import glob

data_dir = "../ml/data/"

listings_path = sorted(glob.glob(data_dir + "clean_sales_listings_*.csv"))[-1]
listings = pd.read_csv(listings_path)

print("Loaded listings:", listings.shape)


Loaded listings: (13428, 28)


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Ensure lowercase column names
listings.columns = [c.lower() for c in listings.columns]

# Parse dates
for c in ["listed_date", "removed_date", "created_date", "last_seen_ts"]:
    if c in listings.columns:
        listings[c] = pd.to_datetime(listings[c], errors="coerce")

df = listings.copy()

# Fix invalid values
for col in ["square_footage", "lot_size", "current_price", "hoa_fee",
            "price_per_sq_ft", "bedrooms", "bathrooms"]:
    if col in df.columns:
        df.loc[df[col] <= 0, col] = np.nan

# Target
df = df[df["days_on_market"].notna() & (df["days_on_market"] > 0)]
df["days_on_market_capped"] = df["days_on_market"].clip(
    upper=df["days_on_market"].quantile(0.99)
)

y = df["days_on_market_capped"]

# Feature engineering
df["listed_year"] = df["listed_date"].dt.year
df["listed_month"] = df["listed_date"].dt.month
df["listed_dayofweek"] = df["listed_date"].dt.dayofweek
df["log_price"] = np.log1p(df["current_price"])
df["log_sqft"] = np.log1p(df["square_footage"])
df["price_per_sqft_calc"] = df["current_price"] / df["square_footage"]
df["price_per_sq_ft"] = df["price_per_sq_ft"].fillna(df["price_per_sqft_calc"])

# Leakage + ID columns
drop_cols = [
    "days_on_market", "days_on_market_capped",
    "removed_date", "last_seen_ts",
    "status",
    "listing_id", "address", "street", "mls_number",
    "agent_id", "office_id"
]

X = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Fill missing values (baseline)
X[X.select_dtypes(include="number").columns] = \
    X.select_dtypes(include="number").fillna(X.median(numeric_only=True))

X[X.select_dtypes(include="object").columns] = \
    X.select_dtypes(include="object").fillna("Unknown")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)


X_train: (10742, 24)
X_test : (2686, 24)
y_train: (10742,)
y_test : (2686,)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  X.select_dtypes(include="object").fillna("Unknown")
See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  X[X.select_dtypes(include="object").columns] = \


Root Mean Sqaure Error is a commonly used metric to measure the standard deviation of the errors.The typical listing stays on the market for around 3 months. Our model can estimate how long a specific property will remain on the market, with an average prediction error of about 1.5 months.

Improving Accuracy 

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Identify numeric and categorical columns
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category", "string"]).columns.tolist()

print("Numeric features:", len(num_cols))
print("Categorical features:", len(cat_cols))

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Model
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=18,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", rf)
])

# Train
pipeline.fit(X_train, y_train)

# Predict  
y_pred = pipeline.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n=== Baseline DOM Model Performance ===")
print(f"MAE : {mae:.2f} days")
print(f"RMSE: {rmse:.2f} days")
print(f"RÂ²  : {r2:.3f}")


Numeric features: 14
Categorical features: 5

=== Baseline DOM Model Performance ===
MAE : 45.26 days
RMSE: 81.53 days
RÂ²  : 0.588


MAE (Mean Absolute Error) = 45.26 days
On average, your modelâ€™s prediction is about 45 days off from the true number of days a property stays on the market.
ðŸ‘‰ This is much better than a naive approach like just guessing the average DOM for every property.

RMSE (Root Mean Squared Error) = 81.53 days
This penalizes large mistakes more heavily, which tells us there are some properties (likely outliers like luxury or very stale listings) where the model is still struggling. Thatâ€™s normal in real estate data.

RÂ² (R-squared) = 0.588
This means the model explains ~59% of the variation in Days on Market.
ðŸ‘‰ In messy, real-world domains like real estate, anything between 0.5â€“0.7 is genuinely strong for an early model

In [7]:
df["county_avg_price"] = df.groupby("county")["current_price"].transform("mean")
df["price_vs_county_avg"] = df["current_price"] / df["county_avg_price"]

df["county_avg_ppsf"] = df.groupby("county")["price_per_sq_ft"].transform("mean")
df["ppsf_vs_county_avg"] = df["price_per_sq_ft"] / df["county_avg_ppsf"]


In [8]:
# Feature engineering
df["listed_year"] = df["listed_date"].dt.year
df["listed_month"] = df["listed_date"].dt.month
df["listed_dayofweek"] = df["listed_date"].dt.dayofweek
df["log_price"] = np.log1p(df["current_price"])
df["log_sqft"] = np.log1p(df["square_footage"])
df["price_per_sqft_calc"] = df["current_price"] / df["square_footage"]
df["price_per_sq_ft"] = df["price_per_sq_ft"].fillna(df["price_per_sqft_calc"])


In [9]:
# --- Relative pricing features (VERY important) ---

df["county_avg_price"] = df.groupby("county")["current_price"].transform("mean")
df["price_vs_county_avg"] = df["current_price"] / df["county_avg_price"]

df["county_avg_ppsf"] = df.groupby("county")["price_per_sq_ft"].transform("mean")
df["ppsf_vs_county_avg"] = df["price_per_sq_ft"] / df["county_avg_ppsf"]


In [10]:
# Handle any divide-by-zero or missing group stats
df[["price_vs_county_avg", "ppsf_vs_county_avg"]] = \
    df[["price_vs_county_avg", "ppsf_vs_county_avg"]].replace([np.inf, -np.inf], np.nan)

df[["price_vs_county_avg", "ppsf_vs_county_avg"]] = \
    df[["price_vs_county_avg", "ppsf_vs_county_avg"]].fillna(1)


In [11]:
df.columns


Index(['listing_id', 'address', 'street', 'unit', 'city', 'county', 'zip_code',
       'latitude', 'longitude', 'property_type', 'bedrooms', 'bathrooms',
       'square_footage', 'lot_size', 'year_built', 'status', 'current_price',
       'hoa_fee', 'price_per_sq_ft', 'listing_type', 'mls_number',
       'days_on_market', 'listed_date', 'removed_date', 'created_date',
       'last_seen_ts', 'agent_id', 'office_id', 'days_on_market_capped',
       'listed_year', 'listed_month', 'listed_dayofweek', 'log_price',
       'log_sqft', 'price_per_sqft_calc', 'county_avg_price',
       'price_vs_county_avg', 'county_avg_ppsf', 'ppsf_vs_county_avg'],
      dtype='str')

In [12]:
df[[
    "county",
    "current_price",
    "county_avg_price",
    "price_vs_county_avg"
]].head(5)


Unnamed: 0,county,current_price,county_avg_price,price_vs_county_avg
0,Grayson,205900.0,714842.771971,0.288035
1,Fort Bend,599940.0,530801.040724,1.130254
2,Bexar,348990.0,389300.125666,0.896455
3,Tarrant,1263900.0,540550.22807,2.338173
4,Denton,355000.0,608899.712766,0.583019


In [14]:
df["price_vs_county_avg"].describe()


count    13428.000000
mean         1.000000
std          1.467920
min          0.017789
25%          0.446269
50%          0.700823
75%          1.060080
max         43.924979
Name: price_vs_county_avg, dtype: float64