# **Setup & Data Loading** 

In [2]:
import warnings
# Installs an "ignore" filter with no category/module restriction, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides deprecation and dtype warnings from the
# libraries imported below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import os
from glob import glob
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Notebook display settings: show up to 200 columns and allow 180 characters
# per output line when rendering DataFrames.
pd.options.display.max_columns = 200
pd.options.display.width = 180

### 1) Locate the monthly CRMLS exports (adjust `possible_dirs` if your data folder differs)

In [6]:
# Candidate folders, checked in priority order; edit this list if the exports live elsewhere.
possible_dirs = ["./data", ".", "../data"]

# Key every match by file name so an export that is present in more than one
# candidate folder (e.g. both "./data" and "../data") is loaded only once.
# setdefault keeps the first hit, so earlier entries in `possible_dirs` win.
paths_by_name = {}
for directory in possible_dirs:
    pattern = os.path.join(directory, "CRMLSSold*.csv")
    for found in glob(pattern):
        paths_by_name.setdefault(os.path.basename(found), found)

# Order by file name so the processing order is stable and independent of
# which folder each file was found in, then make sure something was found.
csv_paths = [paths_by_name[name] for name in sorted(paths_by_name)]
assert len(csv_paths) > 0, "No CRMLSSold*.csv files found --> check data path."

print(f"✅ Found {len(csv_paths)} monthly CRMLS files:")
for p in csv_paths:
    print("   •", os.path.basename(p))

✅ Found 7 monthly CRMLS files:
   • CRMLSSold202502.csv
   • CRMLSSold202503.csv
   • CRMLSSold202504.csv
   • CRMLSSold202505.csv
   • CRMLSSold202506.csv
   • CRMLSSold202507.csv
   • CRMLSSold202508.csv


### 2) Read and merge all CSV files

In [7]:
# Read each monthly export into its own frame, tagging every row with the
# name of the file it came from.
dfs = []
for p in csv_paths:
    try:
        df_i = pd.read_csv(p, low_memory=False)
        df_i["__source_file"] = os.path.basename(p)
        dfs.append(df_i)
    except Exception as e:
        # Best effort: one unreadable file should not abort the whole load.
        print(f"Skipping {p}: {e}")

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be read.
if not dfs:
    raise ValueError("None of the CRMLS CSV files could be read --> check the 'Skipping' messages above.")

# Concatenate all months
df_all_raw = pd.concat(dfs, ignore_index=True)
# Report the number of files actually loaded (len(dfs)); len(csv_paths) would
# also count files that were skipped above.
print(f"Loaded {len(df_all_raw):,} rows from {len(dfs)} files.")

Loaded 156,064 rows from 7 files.


In [1]:


# 2) Reuse `df_all_raw` from the "Read and merge all CSV files" cell above rather
#    than re-reading every CSV; `__source_file` records which file each row came from.
print(f"Loaded {len(df_all_raw):,} rows from {df_all_raw['__source_file'].nunique()} files.")

# 3) Parse dates if present
#    Only columns that exist are touched; values that cannot be parsed become NaT
#    (errors="coerce") instead of raising.
for col in ["CloseDate", "CloseDateTime", "COE", "COEDate", "CloseOfEscrowDate"]:
    if col in df_all_raw.columns:
        df_all_raw[col] = pd.to_datetime(df_all_raw[col], errors="coerce")

# 4) Filter to Residential Single Family Residence, if columns exist
#    Work on a copy so `df_all_raw` keeps every row. The comparison ignores case and
#    surrounding whitespace; missing values become the string "nan" and are dropped.
df_all = df_all_raw.copy()
if "PropertyType" in df_all.columns:
    df_all = df_all[df_all["PropertyType"].astype(str).str.strip().str.lower() == "residential"]
if "PropertySubType" in df_all.columns:
    df_all = df_all[df_all["PropertySubType"].astype(str).str.strip().str.lower() == "singlefamilyresidence"]

print(f"After SFR filter: {len(df_all):,} rows.")

# 5) Ensure target is present
assert "ClosePrice" in df_all.columns, "Expected 'ClosePrice' not found in columns."


Loaded 156,064 rows from 7 files.
After SFR filter: 78,387 rows.
