In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor

In [None]:
in_path = Path("entire_dataset.csv")
out_path = Path("processed_data") / "entire_dataset_processed.csv"

df = pd.read_csv(in_path, low_memory=False)

# original shape BEFORE any changes
orig_shape = df.shape
print("Original shape:", orig_shape)

# Drop ListPrice columns
list_price_cols = [col for col in df.columns if "ListPrice" in col]
if list_price_cols:
    df = df.drop(columns=list_price_cols)
    print("Dropped list price columns:", list_price_cols)

# Drop columns >90% missing
col_missing_pct = df.isna().mean()
dropped_missing_cols = col_missing_pct[col_missing_pct > 0.90].index.tolist()
df = df.drop(columns=dropped_missing_cols)
print("Dropped >90% missing columns:", len(dropped_missing_cols))

# Numeric columns (optionally exclude target)
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
if "ClosePrice" in numeric_cols:
    numeric_cols.remove("ClosePrice")


print("Numeric cols to impute/scale:", len(numeric_cols))
missing_numeric_before = int(df[numeric_cols].isna().sum().sum())
print("Missing numeric values before impute:", missing_numeric_before)

# Median impute numeric
medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(medians)

# Standardize numeric (z-score)
means = df[numeric_cols].mean()
stds = df[numeric_cols].std(ddof=0).replace(0, 1)
df[numeric_cols] = (df[numeric_cols] - means) / stds

# processed shape
print("Processed shape:", df.shape)

out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print("Saved output:", out_path)

Original shape: (102605, 80)
Dropped list price columns: ['OriginalListPrice', 'ListPrice']
Dropped >90% missing columns: 15
Numeric cols to impute/scale: 20
Missing numeric values before impute: 90416
Processed shape: (102605, 63)
Saved output: processed_data/entire_dataset_processed.csv


In [None]:
# =====================================================
# FEATURE SELECTION SECTION
# =====================================================
# Feature selection strategy:
# First, remove constant (zero-variance) features since they contain no predictive information.
# Next, eliminate highly correlated features (correlation > 0.95) to reduce redundancy
# and multicollinearity. Then, use a Random Forest regressor to rank remaining features
# by importance and select the top K most predictive variables for ClosePrice.
# Latitude and Longitude are retained to preserve key location-based effects.
# Remove constant features (std == 0)
low_variance_cols = df[numeric_cols].columns[df[numeric_cols].std(ddof=0) == 0]
low_variance_cols = low_variance_cols.tolist()

df = df.drop(columns=low_variance_cols)
numeric_cols = [c for c in numeric_cols if c not in low_variance_cols]
print("Removed constant features:", len(low_variance_cols))

# Remove highly correlated features (>0.95)
corr_matrix = df[numeric_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_corr_cols = [col for col in upper.columns if (upper[col] > 0.95).any()]
df = df.drop(columns=high_corr_cols)
numeric_cols = [c for c in numeric_cols if c not in high_corr_cols]
print("Removed highly correlated features:", len(high_corr_cols))

# Rank features using Random Forest
X = df[numeric_cols]
y = df["ClosePrice"]

rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
rf.fit(X, y)

importances = (
    pd.Series(rf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

print("\nTop 15 Important Features:")
print(importances.head(15))

# Select top K features
top_k = 20
selected_features = importances.head(top_k).index.tolist()

# Ensure location is always included
for col in ["Latitude", "Longitude"]:
    if col in df.columns and col not in selected_features:
        selected_features.append(col)

print("\nFinal selected feature count:", len(selected_features))

Removed constant features: 0
Removed highly correlated features: 1

Top 15 Important Features:
Latitude               0.236777
StreetNumberNumeric    0.104000
LivingArea             0.090202
Longitude              0.083651
Unnamed: 0             0.081988
LotSizeArea            0.076684
LotSizeAcres           0.067325
LotSizeSquareFeet      0.058405
ListingKey             0.050360
DaysOnMarket           0.043359
YearBuilt              0.023635
BedroomsTotal          0.019681
month                  0.018585
ParkingTotal           0.012150
MainLevelBedrooms      0.011080
dtype: float64

Final selected feature count: 19


In [12]:
#BEGIN: Vivian Lin

# Mount Google Drive for permanent storage of data
# from google.colab import drive
# drive.mount('/content/drive')

# Creating test set: December 2025
test_set = pd.read_csv('california/CRMLSSold202512.csv')
test_set = test_set[(test_set['PropertySubType'] == 'SingleFamilyResidence') & (test_set['PropertyType'] == 'Residential')]
test_set.to_csv('processed_data/california_housing_test.csv')

# Creating training set: April to November 2025 
training_set = pd.DataFrame()

for i in range(4, 12):
    if i < 10:
        month = '0' + str(i)
    else:
        month = str(i)
    monthly_data = pd.read_csv(f'california/CRMLSSold2025{month}.csv')
    monthly_data['month'] = month
    training_set = pd.concat([training_set, monthly_data])
training_set = training_set[(training_set['PropertySubType'] == 'SingleFamilyResidence') & (training_set['PropertyType'] == 'Residential')]
training_set = training_set.reset_index(drop=True)

training_set.to_csv('processed_data/california_housing_train.csv')

#END: Vivian Lin

FileNotFoundError: [Errno 2] No such file or directory: 'california/CRMLSSold202512.csv'

In [None]:
test_set['month'] = 12
entire_dataset = pd.concat([training_set, test_set])
entire_dataset.to_csv('entire_dataset.csv')