In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor

In [None]:
#Start:LC
# Need to upload data to colab and change path
in_path = 'processed_data/entire_dataset.csv'

df = pd.read_csv(in_path, low_memory=False)
list_price_cols = [col for col in df.columns if "ListPrice" in col]

if list_price_cols:
    df = df.drop(columns=list_price_cols)
    print("Dropped list price columns:", list_price_cols)
# Drop columns >90% missing
col_missing_pct = df.isna().mean()
df = df.drop(columns=col_missing_pct[col_missing_pct > 0.90].index)

# Numeric columns (optionally exclude target)
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
if "ClosePrice" in numeric_cols:
    numeric_cols.remove("ClosePrice")  # recommended: don't scale target

# Median impute numeric
medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(medians)

# Standardize numeric (z-score)
means = df[numeric_cols].mean()
stds = df[numeric_cols].std(ddof=0).replace(0, 1)  # avoid divide-by-zero
df[numeric_cols] = (df[numeric_cols] - means) / stds

out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print("Saved:", out_path, "shape:", df.shape)

# =====================================================
# FEATURE SELECTION SECTION (NEW)
# =====================================================

# Remove constant features
low_variance_cols = df[numeric_cols].columns[df[numeric_cols].std() == 0]
numeric_cols = [c for c in numeric_cols if c not in low_variance_cols]
df = df.drop(columns=low_variance_cols)

print("Removed constant features:", len(low_variance_cols))

# Remove highly correlated features (>0.95)
corr_matrix = df[numeric_cols].corr().abs()
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

high_corr_cols = [
    column for column in upper.columns
    if any(upper[column] > 0.95)
]

df = df.drop(columns=high_corr_cols)
numeric_cols = [c for c in numeric_cols if c not in high_corr_cols]

print("Removed highly correlated features:", len(high_corr_cols))

# Rank features using Random Forest
X = df[numeric_cols]
y = df["ClosePrice"]

rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf.fit(X, y)

importances = pd.Series(
    rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print("\nTop 15 Important Features:")
print(importances.head(15))

# Select top K features
top_k = 20
selected_features = importances.head(top_k).index.tolist()

# Ensure location is always included
for col in ["Latitude", "Longitude"]:
    if col in df.columns and col not in selected_features:
        selected_features.append(col)

print("\nFinal selected feature count:", len(selected_features))

# Keep only selected features + target
df = df[selected_features + ["ClosePrice"]]

# =====================================================

out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

print("Saved:", out_path, "shape:", df.shape)

#End:LC

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/filtered_base.csv'

In [5]:
#BEGIN: Vivian Lin

# Mount Google Drive for permanent storage of data
# from google.colab import drive
# drive.mount('/content/drive')

# Creating test set: December 2025
test_set = pd.read_csv('california/CRMLSSold202512.csv')
test_set = test_set[(test_set['PropertySubType'] == 'SingleFamilyResidence') & (test_set['PropertyType'] == 'Residential')]
test_set.to_csv('processed_data/california_housing_test.csv')

# Creating training set: April to November 2025 
training_set = pd.DataFrame()

for i in range(4, 12):
    if i < 10:
        month = '0' + str(i)
    else:
        month = str(i)
    monthly_data = pd.read_csv(f'california/CRMLSSold2025{month}.csv')
    monthly_data['month'] = month
    training_set = pd.concat([training_set, monthly_data])
training_set = training_set[(training_set['PropertySubType'] == 'SingleFamilyResidence') & (training_set['PropertyType'] == 'Residential')]
training_set = training_set.reset_index(drop=True)

training_set.to_csv('processed_data/california_housing_train.csv')

#END: Vivian Lin

  monthly_data = pd.read_csv(f'california/CRMLSSold2025{month}.csv')


In [15]:
test_set['month'] = 12
entire_dataset = pd.concat([training_set, test_set])
entire_dataset.to_csv('entire_dataset.csv')