In [7]:
import numpy as np
import pandas as pd

# ============================================
# Configuration
# ============================================

# Columns removed from both splits before anything else happens.
DROP_COLS = [
    "id",
    "date",
    "zipcode",
]

# Column that is split off as the target (y) and excluded from scaling.
TARGET = "price"

# Raw input CSVs.
TRAIN_FILE, TEST_FILE = "train.csv", "test.csv"

# Destinations for the cleaned CSVs.
TRAIN_OUT, TEST_OUT = "train_clean.csv", "test_clean.csv"

# ============================================
# 1. Load raw train and test
# ============================================

# Read both raw splits with pandas' default CSV parsing options.
train_df, test_df = map(pd.read_csv, (TRAIN_FILE, TEST_FILE))

# ============================================
# 2. Drop ignored columns (if they exist)
# ============================================

# errors="ignore" makes the drop a no-op for any listed column that a
# split does not actually contain, so the same list works for both frames.
train_df, test_df = (
    frame.drop(columns=DROP_COLS, errors="ignore")
    for frame in (train_df, test_df)
)

# ============================================
# 3. Divide price by 1000
# ============================================

# Cast to float first so the stored column is floating point regardless of
# the dtype the CSV parser inferred, then rescale by 1/1000.
train_df[TARGET] = train_df[TARGET].astype(float).div(1000.0)
test_df[TARGET] = test_df[TARGET].astype(float).div(1000.0)

# ============================================
# 4. Separate features and target
# ============================================

# Take the target series, then remove that column to leave the features.
y_train = train_df[TARGET]
X_train = train_df.drop(columns=TARGET)

y_test = test_df[TARGET]
X_test = test_df.drop(columns=TARGET)

# ============================================
# 5. Scale features (using TRAIN stats only)
# ============================================

# Per-column mean and standard deviation are estimated on the training
# features only and then applied unchanged to both splits.
mu = X_train.mean()

# A column with zero spread would make the division below undefined;
# substitute 1.0 for those entries (NaN entries are left as they are).
sigma = X_train.std()
sigma = sigma.mask(sigma == 0, 1.0)

# Standardise column-wise: subtract the train mean, divide by the train std.
X_train_scaled = X_train.sub(mu).div(sigma)
X_test_scaled = X_test.sub(mu).div(sigma)

# ============================================
# 6. Reconstruct cleaned DataFrames
# ============================================

# Put the target back as the last column, next to the scaled features.
train_clean = pd.concat((X_train_scaled, y_train), axis="columns")
test_clean = pd.concat((X_test_scaled, y_test), axis="columns")

# ============================================
# 7. Save cleaned CSV files
# ============================================

# Write both cleaned splits without the index column; the report is printed
# only after both files have been written.
train_clean.to_csv(path_or_buf=TRAIN_OUT, index=False)
test_clean.to_csv(path_or_buf=TEST_OUT, index=False)

print("Preprocessing complete.")
print("Saved:", TRAIN_OUT)
print("Saved:", TEST_OUT)


Preprocessing complete.
Saved: train_clean.csv
Saved: test_clean.csv
