# In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Preprocess the raw training data and write model-ready train/test splits.
#
# The hold-out split is made BEFORE any data-dependent quantity is learned
# (missing-value ratios, imputation medians, scaling statistics, one-hot
# categories). Fitting those on the full dataset and splitting afterwards
# would leak information from the test rows into the training features.

# Load data
df = pd.read_csv("../data/raw/train.csv")

# Drop ID (not predictive)
df = df.drop(columns=["Id"])

# Separate target variable
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Train/test split on the raw rows. The row partition depends only on the
# number of samples and random_state, so it is the same partition as
# splitting the processed frame would give.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Drop columns with >40% missing values (optional rule of thumb).
# The ratio is measured on the training rows only.
missing_ratio = X_train_raw.isnull().mean()
drop_cols = missing_ratio[missing_ratio > 0.4].index
X = X.drop(columns=drop_cols)
X_train_raw = X_train_raw.drop(columns=drop_cols)
X_test_raw = X_test_raw.drop(columns=drop_cols)

# Identify column types among the remaining columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Define preprocessing pipelines
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    # handle_unknown="ignore": a category that appears only in the test rows
    # is encoded as all zeros instead of raising.
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Combine pipelines
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

# Fit on the training rows only, then apply the fitted transform to the test rows
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test_processed = preprocessor.transform(X_test_raw)

# Get feature names (numeric columns first, then the one-hot encoded columns,
# matching the transformer order in the ColumnTransformer)
cat_encoded_names = (
    preprocessor.named_transformers_["cat"]["encoder"].get_feature_names_out(cat_cols)
)
feature_names = np.concatenate([num_cols, cat_encoded_names])

# Convert to DataFrames, keeping the original row labels of each split
X_train = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train_raw.index)
X_test = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test_raw.index)

# Full processed frame in original row order, kept so code that expects
# X_df / X_processed keeps working.
X_df = pd.concat([X_train, X_test]).sort_index()
X_processed = X_df.to_numpy()

# Save processed datasets
X_train.to_csv("../data/X_train.csv", index=False)
X_test.to_csv("../data/X_test.csv", index=False)
y_train.to_csv("../data/y_train.csv", index=False)
y_test.to_csv("../data/y_test.csv", index=False)