In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Preprocessing pipeline: load train/test CSVs, impute missing values,
# label-encode categorical columns, add aggregate features, then split the
# combined frame back into train/test and write the results to CSV.

# Load data (paths are relative to the current working directory)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Save the target and IDs before the 'Id' columns are dropped below
y = train['SalePrice']
train_ID = train['Id']
test_ID = test['Id']

# Drop 'Id' columns
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# Combine train and test for consistent preprocessing.
# Train rows are placed first and the index is reset, so the first
# train.shape[0] rows of `data` are exactly the training rows.
data = pd.concat([train.drop('SalePrice', axis=1), test], axis=0).reset_index(drop=True)

print("✅ Data loaded and combined.")

# === Fill Missing Values ===
# NOTE(review): medians/modes below are computed over train+test combined,
# so test-set values influence the training features. Confirm this is
# acceptable for the intended evaluation setup.

# Numerical: Fill with median
num_cols = data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    data[col] = data[col].fillna(data[col].median())

# Categorical: Fill with mode
cat_cols = data.select_dtypes(include=[object]).columns
for col in cat_cols:
    modes = data[col].mode()
    # mode() ignores NaN by default, so a column with no observed values
    # yields an empty result; indexing it would raise. Skip such columns
    # instead of crashing -- there is nothing to impute with.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

print("✅ Missing values handled.")

# === Label Encoding for categorical features ===
# A fresh encoder is fitted per column on the combined frame, so train and
# test rows share the same integer code for the same category.
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

print("✅ Categorical features label encoded.")

# === Feature Engineering ===

# Total square footage
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

# Total bathrooms (including half baths as 0.5)
data['TotalBath'] = (
    data['FullBath'] +
    0.5 * data['HalfBath'] +
    data['BsmtFullBath'] +
    0.5 * data['BsmtHalfBath']
)

# Total porch area
data['TotalPorchSF'] = (
    data['OpenPorchSF'] +
    data['EnclosedPorch'] +
    data['3SsnPorch'] +
    data['ScreenPorch']
)

print("✅ Feature engineering complete.")

# === Split back into train and test ===
# Take explicit copies: a bare iloc slice can be a view of `data`, and a
# later in-place edit of X_train/X_test would raise SettingWithCopyWarning
# and could leak into `data`.
n_train = train.shape[0]
X_train = data.iloc[:n_train, :].copy()
X_test = data.iloc[n_train:, :].copy()

# === Log-transform the target (SalePrice) to reduce skewness ===
y_log = np.log1p(y)  # log(SalePrice + 1)

# Save for future use (row index is omitted from all three files)
X_train.to_csv('processed_train.csv', index=False)
X_test.to_csv('processed_test.csv', index=False)
y_log.to_csv('target_log.csv', index=False)

print("✅ Final processed data saved as CSVs.")


✅ Data loaded and combined.
✅ Missing values handled.
✅ Categorical features label encoded.
✅ Feature engineering complete.
✅ Final processed data saved as CSVs.
