In [1]:
# ============================================================
# House Price Prediction - Full Training Pipeline (Single File)
# ============================================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import joblib

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
# Locations are relative paths, so they resolve against the current
# working directory at run time.
DATA_DIR = "../data/raw"
MODEL_DIR = "../models"
FIG_DIR = "../reports/figures"

# Make sure both output directories exist before anything is written to them;
# exist_ok=True makes re-runs a no-op.
for out_dir in (MODEL_DIR, FIG_DIR):
    os.makedirs(out_dir, exist_ok=True)

# ------------------------------------------------------------
# 1. Load Data
# ------------------------------------------------------------
# Read both CSV files from DATA_DIR.
train_csv = f"{DATA_DIR}/House_Price_Train.csv"
test_csv = f"{DATA_DIR}/House_Price_Test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Stack train on top of test with a fresh 0..n-1 index so that cleaning and
# feature engineering below are applied to both in one pass.
df = pd.concat(objs=[df_train, df_test], ignore_index=True)

# ------------------------------------------------------------
# 2. Initial Exploration
# ------------------------------------------------------------
# Quick look at the combined frame: first rows, schema, summary stats, shape.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print("Shape:", df.shape)

# ------------------------------------------------------------
# 3. Feature Type Identification
# ------------------------------------------------------------
# Split the column names by storage dtype: int64, float64 and object.
int_features, float_features, cat_features = (
    df.select_dtypes(include=dtype_name).columns
    for dtype_name in ("int64", "float64", "object")
)

# Report how many columns fall into each dtype group.
for label, columns in (
    ("Integer features:", int_features),
    ("Float features:", float_features),
    ("Categorical features:", cat_features),
):
    print(label, len(columns))

# ------------------------------------------------------------
# 4. Missing Values Heatmap
# ------------------------------------------------------------
# Draw the boolean null-mask of the whole frame as a heatmap (no colour bar),
# write it to FIG_DIR, then close the figure so it does not stay open.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
fig.savefig(f"{FIG_DIR}/Null_Values_Heatmap.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# ------------------------------------------------------------
# 5. Drop High-Missing Columns
# ------------------------------------------------------------
# Columns removed outright instead of being imputed; errors="ignore" means
# any of them that are already absent are skipped silently.
high_missing_cols = ["Alley", "PoolQC", "Fence", "MiscFeature"]
df = df.drop(labels=high_missing_cols, axis="columns", errors="ignore")

# ------------------------------------------------------------
# 6. Feature Engineering
# ------------------------------------------------------------

# 6.1 Fill Missing Values
# LotFrontage: fill gaps with the median LotFrontage of the same Neighborhood.
# Any group whose median is itself NaN is left as-is here and is picked up by
# the global numeric median fill below.
df["LotFrontage"] = (
    df.groupby("Neighborhood")["LotFrontage"]
      .transform(lambda x: x.fillna(x.median()))
)

# Object (categorical) columns: fill with the most frequent value.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Numeric columns: fill with the column median.
# The target is deliberately excluded. A missing SalePrice is what marks a
# row as belonging to the test set when the frame is split again later
# (notna()/isna() on SalePrice). Imputing it would give every test row a
# fabricated median price, fold those rows into the training data, and leave
# the test partition empty.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols.drop("SalePrice", errors="ignore"):
    df[col] = df[col].fillna(df[col].median())

# 6.2 Create New Features
# Both age features are differences against the sale year.
sale_year = df["YrSold"]
df["HouseAge"] = sale_year.sub(df["YearBuilt"])
df["RemodAge"] = sale_year.sub(df["YearRemodAdd"])

# Combined size: basement + first floor + second floor columns.
df["TotalSF"] = df["TotalBsmtSF"].add(df["1stFlrSF"]).add(df["2ndFlrSF"])

# Combined bathroom count; each half-bath column is weighted 0.5.
df["TotalBath"] = (
    df["FullBath"]
    .add(df["HalfBath"].mul(0.5))
    .add(df["BsmtFullBath"])
    .add(df["BsmtHalfBath"].mul(0.5))
)

# ------------------------------------------------------------
# 7. Train / Test Split + Log Target
# ------------------------------------------------------------
# Rows with a SalePrice form the training frame; rows without one form the
# test frame.
has_target = df["SalePrice"].notna()
df_train = df[has_target]
df_test = df[~has_target]

# Features are every column except the target; the target is modelled as
# log(1 + SalePrice).
X = df_train.drop("SalePrice", axis=1)
y = np.log1p(df_train["SalePrice"])

# Hold out 20% of the labelled rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


# ------------------------------------------------------------
# 8. Preprocessing Pipeline
# ------------------------------------------------------------
# Column groups are taken from the training split's dtypes.
numeric_dtypes = ["int64", "float64"]
num_features = X_train.select_dtypes(include=numeric_dtypes).columns
cat_features = X_train.select_dtypes(include="object").columns

# Numeric columns are standardised; object columns are one-hot encoded into a
# dense array, with handle_unknown="ignore" so categories not seen during
# fitting do not raise at prediction time.
scale_step = ("num", StandardScaler(), num_features)
encode_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    cat_features,
)
preprocessor = ColumnTransformer(transformers=[scale_step, encode_step])

# ------------------------------------------------------------
# 9. Models
# ------------------------------------------------------------
# Candidate regressors, keyed by the display name used in the results table.
# The tree-based models use a fixed seed.
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(random_state=42)
forest_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1,  # use all available cores
)

models = {
    "Linear Regression": linear_model,
    "Decision Tree": tree_model,
    "Random Forest": forest_model,
}

# ------------------------------------------------------------
# 10. Training & Validation Evaluation
# ------------------------------------------------------------
# One row per model: [name, validation RMSE, validation R2].
results = []

for name, model in models.items():
    # Preprocessing and estimator are wrapped together so the transformers are
    # fitted on X_train only and then applied to X_val.
    pipe = Pipeline(steps=[("prep", preprocessor), ("model", model)])
    preds = pipe.fit(X_train, y_train).predict(X_val)

    # Both metrics are computed against y_val as-is, i.e. in the same units as
    # the target passed to fit().
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)
    results.append([name, rmse, r2])

results_df = pd.DataFrame(data=results, columns=["Model", "RMSE", "R2"])
print(results_df)


# ------------------------------------------------------------
# 11. Cross-Validation (Random Forest)
# ------------------------------------------------------------
# Pipeline around the Random Forest entry of `models`.
rf_pipe = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("model", models["Random Forest"]),
    ]
)

# 10-fold CV on the training split. The scorer reports negated RMSE, so the
# sign is flipped to get positive RMSE values per fold.
fold_scores = cross_val_score(
    rf_pipe,
    X_train,
    y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)
cv_rmse = -fold_scores

print("\nCross-Validation RMSE Summary:")
print(pd.Series(cv_rmse).describe())


# ------------------------------------------------------------
# 12. Final Model Training & Save
# ------------------------------------------------------------
# Refit the Random Forest pipeline on all labelled rows (train + validation),
# then persist the fitted pipeline, preprocessing included, with joblib.
final_model_path = f"{MODEL_DIR}/final_house_price_model.pkl"

rf_pipe.fit(X, y)
joblib.dump(rf_pipe, final_model_path)

print("\n Final model saved at:", final_model_path)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   