# Section A

In [1]:
# 1) Setup & load canonical cleaned data + metadata
import pandas as pd
import numpy as np
from pathlib import Path
import json
import warnings
warnings.filterwarnings("ignore")

DATA_DIR = Path("../data/")
PROCESSED_DIR = DATA_DIR / "processed"

# adjust if you used _v02 in EDA; keeping v01 as default
df_train = pd.read_parquet(PROCESSED_DIR / "hp_train_clean_v01.parquet")
df_test  = pd.read_parquet(PROCESSED_DIR / "hp_test_clean_v01.parquet")

with open(PROCESSED_DIR / "hp_clean_meta_v01.json", "r") as f:
    meta = json.load(f)

# working copies
df_tr = df_train.copy()
df_te = df_test.copy()

# keep a running list of engineered columns
fe_cols = []
(pd.DataFrame({"train_shape":[df_tr.shape], "test_shape":[df_te.shape]}))

Unnamed: 0,train_shape,test_shape
0,"(1458, 83)","(1459, 83)"


In [2]:
# 2) Structural size & composition
df_tr["fe_TotalSF"] = df_tr["1stFlrSF"] + df_tr["2ndFlrSF"] + df_tr["TotalBsmtSF"]
df_te["fe_TotalSF"] = df_te["1stFlrSF"] + df_te["2ndFlrSF"] + df_te["TotalBsmtSF"]

df_tr["fe_TotalPorchSF"] = df_tr["OpenPorchSF"] + df_tr["EnclosedPorch"] + df_tr["3SsnPorch"] + df_tr["ScreenPorch"]
df_te["fe_TotalPorchSF"] = df_te["OpenPorchSF"] + df_te["EnclosedPorch"] + df_te["3SsnPorch"] + df_te["ScreenPorch"]

df_tr["fe_TotalBaths"] = (df_tr["FullBath"] + 0.5*df_tr["HalfBath"] + df_tr["BsmtFullBath"] + 0.5*df_tr["BsmtHalfBath"])
df_te["fe_TotalBaths"] = (df_te["FullBath"] + 0.5*df_te["HalfBath"] + df_te["BsmtFullBath"] + 0.5*df_te["BsmtHalfBath"])

# ratios / densities (protect divides)
df_tr["fe_AboveVsBasement"] = df_tr["GrLivArea"] / (df_tr["TotalBsmtSF"] + 1.0)
df_te["fe_AboveVsBasement"] = df_te["GrLivArea"] / (df_te["TotalBsmtSF"] + 1.0)

df_tr["fe_FirstVsSecond"] = df_tr["1stFlrSF"] / (df_tr["2ndFlrSF"] + 1.0)
df_te["fe_FirstVsSecond"] = df_te["1stFlrSF"] / (df_te["2ndFlrSF"] + 1.0)

df_tr["fe_GarageCarsPer100SF"] = df_tr["GarageCars"] / (df_tr["GarageArea"]/100.0 + 1.0)
df_te["fe_GarageCarsPer100SF"] = df_te["GarageCars"] / (df_te["GarageArea"]/100.0 + 1.0)

new_cols = ["fe_TotalSF","fe_TotalPorchSF","fe_TotalBaths","fe_AboveVsBasement","fe_FirstVsSecond","fe_GarageCarsPer100SF"]
fe_cols += new_cols
pd.DataFrame({"added_structural": [new_cols]})

Unnamed: 0,added_structural
0,"[fe_TotalSF, fe_TotalPorchSF, fe_TotalBaths, f..."


In [3]:
# 3) Age & temporal features (clip at >= 0); Season categorical
df_tr["fe_AgeAtSale"] = (df_tr["YrSold"] - df_tr["YearBuilt"]).clip(lower=0)
df_te["fe_AgeAtSale"] = (df_te["YrSold"] - df_te["YearBuilt"]).clip(lower=0)

df_tr["fe_YearsSinceRemod"] = (df_tr["YrSold"] - df_tr["YearRemodAdd"]).clip(lower=0)
df_te["fe_YearsSinceRemod"] = (df_te["YrSold"] - df_te["YearRemodAdd"]).clip(lower=0)

# garage age: if no garage, treat as 0
df_tr["fe_GarageAgeAtSale"] = np.where(df_tr["GarageCars"]>0, (df_tr["YrSold"] - df_tr["GarageYrBlt"]).clip(lower=0), 0)
df_te["fe_GarageAgeAtSale"] = np.where(df_te["GarageCars"]>0, (df_te["YrSold"] - df_te["GarageYrBlt"]).clip(lower=0), 0)

# seasons from MoSold (keep MoSold_sin/cos from EDA as-is)
def _season(m):
    if m in [12,1,2]: return "Winter"
    if m in [3,4,5]:  return "Spring"
    if m in [6,7,8]:  return "Summer"
    return "Fall"
df_tr["comb_Season"] = pd.Categorical([_season(int(m)) for m in pd.to_numeric(df_tr["MoSold"], errors="coerce")], categories=["Winter","Spring","Summer","Fall"])
df_te["comb_Season"] = pd.Categorical([_season(int(m)) for m in pd.to_numeric(df_te["MoSold"], errors="coerce")], categories=["Winter","Spring","Summer","Fall"])

new_cols = ["fe_AgeAtSale","fe_YearsSinceRemod","fe_GarageAgeAtSale","comb_Season"]
fe_cols += new_cols
pd.DataFrame({"added_temporal": [new_cols]})

Unnamed: 0,added_temporal
0,"[fe_AgeAtSale, fe_YearsSinceRemod, fe_GarageAg..."


In [4]:
# 4) Amenity presence flags (binary int)
df_tr["bin_HasPool"] = (df_tr["PoolArea"] > 0).astype(int)
df_te["bin_HasPool"] = (df_te["PoolArea"] > 0).astype(int)

df_tr["bin_Has2ndFlr"] = (df_tr["2ndFlrSF"] > 0).astype(int)
df_te["bin_Has2ndFlr"] = (df_te["2ndFlrSF"] > 0).astype(int)

df_tr["bin_HasBasement"] = (df_tr["TotalBsmtSF"] > 0).astype(int)
df_te["bin_HasBasement"] = (df_te["TotalBsmtSF"] > 0).astype(int)

df_tr["bin_HasPorch"] = (df_tr["fe_TotalPorchSF"] > 0).astype(int)
df_te["bin_HasPorch"] = (df_te["fe_TotalPorchSF"] > 0).astype(int)

df_tr["bin_HasDeck"] = (df_tr["WoodDeckSF"] > 0).astype(int)
df_te["bin_HasDeck"] = (df_te["WoodDeckSF"] > 0).astype(int)

df_tr["bin_HasFireplace"] = (df_tr["Fireplaces"] > 0).astype(int)
df_te["bin_HasFireplace"] = (df_te["Fireplaces"] > 0).astype(int)

df_tr["bin_HasGarage"] = (df_tr["GarageCars"] > 0).astype(int)
df_te["bin_HasGarage"] = (df_te["GarageCars"] > 0).astype(int)

new_cols = ["bin_HasPool","bin_Has2ndFlr","bin_HasBasement","bin_HasPorch","bin_HasDeck","bin_HasFireplace","bin_HasGarage"]
fe_cols += new_cols
pd.DataFrame({"added_flags": [new_cols]})

Unnamed: 0,added_flags
0,"[bin_HasPool, bin_Has2ndFlr, bin_HasBasement, ..."


In [5]:
# 5) Quality/condition composites (keep parts too)
df_tr["fe_OverallGrade"]  = 2*df_tr["OverallQual"] + df_tr["OverallCond"]
df_te["fe_OverallGrade"]  = 2*df_te["OverallQual"] + df_te["OverallCond"]

df_tr["fe_ExteriorGrade"] = df_tr["ExterQual"] + df_tr["ExterCond"]
df_te["fe_ExteriorGrade"] = df_te["ExterQual"] + df_te["ExterCond"]

df_tr["fe_BasementGrade"] = df_tr["BsmtQual"] + df_tr["BsmtCond"] + df_tr["BsmtExposure"]
df_te["fe_BasementGrade"] = df_te["BsmtQual"] + df_te["BsmtCond"] + df_te["BsmtExposure"]

df_tr["fe_GarageGrade"]   = df_tr["GarageQual"] + df_tr["GarageCond"]
df_te["fe_GarageGrade"]   = df_te["GarageQual"] + df_te["GarageCond"]

# (optional) kitchen composite — keep it simple with KitchenQual only or include count
df_tr["fe_KitchenGrade"]  = df_tr["KitchenQual"]
df_te["fe_KitchenGrade"]  = df_te["KitchenQual"]

new_cols = ["fe_OverallGrade","fe_ExteriorGrade","fe_BasementGrade","fe_GarageGrade","fe_KitchenGrade"]
fe_cols += new_cols
pd.DataFrame({"added_quality": [new_cols]})

Unnamed: 0,added_quality
0,"[fe_OverallGrade, fe_ExteriorGrade, fe_Basemen..."


In [6]:
# 6) Condition proximity flags + exterior combination (order-invariant)
# flags from Condition1/2
def _is_artery(x): return x in {"Artery","Feedr"}
def _is_rr(x):     return x in {"RRNn","RRAn","RRNe","RRAe"}
def _is_pos(x):    return x in {"PosN","PosA"}

for df in (df_tr, df_te):
    df["bin_NearArtery"] = (df["Condition1"].apply(_is_artery) | df["Condition2"].apply(_is_artery)).astype(int)
    df["bin_NearRR"]     = (df["Condition1"].apply(_is_rr)     | df["Condition2"].apply(_is_rr)).astype(int)
    df["bin_NearPositive"]= (df["Condition1"].apply(_is_pos)   | df["Condition2"].apply(_is_pos)).astype(int)

# exterior set
def _ext_set(row):
    a = str(row["Exterior1st"])
    b = str(row["Exterior2nd"])
    return "+".join(sorted([a, b]))
df_tr["comb_ExteriorSet"] = pd.Categorical(df_tr.apply(_ext_set, axis=1))
df_te["comb_ExteriorSet"] = pd.Categorical(df_te.apply(_ext_set, axis=1))

new_cols = ["bin_NearArtery","bin_NearRR","bin_NearPositive","comb_ExteriorSet"]
fe_cols += new_cols
pd.DataFrame({"added_location_materials": [new_cols]})

Unnamed: 0,added_location_materials
0,"[bin_NearArtery, bin_NearRR, bin_NearPositive,..."


In [7]:
# 7) Neighborhood feature-only aggregates (train-only medians; mapped to both)
nbhd_cols = ["LotArea", "YearBuilt", "GrLivArea"]
if "fe_TotalSF" in df_tr.columns:
    nbhd_cols.append("fe_TotalSF")

nbhd_meds = df_tr.groupby("Neighborhood")[nbhd_cols].median()

for c in nbhd_cols:
    colname = f"fe_NbhdMed_{c}"
    df_tr[colname] = df_tr["Neighborhood"].map(nbhd_meds[c])
    df_te[colname] = df_te["Neighborhood"].map(nbhd_meds[c])

new_cols = [f"fe_NbhdMed_{c}" for c in nbhd_cols]
fe_cols += new_cols
pd.DataFrame({"added_nbhd_medians": [new_cols]})

Unnamed: 0,added_nbhd_medians
0,"[fe_NbhdMed_LotArea, fe_NbhdMed_YearBuilt, fe_..."


In [8]:
# 8) Compact interaction shortlist
df_tr["int_OQual_x_GrLiv"]  = df_tr["OverallQual"] * df_tr["GrLivArea"]
df_te["int_OQual_x_GrLiv"]  = df_te["OverallQual"] * df_te["GrLivArea"]

df_tr["int_OQual_x_TotalSF"] = df_tr["OverallQual"] * df_tr["fe_TotalSF"]
df_te["int_OQual_x_TotalSF"] = df_te["OverallQual"] * df_te["fe_TotalSF"]

df_tr["int_TotalBaths_x_Beds"] = df_tr["fe_TotalBaths"] * df_tr["BedroomAbvGr"]
df_te["int_TotalBaths_x_Beds"] = df_te["fe_TotalBaths"] * df_te["BedroomAbvGr"]

df_tr["int_GarCars_x_GarArea"] = df_tr["GarageCars"] * df_tr["GarageArea"]
df_te["int_GarCars_x_GarArea"] = df_te["GarageCars"] * df_te["GarageArea"]

new_cols = ["int_OQual_x_GrLiv","int_OQual_x_TotalSF","int_TotalBaths_x_Beds","int_GarCars_x_GarArea"]
fe_cols += new_cols
pd.DataFrame({"added_interactions": [new_cols]})

Unnamed: 0,added_interactions
0,"[int_OQual_x_GrLiv, int_OQual_x_TotalSF, int_T..."


In [9]:
# 9) Safety pass: finite values, types, cardinality snapshot
ratio_cols = ["fe_AboveVsBasement","fe_FirstVsSecond","fe_GarageCarsPer100SF"]
for df in (df_tr, df_te):
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# cast flags to int
flag_cols = [c for c in df_tr.columns if c.startswith("bin_")]
df_tr[flag_cols] = df_tr[flag_cols].astype("int")
df_te[flag_cols] = df_te[flag_cols].astype("int")

# ensure engineered categoricals are 'category'
cat_new = ["comb_Season","comb_ExteriorSet"]
for c in cat_new:
    if c in df_tr.columns:
        df_tr[c] = df_tr[c].astype("category")
        df_te[c] = df_te[c].astype("category")

# quick summary of what we added
pd.Series({
    "n_engineered_cols": len(fe_cols),
    "sample_engineered_cols": fe_cols[:12]
})

n_engineered_cols                                                        34
sample_engineered_cols    [fe_TotalSF, fe_TotalPorchSF, fe_TotalBaths, f...
dtype: object

In [10]:
# 10) Update metadata & persist feature-engineered datasets
# categorize engineered columns by type
engineered_binary = flag_cols
engineered_nominal = [c for c in ["comb_Season","comb_ExteriorSet"] if c in df_tr.columns]
engineered_continuous = sorted(list(set(fe_cols) - set(engineered_binary) - set(engineered_nominal)))
engineered_interactions = [c for c in fe_cols if c.startswith("int_")]

# update meta lists (non-destructive)
meta.update({
    "engineered_binary": engineered_binary,
    "engineered_nominal": engineered_nominal,
    "engineered_continuous": engineered_continuous,
    "engineered_interactions": engineered_interactions,
})

with open(PROCESSED_DIR / "hp_clean_meta_v02.json", "w") as f:
    json.dump(meta, f, indent=2)

# save FE outputs (unscaled, unencoded, no power/log)
df_tr.to_parquet(PROCESSED_DIR / "hp_train_feat_v01.parquet", index=False)
df_te.to_parquet(PROCESSED_DIR / "hp_test_feat_v01.parquet", index=False)

pd.DataFrame({
    "train_feat_shape":[df_tr.shape],
    "test_feat_shape":[df_te.shape],
    "meta_file":[str(PROCESSED_DIR / "hp_clean_meta_v02.json")]
})

Unnamed: 0,train_feat_shape,test_feat_shape,meta_file
0,"(1458, 117)","(1459, 117)",../data/processed/hp_clean_meta_v02.json


# Section B

In [1]:
# 1) Setup: paths, load v01 features/meta/folds
import numpy as np, pandas as pd, json
from pathlib import Path
from sklearn.cluster import KMeans
import warnings; warnings.filterwarnings("ignore")

DATA_DIR = Path("../data/")
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

df_tr = pd.read_parquet(PROCESSED_DIR / "hp_train_feat_v01.parquet")
df_te = pd.read_parquet(PROCESSED_DIR / "hp_test_feat_v01.parquet")

with open(PROCESSED_DIR / "hp_clean_meta_v02.json","r") as f:
    meta = json.load(f)

folds = pd.read_csv(PROCESSED_DIR / "cv_folds_selected.csv")
fold = folds["fold"].values

id_col = "Id"
target_col = "SalePrice"

pd.DataFrame({"train_shape":[df_tr.shape], "test_shape":[df_te.shape]})

Unnamed: 0,train_shape,test_shape
0,"(1458, 117)","(1459, 117)"


In [2]:
# 2) Ensure base columns exist + helpful composites (TotalSF)
# TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF (add if missing)
for df in (df_tr, df_te):
    if "TotalSF" not in df.columns:
        cols = [c for c in ["TotalBsmtSF","1stFlrSF","2ndFlrSF"] if c in df.columns]
        df["TotalSF"] = df[cols].sum(axis=1)

# Keep a copy of y on log scale for CV-safe encodings
y_log = np.log1p(df_tr[target_col])

pd.Series({"has_TotalSF": "TotalSF" in df_tr.columns, "y_log_ready": True})

has_TotalSF    True
y_log_ready    True
dtype: bool

In [4]:
# 3) Neighborhood target mean encoding (CV-safe for TRAIN; full-train map for TEST)
# Column to create: Nbhd_TgtMean_Log_v2
# Fallback to global mean if a neighborhood is unseen.

nbhd_col = "Neighborhood"
y_log = np.log1p(df_tr[target_col])
global_mean = float(y_log.mean())

# Preallocate OOF array, fill per-fold, then assign once (avoids LossySetitemError)
oof_nbhd_mean = np.full(len(df_tr), np.nan, dtype=np.float64)

for k in np.unique(fold):
    tr_idx = np.where(fold != k)[0]
    va_idx = np.where(fold == k)[0]

    # map from train part of the fold
    map_k = (
        df_tr.iloc[tr_idx]
            .groupby(nbhd_col)[target_col]
            .apply(lambda s: np.log1p(s).mean())
    )

    mapped = (
        df_tr.loc[va_idx, nbhd_col]
            .map(map_k)
            .astype("float64")          # ensure float, allow NaN
            .to_numpy()
    )
    oof_nbhd_mean[va_idx] = mapped

# fill unseen neighborhoods with global mean and assign once
oof_nbhd_mean = np.where(np.isnan(oof_nbhd_mean), global_mean, oof_nbhd_mean)
df_tr = df_tr.copy()  # be explicit under CoW
df_tr["Nbhd_TgtMean_Log_v2"] = oof_nbhd_mean

# TEST side: full-train map (no leakage; test has no labels)
map_full = (
    df_tr.groupby(nbhd_col)[target_col]
        .apply(lambda s: np.log1p(s).mean())
)
df_te["Nbhd_TgtMean_Log_v2"] = (
    df_te[nbhd_col].map(map_full).astype("float64").fillna(global_mean)
)

pd.Series({
    "train_nulls": int(np.isnan(oof_nbhd_mean).sum()),
    "test_nulls": int(df_te["Nbhd_TgtMean_Log_v2"].isna().sum()),
    "global_mean_log": global_mean,
})

train_nulls         0.000000
test_nulls          0.000000
global_mean_log    12.024015
dtype: float64

In [5]:
# 4) Interaction features using the neighborhood premium (numeric)
# - Qual_x_NbhdPrem_v2 = OverallQual * Nbhd_TgtMean_Log_v2
# - LogTotalSF_x_NbhdPrem_v2 = log1p(TotalSF) * Nbhd_TgtMean_Log_v2

for df in (df_tr, df_te):
    df["TotalSF_log1p_v2"] = np.log1p(df["TotalSF"])
    df["Qual_x_NbhdPrem_v2"] = df["OverallQual"] * df["Nbhd_TgtMean_Log_v2"]
    df["LogTotalSF_x_NbhdPrem_v2"] = df["TotalSF_log1p_v2"] * df["Nbhd_TgtMean_Log_v2"]

pd.DataFrame({
    "new_numeric":[["TotalSF_log1p_v2","Qual_x_NbhdPrem_v2","LogTotalSF_x_NbhdPrem_v2"]]
})

Unnamed: 0,new_numeric
0,"[TotalSF_log1p_v2, Qual_x_NbhdPrem_v2, LogTota..."


In [7]:
# 5) Relative-to-neighborhood features (robust to categorical dtypes)
# Build medians on TRAIN (feature-only stats)
med_q  = df_tr.groupby("Neighborhood")["OverallQual"].median().astype("float64")
med_sf = df_tr.groupby("Neighborhood")["TotalSF"].median().astype("float64")

for df in (df_tr, df_te):
    # map medians and cast to float
    den_q  = df["Neighborhood"].map(med_q).astype("float64").replace(0.0, np.nan)
    den_sf = df["Neighborhood"].map(med_sf).astype("float64").replace(0.0, np.nan)

    # cast numerators to float in case anything is category/int extension dtype
    num_q  = pd.to_numeric(df["OverallQual"], errors="coerce").astype("float64")
    num_sf = pd.to_numeric(df["TotalSF"],     errors="coerce").astype("float64")

    # compute ratios
    df["OverallQual_relative_v2"] = num_q  / den_q
    df["TotalSF_relative_v2"]     = num_sf / den_sf

# clean infinities / missing (rare neighborhoods) → neutral value 1.0
for df in (df_tr, df_te):
    df[["OverallQual_relative_v2","TotalSF_relative_v2"]] = (
        df[["OverallQual_relative_v2","TotalSF_relative_v2"]]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(1.0)
        .astype("float64")
    )

pd.DataFrame({
    "qual_rel_mean":[df_tr["OverallQual_relative_v2"].mean()],
    "sf_rel_mean":[df_tr["TotalSF_relative_v2"].mean()]
})

Unnamed: 0,qual_rel_mean,sf_rel_mean
0,1.021616,1.018291


In [8]:
# 6) Categorical interactions
# - Nbhd_Qual_cat_v2 = f"{Neighborhood}_{OverallQual}"
# - YearBuiltDecade_v2 = floor(YearBuilt/10)*10
# - Nbhd_Decade_cat_v2 = f"{Neighborhood}_{YearBuiltDecade_v2}"

for df in (df_tr, df_te):
    df["Nbhd_Qual_cat_v2"] = df[nbhd_col].astype(str) + "_" + df["OverallQual"].astype(str)
    df["YearBuiltDecade_v2"] = (df["YearBuilt"] // 10) * 10
    df["Nbhd_Decade_cat_v2"] = df[nbhd_col].astype(str) + "_" + df["YearBuiltDecade_v2"].astype(str)

# (optional) cast to 'category' now; CSV will not preserve dtype but helps in memory during work
for df in (df_tr, df_te):
    df["Nbhd_Qual_cat_v2"]   = df["Nbhd_Qual_cat_v2"].astype("category")
    df["Nbhd_Decade_cat_v2"] = df["Nbhd_Decade_cat_v2"].astype("category")

pd.DataFrame({
    "n_levels_Nbhd_Qual":[df_tr["Nbhd_Qual_cat_v2"].nunique()],
    "n_levels_Nbhd_Decade":[df_tr["Nbhd_Decade_cat_v2"].nunique()]
})

Unnamed: 0,n_levels_Nbhd_Qual,n_levels_Nbhd_Decade
0,112,122


In [9]:
# 7) Neighborhood clusters (feature-only stats; safe)
# Cluster neighborhoods by (median OverallQual, median TotalSF, median YearBuilt)
feat_nbhd = (
    df_tr.groupby(nbhd_col)
    .agg(med_qual=("OverallQual","median"),
         med_sf=("TotalSF","median"),
         med_year=("YearBuilt","median"))
    .reset_index()
)

kmeans = KMeans(n_clusters=4, random_state=42, n_init="auto")
feat_nbhd["NbhdCluster4_v2"] = kmeans.fit_predict(feat_nbhd[["med_qual","med_sf","med_year"]])

cluster_map = dict(zip(feat_nbhd[nbhd_col], feat_nbhd["NbhdCluster4_v2"]))
for df in (df_tr, df_te):
    df["NbhdCluster4_v2"] = df[nbhd_col].map(cluster_map).astype("Int64")

pd.DataFrame({"clusters":[feat_nbhd["NbhdCluster4_v2"].value_counts().to_dict()]})

Unnamed: 0,clusters
0,"{1: 7, 2: 7, 0: 6, 3: 4}"


In [10]:
# 8) Luxury flags
lux_set = {"StoneBr","NoRidge","NridgHt","Veenker","ClearCr"}
for df in (df_tr, df_te):
    df["is_luxury_nbhd_v2"] = df[nbhd_col].isin(lux_set).astype("int8")
    df["is_luxury_v2"] = ((df["OverallQual"] >= 9) | df[nbhd_col].isin(lux_set)).astype("int8")

pd.DataFrame({
    "lux_nbhd_rate":[df_tr["is_luxury_nbhd_v2"].mean()],
    "lux_rate":[df_tr["is_luxury_v2"].mean()]
})

Unnamed: 0,lux_nbhd_rate,lux_rate
0,0.124829,0.135117


In [11]:
# 9) Extra log transforms for skewed size features (if not already present)
# Create *_log1p_v2 to avoid collisions
for col in ["LotArea","GrLivArea","BsmtFinSF1"]:
    if col in df_tr.columns:
        for df in (df_tr, df_te):
            df[f"{col}_log1p_v2"] = np.log1p(df[col])

new_logs = [c for c in df_tr.columns if c.endswith("_log1p_v2")]
pd.Series(new_logs)

0       TotalSF_log1p_v2
1       LotArea_log1p_v2
2     GrLivArea_log1p_v2
3    BsmtFinSF1_log1p_v2
dtype: object

In [12]:
# 10) Assemble lists of newly engineered features for metadata
engineered_cont_v2 = [
    "Nbhd_TgtMean_Log_v2",
    "TotalSF_log1p_v2",
    "Qual_x_NbhdPrem_v2",
    "LogTotalSF_x_NbhdPrem_v2",
    "OverallQual_relative_v2",
    "TotalSF_relative_v2",
] + new_logs

engineered_nominal_v2 = [
    "Nbhd_Qual_cat_v2",
    "Nbhd_Decade_cat_v2",
    "NbhdCluster4_v2",
]

engineered_binary_v2 = [
    "is_luxury_nbhd_v2",
    "is_luxury_v2",
]

pd.DataFrame({
    "engineered_cont_v2":[engineered_cont_v2],
    "engineered_nominal_v2":[engineered_nominal_v2],
    "engineered_binary_v2":[engineered_binary_v2],
})

Unnamed: 0,engineered_cont_v2,engineered_nominal_v2,engineered_binary_v2
0,"[Nbhd_TgtMean_Log_v2, TotalSF_log1p_v2, Qual_x...","[Nbhd_Qual_cat_v2, Nbhd_Decade_cat_v2, NbhdClu...","[is_luxury_nbhd_v2, is_luxury_v2]"


In [13]:
# 11) Save v2 CSVs (train/test) with the new features appended
train_v2_path = PROCESSED_DIR / "hp_train_feat_v02.csv"
test_v2_path  = PROCESSED_DIR / "hp_test_feat_v02.csv"

df_tr.to_csv(train_v2_path, index=False)
df_te.to_csv(test_v2_path, index=False)

pd.DataFrame({"saved_train_v2":[str(train_v2_path)], "saved_test_v2":[str(test_v2_path)]})

Unnamed: 0,saved_train_v2,saved_test_v2
0,../data/processed/hp_train_feat_v02.csv,../data/processed/hp_test_feat_v02.csv


In [14]:
# 12) Update metadata → hp_clean_meta_v03.json (append v2 features)
meta_v03 = dict(meta)  # copy previous
meta_v03["engineered_continuous"] = sorted(list(set(meta.get("engineered_continuous", []) + engineered_cont_v2)))
meta_v03["engineered_nominal"]    = sorted(list(set(meta.get("engineered_nominal", [])    + engineered_nominal_v2)))
meta_v03["engineered_binary"]     = sorted(list(set(meta.get("engineered_binary", [])     + engineered_binary_v2)))
meta_v03["feature_version"]       = "v02"
meta_v03["folds_file"]            = str(PROCESSED_DIR / "cv_folds_selected.csv")
meta_v03_path = PROCESSED_DIR / "hp_clean_meta_v03.json"

with open(meta_v03_path, "w") as f:
    json.dump(meta_v03, f, indent=2)

pd.Series({"saved_meta": str(meta_v03_path)})

saved_meta    ../data/processed/hp_clean_meta_v03.json
dtype: object