In [363]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Small constant added to denominators in the ratio features built below.
eps = 1e-3

# Input files
train_path = "train_cleaned.csv"
test_path  = "test_cleaned.csv"

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Feature engineering happens on copies; `train` / `test` keep the columns as loaded.
train_fe, test_fe = train.copy(), test.copy()

print("Train shape:", train_fe.shape)
print("Test shape :", test_fe.shape)

# Both inputs must arrive without missing values.
for frame in (train_fe, test_fe):
    assert frame.isna().sum().sum() == 0

Train shape: (2736, 179)
Test shape : (20, 178)


 **Purpose**

1. **Load cleaned datasets**
   - Read `train_cleaned.csv` and `test_cleaned.csv` into Pandas DataFrames  
   - Create copies `train_fe` and `test_fe` to use for feature engineering

2. **Check dataset shapes**
   - Print the number of rows and columns for train and test sets  
   - Ensures the datasets are loaded correctly

3. **Verify there are no missing values**
   - `assert train_fe.isna().sum().sum() == 0`  
   - `assert test_fe.isna().sum().sum() == 0`  
   - Confirms that all missing values have been handled before feature engineering


In [364]:
age_col = "Basic_Demos-Age"
sex_col = "Basic_Demos-Sex"

# Bin edges and the integer code assigned to each bin:
# [0, 7] -> 0, (7, 10] -> 1, (10, 13] -> 2, (13, 16] -> 3, (16, 20] -> 4.
age_bins = [0, 7, 10, 13, 16, 20]
age_labels = [0, 1, 2, 3, 4]

for df in (train_fe, test_fe):
    # Polynomial age terms.
    df["Age_sq"] = df[age_col] ** 2
    df["Age_cu"] = df[age_col] ** 3

    # pd.cut returns NaN for values outside the outer bin edges, and .astype(int)
    # then raises. Clamp ages to the edges so out-of-range ages land in the first
    # or last group; include_lowest keeps an age equal to the lowest edge in group 0.
    # Ages strictly inside (0, 20] are grouped exactly as before.
    df["Age_group"] = pd.cut(
        df[age_col].clip(lower=age_bins[0], upper=age_bins[-1]),
        bins=age_bins,
        labels=age_labels,
        include_lowest=True,
    ).astype(int)


**Purpose**

1. **Create polynomial features for age**
   - `Age_sq` = age squared  
   - `Age_cu` = age cubed  
   - Captures potential **non-linear relationships** between age and target variables

2. **Create age groups**
   - Define age bins: `[0, 7, 10, 13, 16, 20]`  
   - Assign labels `[0, 1, 2, 3, 4]` corresponding to age ranges  
   - `pd.cut(...).astype(int)` converts continuous age into discrete **age categories**


In [365]:
bmi_col = "Physical-BMI"

# Squared BMI term, added to both splits.
for frame in (train_fe, test_fe):
    frame["BMI_sq"] = frame[bmi_col] ** 2

def bmi_cat(x):
    """Map a BMI value to an ordinal code.

    Returns 0 for x < 18.5, 1 for 18.5 <= x < 25, 2 for 25 <= x < 30 and 3 for
    everything else (any value that fails all three comparisons, NaN included).
    """
    for code, upper in enumerate((18.5, 25, 30)):
        if x < upper:
            return code
    return 3

# Code the BMI of every row in both splits with bmi_cat.
for frame in (train_fe, test_fe):
    frame["BMI_category"] = frame[bmi_col].apply(bmi_cat)


**Purpose**

1. **Create polynomial feature for BMI**
   - `BMI_sq` = BMI squared  
   - Captures potential **non-linear relationships** between BMI and the target

2. **Create BMI categories**
   - Function `bmi_cat` defines categories:  
     - `< 18.5` → underweight (0)  
     - `18.5–24.9` → normal weight (1)  
     - `25–29.9` → overweight (2)  
     - `>= 30` → obese (3)  
   - Converts continuous BMI into discrete **BMI categories**



In [366]:
h_col, w_col = "Physical-Height", "Physical-Weight"

# Grouping used for the per-group statistics: sex x age group.
group_cols = [sex_col, "Age_group"]

# Per-group mean/std tables, computed from the training split only.
by_group = train_fe.groupby(group_cols)
height_stats = by_group[h_col].agg(["mean", "std"])
weight_stats = by_group[w_col].agg(["mean", "std"])

# Training-split-wide mean/std for each measurement.
h_global_mean = train_fe[h_col].mean()
h_global_std = train_fe[h_col].std()
w_global_mean = train_fe[w_col].mean()
w_global_std = train_fe[w_col].std()

def add_hw_z(df):
    """Add `Height_for_age_z` and `Weight_for_age_z` columns to `df` in place.

    Each row's height and weight are standardised with the mean/std of its
    (sex, Age_group) group taken from `height_stats` / `weight_stats`. Rows
    whose group is absent from those tables use the global train mean/std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the `sex_col`, "Age_group", `h_col` and `w_col` columns.

    Returns
    -------
    None
        `df` is modified in place.
    """
    HZ, WZ = [], []
    for sex, ag, h, w in zip(df[sex_col], df["Age_group"], df[h_col], df[w_col]):
        if (sex, ag) in height_stats.index:
            hm, hs = height_stats.loc[(sex, ag)]
        else:
            hm, hs = h_global_mean, h_global_std
        if (sex, ag) in weight_stats.index:
            wm, ws = weight_stats.loc[(sex, ag)]
        else:
            wm, ws = w_global_mean, w_global_std

        # A group std of 0 (all values equal) or NaN (a single-row group has no
        # sample std) cannot be used as a divisor. `std > 0` is False for both,
        # whereas a plain truthiness test lets NaN through, so use the global std.
        HZ.append((h - hm) / (hs if hs > 0 else h_global_std))
        WZ.append((w - wm) / (ws if ws > 0 else w_global_std))

    df["Height_for_age_z"] = HZ
    df["Weight_for_age_z"] = WZ

# Add the height/weight z-score columns to the training split, then the test split.
for frame in (train_fe, test_fe):
    add_hw_z(frame)


**Purpose**

1. **Compute group-specific statistics**
   - Group by `sex` and `Age_group`  
   - Compute **mean** and **standard deviation** for Height (`Physical-Height`) and Weight (`Physical-Weight`)  
   - Also compute **global mean and std** as fallback

2. **Create height- and weight-for-age z-scores**
   - For each row, calculate:  
     - `Height_for_age_z = (height - group_mean) / group_std`  
     - `Weight_for_age_z = (weight - group_mean) / group_std`  
   - If group statistics are missing, use global mean and std  
   - Standardizes measurements within **age and sex groups**


In [367]:
sys_col = "Physical-Systolic_BP"
dia_col = "Physical-Diastolic_BP"

for frame in (train_fe, test_fe):
    systolic, diastolic = frame[sys_col], frame[dia_col]
    # Pulse pressure: systolic minus diastolic.
    frame["BP_pulse_pressure"] = systolic - diastolic
    # Mean arterial pressure approximation: diastolic plus one third of the pulse pressure.
    frame["BP_mean_arterial"] = diastolic + frame["BP_pulse_pressure"] / 3

def bp_cat(sys, dia):
    """Classify a systolic/diastolic reading into an ordinal category.

    Returns
    -------
    int
        0 - normal (sys < 120 and dia < 80), and any reading no other rule matches
        1 - elevated (120 <= sys < 130 and dia < 80)
        2 - stage 1 (130 <= sys < 140 or 80 <= dia < 90)
        3 - stage 2 (sys >= 140 or dia >= 90)

    The most severe category is tested first. Checking stage 1 before stage 2
    would let a reading such as 150/85 or 135/95 stop at category 2 even though
    one of its components is already in the stage 2 range.
    """
    if sys >= 140 or dia >= 90:
        return 3
    # Stage 2 is excluded above, so this is 130 <= sys < 140 or 80 <= dia < 90.
    if sys >= 130 or dia >= 80:
        return 2
    if sys >= 120 and dia < 80:
        return 1
    # Normal readings, plus inputs whose comparisons are all False (e.g. NaN).
    return 0

# Row-wise blood-pressure category for both splits.
for frame in (train_fe, test_fe):
    frame["BP_category"] = [
        bp_cat(systolic, diastolic)
        for systolic, diastolic in zip(frame[sys_col], frame[dia_col])
    ]


**Purpose**

1. **Compute derived blood pressure features**
   - `BP_pulse_pressure = Systolic_BP - Diastolic_BP` → measures the difference between systolic and diastolic pressure  
   - `BP_mean_arterial = Diastolic_BP + BP_pulse_pressure / 3` → approximate mean arterial pressure

2. **Create blood pressure categories**
   - Function `bp_cat(sys, dia)` categorizes BP based on **standard clinical thresholds**:  
     - 0: Normal (<120 / <80)  
     - 1: Elevated (120–129 / <80)  
     - 2: Hypertension stage 1 (systolic 130–139 **or** diastolic 80–89)  
     - 3: Hypertension stage 2 (systolic ≥140 **or** diastolic ≥90)  



In [368]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 189)
(20, 188)
0
0


In [369]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 189)
(20, 188)
0
0


In [370]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 189)
(20, 188)
0
0


In [371]:
# Features derived from the relative_date_PCIAT summary columns.
for frame in (train_fe, test_fe):
    date_max = frame["relative_date_PCIAT_max"]
    frame["pciat_span"]    = date_max - frame["relative_date_PCIAT_min"]
    frame["pciat_iqr"]     = frame["relative_date_PCIAT_75%"] - frame["relative_date_PCIAT_25%"]
    frame["pciat_recency"] = date_max


**Purpose**

  **Create derived PCIAT features**
   - `pciat_span` = `relative_date_PCIAT_max - relative_date_PCIAT_min`  
     - Measures the total time span of PCIAT activity  
   - `pciat_iqr` = `relative_date_PCIAT_75% - relative_date_PCIAT_25%`  
     - Measures the interquartile range of activity dates, capturing variability  
   - `pciat_recency` = `relative_date_PCIAT_max`  
     - Records the most recent activity, indicating recency


In [None]:
# Checkpoint after the PCIAT features: shapes, then missing-value totals.
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 192)
(20, 191)
0
0


In [373]:
for frame in (train_fe, test_fe):

    # Coefficient of variation per signal: std / (|mean| + eps).
    for cv_name, std_name, mean_name in (
        ("enmo_cv", "enmo_std", "enmo_mean"),
        ("light_cv", "light_std", "light_mean"),
        ("anglez_cv", "anglez_std", "anglez_mean"),
    ):
        frame[cv_name] = frame[std_name] / (frame[mean_name].abs() + eps)

    frame["enmo_activity_volume"] = frame["enmo_mean"] * frame["enmo_count"]

    # Active share = light + moderate; eps keeps the ratio finite when it is 0.
    frame["active_por"] = frame["light_por"] + frame["moderate_por"]
    frame["sedentary_to_active_ratio"] = frame["sedentary_por"] / (frame["active_por"] + eps)

    # Plain copies of the proportion columns under new names.
    frame["sedentary_percentage"] = frame["sedentary_por"]
    frame["moderate_vigorous_percentage"] = frame["moderate_por"]

    # Scale the moderate proportion by the number of minutes in a day.
    frame["mvpa_minutes_estimate"] = frame["moderate_por"] * 24 * 60


**Purpose**

1. **Compute coefficient of variation (CV) for Actigraphy signals**
   - `enmo_cv = enmo_std / |enmo_mean|`  
   - `light_cv = light_std / |light_mean|`  
   - `anglez_cv = anglez_std / |anglez_mean|`  
   - Captures **relative variability** of the signals; the code adds a small constant `eps` to each denominator to avoid division by zero  

2. **Compute activity volume**
   - `enmo_activity_volume = enmo_mean * enmo_count`  
   - Quantifies **overall movement intensity**  

3. **Summarize activity proportions**
   - `active_por = light_por + moderate_por` → total active time proportion  
   - `sedentary_to_active_ratio = sedentary_por / active_por` → balance of sedentary vs active time  
   - `sedentary_percentage = sedentary_por`  
   - `moderate_vigorous_percentage = moderate_por`  

4. **Estimate MVPA (Moderate-to-Vigorous Physical Activity)**
   - `mvpa_minutes_estimate = moderate_por * 24 * 60`  
   - Converts proportion to **estimated minutes per day**  



In [None]:
sds_t = "SDS-SDS_Total_T"
cgas  = "CGAS-CGAS_Score"

# Log transform. np.log yields -inf for a score of 0 (which the isna() checkpoints
# in this notebook do not count) and NaN for a negative score, so the score is
# floored at eps first. Scores >= eps are transformed exactly as before.
for df in (train_fe, test_fe):
    df["SDS_T_log"] = np.log(df[sds_t].clip(lower=eps))

# Z-scores: mean/std come from the training split only and are applied to both.
sds_mean, sds_std = train_fe[sds_t].mean(), train_fe[sds_t].std()
cgas_mean, cgas_std = train_fe[cgas].mean(), train_fe[cgas].std()

for df in (train_fe, test_fe):
    df["SDS_T_z"] = (df[sds_t] - sds_mean) / sds_std
    df["CGAS_z"] = (df[cgas] - cgas_mean) / cgas_std

# SDS severity
def sds_sev(x):
    """Return the SDS severity code for score `x`.

    0 for x < 50, 1 for 50 <= x < 60, 2 for 60 <= x < 70, otherwise 3
    (which also covers values that fail every comparison, such as NaN).
    """
    return 0 if x < 50 else 1 if x < 60 else 2 if x < 70 else 3

# Code every SDS score in both splits with sds_sev.
for frame in (train_fe, test_fe):
    frame["SDS_severity"] = frame[sds_t].apply(sds_sev)

# CGAS impairment
def cgas_imp(x):
    """Return the CGAS impairment code for score `x`.

    0 for x >= 80, 1 for 70 <= x < 80, 2 for 60 <= x < 70, otherwise 3
    (which also covers values that fail every comparison, such as NaN).
    """
    for level, floor in enumerate((80, 70, 60)):
        if x >= floor:
            return level
    return 3

# Code every CGAS score in both splits with cgas_imp.
for frame in (train_fe, test_fe):
    frame["CGAS_impairment"] = frame[cgas].apply(cgas_imp)

# SDS–CGAS interactions: raw-score ratio (eps guards the denominator) and
# the difference between the two z-scores.
for frame in (train_fe, test_fe):
    cgas_denominator = frame[cgas] + eps
    frame["SDS_CGAS_ratio"]  = frame[sds_t] / cgas_denominator
    frame["SDS_CGAS_diff_z"] = frame["SDS_T_z"] - frame["CGAS_z"]


**Purpose**

1. **Log-transform SDS Total Score**
   - `SDS_T_log = log(SDS-SDS_Total_T)`  
   - Reduces skewness and stabilizes variance of SDS scores  

2. **Compute Z-scores (train-only statistics)**
   - `SDS_T_z = (SDS - mean) / std`  
   - `CGAS_z = (CGAS - mean) / std`  
   - Standardizes scores for comparability across features  

3. **Create categorical severity/impairment features**
   - `SDS_severity`: discretizes SDS score into 4 severity levels (0–3)  
   - `CGAS_impairment`: discretizes CGAS score into 4 impairment levels (0–3)  

4. **Create SDS–CGAS interaction features**
   - `SDS_CGAS_ratio = SDS / CGAS` → captures relative imbalance  
   - `SDS_CGAS_diff_z = SDS_z - CGAS_z` → captures standardized difference  



In [375]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 208)
(20, 207)
0
0


In [None]:
inet = "PreInt_EduHx-computerinternet_hoursday"

def inet_risk(x):
    """Bin an internet-use value into a code from 0 to 3.

    0 when x == 0, 1 when x <= 1 (and not 0), 2 when x <= 2, otherwise 3
    (which also covers values that fail every comparison, such as NaN).
    """
    if x == 0:
        bin_code = 0
    elif x <= 1:
        bin_code = 1
    elif x <= 2:
        bin_code = 2
    else:
        bin_code = 3
    return bin_code

for frame in (train_fe, test_fe):
    hours = frame[inet]
    # Binned internet use.
    frame["Internet_risk_bin"] = hours.apply(inet_risk)
    # Interactions with the actigraphy proportions (eps guards the denominator).
    frame["Internet_sedentary_interaction"] = hours * frame["sedentary_por"]
    frame["Screen_vs_activity_ratio"] = hours / (frame["active_por"] + eps)


**Purpose**

1. **Discretize internet use into risk bins**
   - Function `inet_risk` maps hours/day to 4 bins:  
     - 0 hours → 0  
     - ≤1 hour → 1  
     - ≤2 hours → 2  
     - >2 hours → 3  
   - Creates `Internet_risk_bin` feature to categorize low vs high screen time  

2. **Create interaction features with physical activity**
   - `Internet_sedentary_interaction = internet_hours * sedentary_por`  
     - Captures combined effect of screen time and sedentary behavior  
   - `Screen_vs_activity_ratio = internet_hours / active_por`  
     - Measures relative balance between screen time and active time  


In [377]:
mins = "Fitness_Endurance-Time_Mins"
secs = "Fitness_Endurance-Time_Sec"
stage = "Fitness_Endurance-Max_Stage"

# Minutes and seconds combined into one duration in seconds.
for frame in (train_fe, test_fe):
    frame["Endurance_total_secs"] = frame[mins] * 60 + frame[secs]

# Standardisation constants from the training split only.
train_total_secs = train_fe["Endurance_total_secs"]
end_mean = train_total_secs.mean()
end_std = train_total_secs.std()
stage_mean = train_fe[stage].mean()
stage_std = train_fe[stage].std()

for frame in (train_fe, test_fe):
    frame["Endurance_secs_z"] = (frame["Endurance_total_secs"] - end_mean) / end_std
    frame["Endurance_stage_z"] = (frame[stage] - stage_mean) / stage_std


**Purpose**

1. **Compute total endurance time in seconds**
   - `Endurance_total_secs = Fitness_Endurance-Time_Mins * 60 + Fitness_Endurance-Time_Sec`  
   - Combines minutes and seconds into a single numeric measure  

2. **Standardize endurance features (train-only statistics)**
   - `Endurance_secs_z = (total_secs - mean) / std`  
   - `Endurance_stage_z = (Max_Stage - mean) / std`  
   - Creates z-scores for comparability across participants  



In [378]:
# Columns with the "FGC-FGC_" prefix, split into zone columns and score columns.
fgc_cols = [name for name in train.columns if name.startswith("FGC-FGC_")]
fgc_scores = [name for name in fgc_cols if "_Zone" not in name and "Season" not in name]
fgc_zones = [name for name in fgc_cols if name.endswith("_Zone")]

for frame in (train_fe, test_fe):
    zones = frame[fgc_zones]
    frame["FGC_total_score"] = frame[fgc_scores].sum(axis=1)
    frame["FGC_zone_mean"]   = zones.mean(axis=1)
    frame["FGC_zone_max"]    = zones.max(axis=1)


**Purpose**

1. **Select FGC features**
   - `fgc_scores`: all FGC score columns excluding zones and season columns  
   - `fgc_zones`: all FGC zone-specific columns  

2. **Compute summary features for FGC**
   - `FGC_total_score = sum of all FGC scores` → overall FGC performance  
   - `FGC_zone_mean = mean of all FGC zones` → average zone performance  
   - `FGC_zone_max = max of all FGC zones` → highest zone score  



In [379]:
# Products of BMI with the endurance duration and the FGC total.
for frame in (train_fe, test_fe):
    bmi = frame[bmi_col]
    frame["BMI_x_Endurance_secs"] = bmi * frame["Endurance_total_secs"]
    frame["BMI_x_FGC_total"]      = bmi * frame["FGC_total_score"]


**Purpose**

 **Create interaction features**
   - `BMI_x_Endurance_secs = BMI * Endurance_total_secs` → captures combined effect of body composition and endurance  
   - `BMI_x_FGC_total = BMI * FGC_total_score` → captures interaction between body composition and FGC performance  


In [380]:
fat, ffm, smm = "BIA-BIA_Fat", "BIA-BIA_FFM", "BIA-BIA_SMM"
w = "Physical-Weight"

# Fat divided by each reference quantity; eps guards the denominators.
for frame in (train_fe, test_fe):
    fat_mass = frame[fat]
    for ratio_name, denom_col in (
        ("Fat_to_FFM_ratio", ffm),
        ("Fat_to_SMM_ratio", smm),
        ("Fat_percent_of_weight", w),
    ):
        frame[ratio_name] = fat_mass / (frame[denom_col] + eps)


**Purpose**

 **Create body composition ratios**
   - `Fat_to_FFM_ratio = BIA_Fat / BIA_FFM` → proportion of fat relative to fat-free mass  
   - `Fat_to_SMM_ratio = BIA_Fat / BIA_SMM` → proportion of fat relative to skeletal muscle mass  
   - `Fat_percent_of_weight = BIA_Fat / Weight` → percentage of total body weight that is fat  



In [381]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 222)
(20, 221)
0
0


In [382]:
# Per (sex, age group) mean/std of the fat column, from the training split only.
fat_stats = train_fe.groupby(group_cols)[fat].agg(["mean","std"])

# Training-split-wide mean/std of the fat column.
fat_global_mean = train_fe[fat].mean()
fat_global_std = train_fe[fat].std()

def add_fat_z(df):
    """Add a `Fat_for_age_z` column to `df` in place.

    Each row's fat value is standardised with the mean/std of its
    (sex, Age_group) group taken from `fat_stats`. Rows whose group is absent
    from that table use the global train mean/std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the `sex_col`, "Age_group" and `fat` columns.

    Returns
    -------
    None
        `df` is modified in place.
    """
    Z = []
    for sex, ag, f in zip(df[sex_col], df["Age_group"], df[fat]):
        if (sex, ag) in fat_stats.index:
            fm, fs = fat_stats.loc[(sex, ag)]
        else:
            fm, fs = fat_global_mean, fat_global_std
        # A group std of 0 or NaN (a single-row group has no sample std) cannot
        # be used as a divisor. `fs > 0` is False for both, whereas a plain
        # truthiness test lets NaN through, so use the global std instead.
        Z.append((f - fm) / (fs if fs > 0 else fat_global_std))
    df["Fat_for_age_z"] = Z

# Add the fat z-score column to the training split, then the test split.
for frame in (train_fe, test_fe):
    add_fat_z(frame)


**Purpose**

 **Compute age- and sex-adjusted fat z-scores**
   - Calculate mean and standard deviation of `BIA_Fat` for each combination of **sex** and **Age_group**  
   - `Fat_for_age_z = (fat_value - group_mean) / group_std`  
   - Uses **global mean/std** if a group combination does not exist  


In [383]:
paqA = "PAQ_A-PAQ_A_Total"
paqC = "PAQ_C-PAQ_C_Total"

# Standardisation constants from the training split only.
paqA_mean = train_fe[paqA].mean()
paqA_std = train_fe[paqA].std()
paqC_mean = train_fe[paqC].mean()
paqC_std = train_fe[paqC].std()
mvpa_mean = train_fe["moderate_por"].mean()
mvpa_std = train_fe["moderate_por"].std()

for frame in (train_fe, test_fe):
    frame["PAQ_A_z"] = (frame[paqA] - paqA_mean) / paqA_std
    frame["PAQ_C_z"] = (frame[paqC] - paqC_mean) / paqC_std
    # Combined score: mean of the two PAQ totals.
    frame["PAQ_Total"] = (frame[paqA] + frame[paqC]) / 2

# PAQ_Total has to exist on the training split before its statistics can be taken.
train_paq_total = train_fe["PAQ_Total"]
paqT_mean = train_paq_total.mean()
paqT_std = train_paq_total.std()

for frame in (train_fe, test_fe):
    frame["PAQ_Total_z"] = (frame["PAQ_Total"] - paqT_mean) / paqT_std
    frame["MVPA_z"]      = (frame["moderate_por"] - mvpa_mean) / mvpa_std
    # Difference between the questionnaire z-score and the moderate_por z-score.
    frame["PAQ_MVPA_gap"] = frame["PAQ_Total_z"] - frame["MVPA_z"]


**Purpose**

1. **Standardize PAQ scores**
   - `PAQ_A_z = (PAQ_A_Total - mean) / std` → z-score for PAQ_A  
   - `PAQ_C_z = (PAQ_C_Total - mean) / std` → z-score for PAQ_C  
   - `PAQ_Total = (PAQ_A + PAQ_C) / 2` → combined PAQ score  
   - `PAQ_Total_z = standardized PAQ_Total`  

2. **Standardize MVPA (moderate physical activity)**
   - `MVPA_z = (moderate_por - mean) / std` → z-score for moderate physical activity  

3. **Compute PAQ-MVPA gap**
   - `PAQ_MVPA_gap = PAQ_Total_z - MVPA_z` → measures discrepancy between self-reported activity (PAQ) and measured activity (actigraphy)  



In [384]:
# Standardisation constants from the training split only.
bmi_mean = train_fe[bmi_col].mean()
bmi_std = train_fe[bmi_col].std()
sys_mean = train_fe[sys_col].mean()
sys_std = train_fe[sys_col].std()
dia_mean = train_fe[dia_col].mean()
dia_std = train_fe[dia_col].std()

for frame in (train_fe, test_fe):
    for z_name, source_col, center, scale in (
        ("BMI_z", bmi_col, bmi_mean, bmi_std),
        ("BP_sys_z", sys_col, sys_mean, sys_std),
        ("BP_dia_z", dia_col, dia_mean, dia_std),
    ):
        frame[z_name] = (frame[source_col] - center) / scale


**Purpose**

 **Standardize key numeric features**
   - `BMI_z = (BMI - mean) / std` → z-score for Body Mass Index  
   - `BP_sys_z = (Systolic_BP - mean) / std` → z-score for systolic blood pressure  
   - `BP_dia_z = (Diastolic_BP - mean) / std` → z-score for diastolic blood pressure  



In [385]:
# Composite index: the four z-scores below are added in this order and the
# endurance z-score is subtracted last.
for frame in (train_fe, test_fe):
    risk = frame["BMI_z"]
    for z_col in ("BP_sys_z", "BP_dia_z", "Fat_for_age_z"):
        risk = risk + frame[z_col]
    frame["Physical_risk_index"] = risk - frame["Endurance_secs_z"]


**Purpose**

 **Compute a composite physical risk index**
   - Combines **standardized features**:  
     - `BMI_z` → body mass index  
     - `BP_sys_z` → systolic blood pressure  
     - `BP_dia_z` → diastolic blood pressure  
     - `Fat_for_age_z` → fat mass relative to age  
     - `Endurance_secs_z` → standardized endurance (subtracted to reduce risk)  

In [386]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 233)
(20, 232)
0
0


In [387]:
# Thresholds taken from the training split only.
sed_q75 = train_fe["sedentary_por"].quantile(0.75)
mvpa_q25 = train_fe["moderate_por"].quantile(0.25)

for frame in (train_fe, test_fe):
    # Binary 0/1 risk flags.
    frame["risk_internet_high"] = (frame["Internet_risk_bin"] >= 2).astype(int)
    frame["risk_sedentary_high"] = (frame["sedentary_por"] >= sed_q75).astype(int)
    frame["risk_low_mvpa"] = (frame["moderate_por"] <= mvpa_q25).astype(int)
    frame["risk_SDS_high"] = (frame["SDS_severity"] >= 2).astype(int)

    # Composite score: the four flags plus the existing sleep_anomaly column,
    # accumulated left to right.
    score = frame["risk_internet_high"]
    for part in ("risk_sedentary_high", "risk_low_mvpa", "risk_SDS_high", "sleep_anomaly"):
        score = score + frame[part]
    frame["Lifestyle_risk_score"] = score


**Purpose**

1. **Create individual lifestyle risk flags**
   - `risk_internet_high` → high internet usage (`Internet_risk_bin >= 2`)  
   - `risk_sedentary_high` → high sedentary time (`sedentary_por >= 75th percentile`)  
   - `risk_low_mvpa` → low moderate-to-vigorous activity (`moderate_por <= 25th percentile`)  
   - `risk_SDS_high` → high SDS severity (`SDS_severity >= 2`)  

2. **Compute composite lifestyle risk score**
   - `Lifestyle_risk_score` = sum of the above risk flags + `sleep_anomaly`  
   - Represents **overall lifestyle-related health risk**



In [388]:
# Pairwise products: (new column, left factor, right factor).
for frame in (train_fe, test_fe):
    for product_name, left, right in (
        ("SDSxInternet", "SDS_T_z", "Internet_risk_bin"),
        ("SDSxSedentary", "SDS_T_z", "sedentary_por"),
        ("BMIxSedentary", "BMI_z", "sedentary_por"),
        ("BMIxInternet", "BMI_z", "Internet_risk_bin"),
        ("PAQxLifestyle", "PAQ_Total_z", "Lifestyle_risk_score"),
    ):
        frame[product_name] = frame[left] * frame[right]


In [389]:
# Checkpoint: shapes first, then total missing-value counts (train before test).
for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())

(2736, 243)
(20, 242)
0
0


In [390]:
# Original columns (includes target)
# Columns as loaded (the training frame also carries the target "sii").
orig_cols = list(train.columns)

# Columns added during feature engineering, in creation order.
new_cols = [name for name in train_fe.columns if name not in orig_cols]

# Training layout: original columns first, engineered columns after.
final_train_cols = orig_cols + new_cols

# Test layout: the same order without the target.
final_test_cols = [name for name in final_train_cols if name != "sii"]

# Column-only reindex: rows are untouched. A requested column that is absent
# from a frame is created and filled with 0.
train_fe = train_fe.reindex(columns=final_train_cols, fill_value=0)
test_fe  = test_fe.reindex(columns=final_test_cols, fill_value=0)

for frame in (train_fe, test_fe):
    print(frame.shape)
for frame in (train_fe, test_fe):
    print(frame.isna().sum().sum())


# Row counts must match the frames as loaded.
for engineered, loaded in ((train_fe, train), (test_fe, test)):
    assert engineered.shape[0] == loaded.shape[0]

# No missing values may remain.
for frame in (train_fe, test_fe):
    assert frame.isna().sum().sum() == 0


(2736, 243)
(20, 242)
0
0


**Purpose**

1. **Preserve original columns**
   - `orig_cols` stores all columns from the original dataset (including the target `sii`).

2. **Identify newly engineered features**
   - `new_cols` contains all columns created during feature engineering that were not in the original dataset.

3. **Set final column order**
   - `final_train_cols` combines original and new columns for the training dataset.
   - `final_test_cols` mirrors the training columns but excludes the target `sii`.

4. **Reindex datasets safely**
   - Ensures **both train and test datasets** have the same column order.
   - Missing columns (if any) are filled with `0`.
   - Row order is **not changed**.

5. **Sanity checks**
   - Shapes of `train_fe` and `test_fe` match original row counts.
   - No NaNs remain in either dataset after processing.
   - Guarantees **ready-to-use feature matrices** for modeling.


In [391]:
# Write both engineered frames to CSV without the index column.
for frame, out_path in ((train_fe, "train_fed.csv"), (test_fe, "test_fed.csv")):
    frame.to_csv(out_path, index=False)
print("Done.")


Done.
