In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

eps = 1e-3

# Paths
train_path = "train_cleaned.csv"
test_path  = "test_cleaned.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

train_fe = train.copy()
test_fe  = test.copy()

print("Train shape:", train_fe.shape)
print("Test shape :", test_fe.shape)
print("Train NaNs:", train_fe.isna().sum().sum())
print("Test NaNs :", test_fe.isna().sum().sum())

Train shape: (2736, 179)
Test shape : (20, 178)
Train NaNs: 0
Test NaNs : 0


In [21]:
age_col = "Basic_Demos-Age"
sex_col = "Basic_Demos-Sex"

for df in (train_fe, test_fe):
    df["Age_sq"] = df[age_col] ** 2
    df["Age_cu"] = df[age_col] ** 3

age_bins = [0, 7, 10, 13, 16, 20]
age_labels = [0, 1, 2, 3, 4]

train_fe["Age_group"] = pd.cut(train_fe[age_col], bins=age_bins, labels=age_labels).astype(int)
test_fe["Age_group"]  = pd.cut(test_fe[age_col],  bins=age_bins, labels=age_labels).astype(int)


In [22]:
bmi_col = "Physical-BMI"

for df in (train_fe, test_fe):
    df["BMI_sq"] = df[bmi_col] ** 2

def bmi_cat(x):
    if x < 18.5:   return 0
    elif x < 25:   return 1
    elif x < 30:   return 2
    else:          return 3

train_fe["BMI_category"] = train_fe[bmi_col].apply(bmi_cat)
test_fe["BMI_category"]  = test_fe[bmi_col].apply(bmi_cat)


In [23]:
h_col = "Physical-Height"
w_col = "Physical-Weight"

group_cols = [sex_col, "Age_group"]

height_stats = train_fe.groupby(group_cols)[h_col].agg(["mean", "std"])
weight_stats = train_fe.groupby(group_cols)[w_col].agg(["mean", "std"])

h_global_mean, h_global_std = train_fe[h_col].mean(), train_fe[h_col].std()
w_global_mean, w_global_std = train_fe[w_col].mean(), train_fe[w_col].std()

def add_hw_z(df):
    HZ, WZ = [], []
    for sex, ag, h, w in zip(df[sex_col], df["Age_group"], df[h_col], df[w_col]):
        if (sex, ag) in height_stats.index:
            hm, hs = height_stats.loc[(sex, ag)]
        else:
            hm, hs = h_global_mean, h_global_std
        if (sex, ag) in weight_stats.index:
            wm, ws = weight_stats.loc[(sex, ag)]
        else:
            wm, ws = w_global_mean, w_global_std

        HZ.append((h - hm) / (hs if hs else h_global_std))
        WZ.append((w - wm) / (ws if ws else w_global_std))

    df["Height_for_age_z"] = HZ
    df["Weight_for_age_z"] = WZ

add_hw_z(train_fe)
add_hw_z(test_fe)


In [24]:
sys_col = "Physical-Systolic_BP"
dia_col = "Physical-Diastolic_BP"

for df in (train_fe, test_fe):
    df["BP_pulse_pressure"] = df[sys_col] - df[dia_col]
    df["BP_mean_arterial"]  = df[dia_col] + df["BP_pulse_pressure"] / 3

def bp_cat(sys, dia):
    if sys < 120 and dia < 80: return 0
    if 120 <= sys < 130 and dia < 80: return 1
    if 130 <= sys < 140 or 80 <= dia < 90: return 2
    if sys >= 140 or dia >= 90: return 3
    return 0

train_fe["BP_category"] = [bp_cat(s, d) for s, d in zip(train_fe[sys_col], train_fe[dia_col])]
test_fe["BP_category"]  = [bp_cat(s, d) for s, d in zip(test_fe[sys_col], test_fe[dia_col])]


In [25]:
season_cols = [c for c in train.columns if c.endswith("Season")]

season_map = {"spring":0, "summer":1, "fall":2, "winter":3}

def add_season_feats(df):
    for col in season_cols:
        df[col+"_num"] = df[col].map(season_map).fillna(-1)
        df[col+"_sin"] = np.sin(2*np.pi*(df[col+"_num"]/4))
        df[col+"_cos"] = np.cos(2*np.pi*(df[col+"_num"]/4))
        df[col+"_is_winter"] = (df[col] == "winter").astype(int)
        df[col+"_is_summer"] = (df[col] == "summer").astype(int)

add_season_feats(train_fe)
add_season_feats(test_fe)


In [26]:
season_dummy_map = {}

for col in season_cols:
    train_dum = pd.get_dummies(train_fe[col], prefix=col)
    season_dummy_map[col] = list(train_dum.columns)

    train_fe = pd.concat([train_fe, train_dum], axis=1)

    test_dum = pd.get_dummies(test_fe[col], prefix=col)
    test_dum = test_dum.reindex(season_dummy_map[col], fill_value=0)
    test_fe = pd.concat([test_fe, test_dum], axis=1)


In [27]:
for df in (train_fe, test_fe):
    df["pciat_span"]    = df["relative_date_PCIAT_max"] - df["relative_date_PCIAT_min"]
    df["pciat_iqr"]     = df["relative_date_PCIAT_75%"] - df["relative_date_PCIAT_25%"]
    df["pciat_recency"] = df["relative_date_PCIAT_max"]


In [28]:
for df in (train_fe, test_fe):

    df["enmo_cv"]   = df["enmo_std"]   / (df["enmo_mean"].abs() + eps)
    df["light_cv"]  = df["light_std"]  / (df["light_mean"].abs() + eps)
    df["anglez_cv"] = df["anglez_std"] / (df["anglez_mean"].abs() + eps)

    df["enmo_activity_volume"] = df["enmo_mean"] * df["enmo_count"]

    df["active_por"] = df["light_por"] + df["moderate_por"]
    df["sedentary_to_active_ratio"] = df["sedentary_por"] / (df["active_por"] + eps)

    df["sedentary_percentage"] = df["sedentary_por"]
    df["moderate_vigorous_percentage"] = df["moderate_por"]

    df["mvpa_minutes_estimate"] = df["moderate_por"] * 24 * 60


In [29]:
sds_t = "SDS-SDS_Total_T"
cgas  = "CGAS-CGAS_Score"

# Log transform
for df in (train_fe, test_fe):
    df["SDS_T_log"] = np.log(df[sds_t])

# Z-scores (train-only)
sds_mean, sds_std = train_fe[sds_t].mean(), train_fe[sds_t].std()
cgas_mean, cgas_std = train_fe[cgas].mean(), train_fe[cgas].std()

for df in (train_fe, test_fe):
    df["SDS_T_z"] = (df[sds_t] - sds_mean) / sds_std
    df["CGAS_z"] = (df[cgas] - cgas_mean) / cgas_std

# SDS severity
def sds_sev(x):
    if x < 50: return 0
    if x < 60: return 1
    if x < 70: return 2
    return 3

train_fe["SDS_severity"] = train_fe[sds_t].apply(sds_sev)
test_fe["SDS_severity"]  = test_fe[sds_t].apply(sds_sev)

# CGAS impairment
def cgas_imp(x):
    if x >= 80: return 0
    if x >= 70: return 1
    if x >= 60: return 2
    return 3

train_fe["CGAS_impairment"] = train_fe[cgas].apply(cgas_imp)
test_fe["CGAS_impairment"]  = test_fe[cgas].apply(cgas_imp)

# SDSâ€“CGAS interactions
for df in (train_fe, test_fe):
    df["SDS_CGAS_ratio"]   = df[sds_t] / (df[cgas] + eps)
    df["SDS_CGAS_diff_z"]  = df["SDS_T_z"] - df["CGAS_z"]


In [30]:
inet = "PreInt_EduHx-computerinternet_hoursday"

def inet_risk(x):
    if x == 0: return 0
    if x <= 1: return 1
    if x <= 2: return 2
    return 3

train_fe["Internet_risk_bin"] = train_fe[inet].apply(inet_risk)
test_fe["Internet_risk_bin"]  = test_fe[inet].apply(inet_risk)

for df in (train_fe, test_fe):
    df["Internet_sedentary_interaction"] = df[inet] * df["sedentary_por"]
    df["Screen_vs_activity_ratio"] = df[inet] / (df["active_por"] + eps)


In [31]:
mins = "Fitness_Endurance-Time_Mins"
secs = "Fitness_Endurance-Time_Sec"
stage = "Fitness_Endurance-Max_Stage"

for df in (train_fe, test_fe):
    df["Endurance_total_secs"] = df[mins] * 60 + df[secs]

end_mean, end_std = train_fe["Endurance_total_secs"].mean(), train_fe["Endurance_total_secs"].std()
stage_mean, stage_std = train_fe[stage].mean(), train_fe[stage].std()

for df in (train_fe, test_fe):
    df["Endurance_secs_z"] = (df["Endurance_total_secs"] - end_mean) / end_std
    df["Endurance_stage_z"] = (df[stage] - stage_mean) / stage_std


In [32]:
fgc_scores = [c for c in train.columns if c.startswith("FGC-FGC_") and "_Zone" not in c and "Season" not in c]
fgc_zones  = [c for c in train.columns if c.startswith("FGC-FGC_") and c.endswith("_Zone")]

for df in (train_fe, test_fe):
    df["FGC_total_score"] = df[fgc_scores].sum(axis=1)
    df["FGC_zone_mean"]   = df[fgc_zones].mean(axis=1)
    df["FGC_zone_max"]    = df[fgc_zones].max(axis=1)


  df["FGC_zone_max"]    = df[fgc_zones].max(axis=1)


In [33]:
for df in (train_fe, test_fe):
    df["BMI_x_Endurance_secs"] = df[bmi_col] * df["Endurance_total_secs"]
    df["BMI_x_FGC_total"]      = df[bmi_col] * df["FGC_total_score"]


  df["BMI_x_Endurance_secs"] = df[bmi_col] * df["Endurance_total_secs"]
  df["BMI_x_FGC_total"]      = df[bmi_col] * df["FGC_total_score"]


In [34]:
fat = "BIA-BIA_Fat"
ffm = "BIA-BIA_FFM"
smm = "BIA-BIA_SMM"
w   = "Physical-Weight"

for df in (train_fe, test_fe):
    df["Fat_to_FFM_ratio"]        = df[fat] / (df[ffm] + eps)
    df["Fat_to_SMM_ratio"]        = df[fat] / (df[smm] + eps)
    df["Fat_percent_of_weight"]   = df[fat] / (df[w] + eps)


  df["Fat_to_FFM_ratio"]        = df[fat] / (df[ffm] + eps)
  df["Fat_to_SMM_ratio"]        = df[fat] / (df[smm] + eps)
  df["Fat_percent_of_weight"]   = df[fat] / (df[w] + eps)


In [35]:
fat_stats = train_fe.groupby(group_cols)[fat].agg(["mean","std"])
fat_global_mean, fat_global_std = train_fe[fat].mean(), train_fe[fat].std()

def add_fat_z(df):
    Z = []
    for sex, ag, f in zip(df[sex_col], df["Age_group"], df[fat]):
        if (sex, ag) in fat_stats.index:
            fm, fs = fat_stats.loc[(sex, ag)]
        else:
            fm, fs = fat_global_mean, fat_global_std
        Z.append((f - fm) / (fs if fs else fat_global_std))
    df["Fat_for_age_z"] = Z

add_fat_z(train_fe)
add_fat_z(test_fe)


  df["Fat_for_age_z"] = Z


In [36]:
paqA = "PAQ_A-PAQ_A_Total"
paqC = "PAQ_C-PAQ_C_Total"

paqA_mean, paqA_std = train_fe[paqA].mean(), train_fe[paqA].std()
paqC_mean, paqC_std = train_fe[paqC].mean(), train_fe[paqC].std()
mvpa_mean, mvpa_std = train_fe["moderate_por"].mean(), train_fe["moderate_por"].std()

for df in (train_fe, test_fe):
    df["PAQ_A_z"] = (df[paqA] - paqA_mean) / paqA_std
    df["PAQ_C_z"] = (df[paqC] - paqC_mean) / paqC_std
    df["PAQ_Total"] = (df[paqA] + df[paqC]) / 2

paqT_mean, paqT_std = train_fe["PAQ_Total"].mean(), train_fe["PAQ_Total"].std()

for df in (train_fe, test_fe):
    df["PAQ_Total_z"] = (df["PAQ_Total"] - paqT_mean) / paqT_std
    df["MVPA_z"]      = (df["moderate_por"] - mvpa_mean) / mvpa_std
    df["PAQ_MVPA_gap"] = df["PAQ_Total_z"] - df["MVPA_z"]


  df["PAQ_A_z"] = (df[paqA] - paqA_mean) / paqA_std
  df["PAQ_C_z"] = (df[paqC] - paqC_mean) / paqC_std
  df["PAQ_Total"] = (df[paqA] + df[paqC]) / 2
  df["PAQ_Total_z"] = (df["PAQ_Total"] - paqT_mean) / paqT_std
  df["MVPA_z"]      = (df["moderate_por"] - mvpa_mean) / mvpa_std
  df["PAQ_MVPA_gap"] = df["PAQ_Total_z"] - df["MVPA_z"]


In [37]:
bmi_mean, bmi_std = train_fe[bmi_col].mean(), train_fe[bmi_col].std()
sys_mean, sys_std = train_fe[sys_col].mean(), train_fe[sys_col].std()
dia_mean, dia_std = train_fe[dia_col].mean(), train_fe[dia_col].std()

for df in (train_fe, test_fe):
    df["BMI_z"]     = (df[bmi_col] - bmi_mean) / bmi_std
    df["BP_sys_z"]  = (df[sys_col] - sys_mean) / sys_std
    df["BP_dia_z"]  = (df[dia_col] - dia_mean) / dia_std


  df["BMI_z"]     = (df[bmi_col] - bmi_mean) / bmi_std
  df["BP_sys_z"]  = (df[sys_col] - sys_mean) / sys_std
  df["BP_dia_z"]  = (df[dia_col] - dia_mean) / dia_std


In [38]:
for df in (train_fe, test_fe):
    df["Physical_risk_index"] = (
        df["BMI_z"]
        + df["BP_sys_z"]
        + df["BP_dia_z"]
        + df["Fat_for_age_z"]
        - df["Endurance_secs_z"]
    )


  df["Physical_risk_index"] = (


In [39]:
sed_q75 = train_fe["sedentary_por"].quantile(0.75)
mvpa_q25 = train_fe["moderate_por"].quantile(0.25)

for df in (train_fe, test_fe):
    df["risk_internet_high"] = (df["Internet_risk_bin"] >= 2).astype(int)
    df["risk_sedentary_high"] = (df["sedentary_por"] >= sed_q75).astype(int)
    df["risk_low_mvpa"] = (df["moderate_por"] <= mvpa_q25).astype(int)
    df["risk_SDS_high"] = (df["SDS_severity"] >= 2).astype(int)

    df["Lifestyle_risk_score"] = (
        df["risk_internet_high"] +
        df["risk_sedentary_high"] +
        df["risk_low_mvpa"] +
        df["risk_SDS_high"] +
        df["sleep_anomaly"]
    )


  df["risk_internet_high"] = (df["Internet_risk_bin"] >= 2).astype(int)
  df["risk_sedentary_high"] = (df["sedentary_por"] >= sed_q75).astype(int)
  df["risk_low_mvpa"] = (df["moderate_por"] <= mvpa_q25).astype(int)
  df["risk_SDS_high"] = (df["SDS_severity"] >= 2).astype(int)
  df["Lifestyle_risk_score"] = (


In [40]:
for df in (train_fe, test_fe):
    df["SDSxInternet"]  = df["SDS_T_z"] * df["Internet_risk_bin"]
    df["SDSxSedentary"] = df["SDS_T_z"] * df["sedentary_por"]
    df["BMIxSedentary"] = df["BMI_z"]   * df["sedentary_por"]
    df["BMIxInternet"]  = df["BMI_z"]   * df["Internet_risk_bin"]
    df["PAQxLifestyle"] = df["PAQ_Total_z"] * df["Lifestyle_risk_score"]


  df["SDSxInternet"]  = df["SDS_T_z"] * df["Internet_risk_bin"]
  df["SDSxSedentary"] = df["SDS_T_z"] * df["sedentary_por"]
  df["BMIxSedentary"] = df["BMI_z"]   * df["sedentary_por"]
  df["BMIxInternet"]  = df["BMI_z"]   * df["Internet_risk_bin"]
  df["PAQxLifestyle"] = df["PAQ_Total_z"] * df["Lifestyle_risk_score"]


In [41]:
# Original column ordering
orig_cols = list(train.columns)
new_cols = [c for c in train_fe.columns if c not in orig_cols]

col_order_train = orig_cols + new_cols
col_order_test  = [c for c in col_order_train if c != "sii"]

train_fe = train_fe[col_order_train]
test_fe  = test_fe[col_order_test]

print("Final train shape:", train_fe.shape)
print("Final test shape:", test_fe.shape)

train_fe.to_csv("train_fed.csv", index=False)
test_fe.to_csv("test_fed.csv", index=False)

print("Saved train_fed.csv and test_fed.csv")


KeyError: "['Fitness_Endurance-Season_winter', 'BIA-Season_spring', 'PAQ_A-Season_fall', 'PAQ_A-Season_spring', 'PAQ_A-Season_winter'] not in index"