In [61]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from longitudinal.settings.constants import DATA_PATH

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

gen1_train = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")  # parent data (training)
gen2_train = pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv")  # child data (training)
gen1_test = pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv")    # parent data (test)
gen2_test = pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv")  # child data, up to age 9

gen1_train = gen1_train.rename(columns={"age": "AgeGr"})
gen1_test = gen1_test.rename(columns={"age": "AgeGr"})
gen1_train['sex_assigned_at_birth'] = gen1_train['sex_assigned_at_birth'].map({'M': 1, 'F': 0})
gen2_train['sex_assigned_at_birth'] = gen2_train['sex_assigned_at_birth'].map({'M': 1, 'F': 0})
gen2_train['study_parent_sex'] = gen2_train['study_parent_sex'].map({'mother': 1, 'father': 0})

gen1_train = gen1_train.rename(columns={
    "gen1_id": "person_id",  # was gen1_id
    "SHgt_cm": "height_cm"   # rename for clarity
})

gen2_train = gen2_train.rename(columns={
    "gen2_id": "person_id",  # was gen2_id
    "SHgt_cm": "height_cm"   
})

gen1_train["generation"] = 1
gen2_train["generation"] = 2

# submission template
submission = pd.read_csv(DATA_PATH + "gen2_test_solution_template.csv")
submission["gen2_id"] = submission["gen2id_age"].str[: 4].astype(float)
submission["age"] = submission["gen2id_age"].str[-2:].astype(float)
submission.head()

Unnamed: 0,gen2id_age,SHgt_cm,gen2_id,age
0,2831_10,150,2831.0,10.0
1,2831_11,150,2831.0,11.0
2,2831_12,150,2831.0,12.0
3,2831_13,150,2831.0,13.0
4,2831_14,150,2831.0,14.0


In [52]:
gen1_train.sample(15)

Unnamed: 0,person_id,sex_assigned_at_birth,AgeGr,height_cm,generation
229,748,1,9.0,143.435583,1
3291,412,0,10.0,144.093093,1
1253,634,1,17.0,180.906863,1
2655,497,1,16.0,191.867411,1
3347,410,1,20.0,180.881129,1
386,725,0,15.5,163.269774,1
3173,433,0,1.5,,1
1934,552,0,15.5,166.012964,1
661,706,0,9.0,133.709588,1
2237,527,1,1.5,86.649765,1


In [64]:
import numpy as np
from sklearn.linear_model import BayesianRidge

# Ensure a copy to avoid SettingWithCopyWarning
z = gen1_train[gen1_train["person_id"] == 401].copy()

z["height_cm"] = z["height_cm"]

# Save the original values
z["height_cm_TRUTH"] = z["height_cm"]

# Introduce missing values
z.loc[z["AgeGr"] < 1, "height_cm"] = np.nan
z.loc[(z["AgeGr"] >= 7) & (z["AgeGr"] < 9), "height_cm"] = np.nan

# Track missing values
z["h_null"] = np.where(z["height_cm"].isna(), 1, 0)

# MICE Imputer
features_for_imputation = ["height_cm"]

z["height_cm"] = z["height_cm"].interpolate(method="polynomial", order=2)
mice_imputer = IterativeImputer(max_iter=10, random_state=42, min_value=0, estimator=BayesianRidge())
z[features_for_imputation] = mice_imputer.fit_transform(z[features_for_imputation])

z

Unnamed: 0,person_id,sex_assigned_at_birth,AgeGr,height_cm,generation,height_cm_TRUTH,h_null
3348,401,1,0.1,161.110964,1,56.579066,1
3349,401,1,0.25,161.110964,1,64.485911,1
3350,401,1,0.5,161.110964,1,71.828223,1
3351,401,1,0.75,161.110964,1,76.179279,1
3352,401,1,1.0,81.528996,1,81.528996,0
3353,401,1,1.5,89.453099,1,89.453099,0
3354,401,1,2.0,96.304374,1,96.304374,0
3355,401,1,3.0,108.800714,1,108.800714,0
3356,401,1,4.0,117.103821,1,117.103821,0
3357,401,1,5.0,125.289878,1,125.289878,0


In [23]:
gen2_train.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,3012,1,1,636,0.1,56.251625,4.636903
1,3012,1,1,636,0.25,64.491579,
2,3012,1,1,636,0.5,70.465927,
3,3012,1,1,636,0.75,73.992677,
4,3012,1,1,636,1.0,79.343537,


In [24]:
gen1_train = gen1_train.rename(columns={
    "gen1_id": "person_id",  # was gen1_id
    "SHgt_cm": "height_cm"   # rename for clarity
})

gen2_train = gen2_train.rename(columns={
    "gen2_id": "person_id",  # was gen2_id
    "SHgt_cm": "height_cm"   
})

gen1_train["generation"] = 1
gen2_train["generation"] = 2

# Now they have common column names for person_id, height_cm, AgeGr, etc.
# Optional: if Wgt_kg only exists in gen2, that’s fine; it will just become an extra column for gen1 with NaNs.
