In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from longitudinal.settings.constants import DATA_PATH

import warnings
warnings.filterwarnings("ignore")

# Raw competition inputs, read from DATA_PATH.
gen1_train = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")  # parent data (training)
gen2_train = pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv")  # child data (training)
gen1_test = pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv")    # parent data (test)
gen2_test = pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv")  # child data, up to age 9

# Submission template: each row is keyed by "<gen2_id>_<age>" in `gen2id_age`.
submission = pd.read_csv(DATA_PATH + "gen2_test_solution_template.csv")
# Split the key on its underscore rather than slicing fixed character widths, so IDs
# or ages with a different number of digits are still parsed correctly.
submission[["gen2_id", "age"]] = (
    submission["gen2id_age"].str.split("_", n=1, expand=True).astype(float)
)
submission.head()

Unnamed: 0,gen2id_age,SHgt_cm,gen2_id,age
0,2831_10,150,2831.0,10.0
1,2831_11,150,2831.0,11.0
2,2831_12,150,2831.0,12.0
3,2831_13,150,2831.0,13.0
4,2831_14,150,2831.0,14.0


In [2]:
# Preview the first two rows of the raw parent training table.
gen1_train.head(2)

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,774,F,0.1,56.961812
1,774,F,0.25,64.82619


In [3]:
# Preview the first two rows of the raw parent test table.
gen1_test.head(2)

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,768,F,0.1,53.822825
1,768,F,0.25,61.455579


In [4]:
# Preview the first two rows of the raw child training table.
gen2_train.head(2)

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,3012,M,mother,636,0.1,56.251625,4.636903
1,3012,M,mother,636,0.25,64.491579,


In [5]:
# Preview the first two rows of the raw child test table (records up to age 9).
gen2_test.head(2)

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,2831,F,mother,455,0.1,52.912025,
1,2831,F,mother,455,0.25,59.532779,


In [6]:
def preprocess_dataframe(df: pd.DataFrame, generation: int, cols_to_drop=("Wgt_kg",)) -> pd.DataFrame:
    """
    Standardise column names, encode the categorical columns and tag rows with their generation.

    Args:
        df (pd.DataFrame): Raw growth records. The input is never modified.
        generation (int): 1 or 2 selects the matching rename map; any other value skips
            renaming. The value is stored verbatim in a new `generation` column.
        cols_to_drop: Column name, or iterable of column names, to remove when present.
            Names that are not in the frame are ignored.

    Returns:
        pd.DataFrame: A new frame with `person_id` / `height_cm` / `AgeGr` naming,
        0/1-encoded `sex_assigned_at_birth` and `study_parent_sex` (when present),
        a `generation` column, and without the dropped columns.
    """
    # Column renaming mappings
    column_renaming = {
        1: {"gen1_id": "person_id", "SHgt_cm": "height_cm", "age": "AgeGr"},
        2: {"gen2_id": "person_id", "SHgt_cm": "height_cm"}
    }

    # Encoding mappings for categorical variables
    encoding_mappings = {
        "sex_assigned_at_birth": {'M': 1, 'F': 0},
        "study_parent_sex": {'mother': 1, 'father': 0}
    }

    # A bare string would otherwise be iterated character by character.
    if isinstance(cols_to_drop, str):
        cols_to_drop = [cols_to_drop]

    # `rename` returns a new frame, so the caller's data is left untouched.
    out = df.rename(columns=column_renaming.get(generation, {}))

    # Values missing from a mapping become NaN (standard `Series.map` behaviour).
    for col, mapping in encoding_mappings.items():
        if col in out.columns:
            out[col] = out[col].map(mapping)

    # Add the generation column
    out["generation"] = generation

    # Dropping happens last, so a requested name is removed even if it was just created.
    return out.drop(columns=[col for col in cols_to_drop if col in out.columns])

def filter_by_age(df: pd.DataFrame, age_col: str, age_cutoff: float) -> pd.DataFrame:
    """
    Return a copy of the rows whose `age_col` value is at most `age_cutoff`.

    Rows with a missing age fail the comparison and are therefore excluded.
    """
    within_cutoff = df[age_col].le(age_cutoff)
    return df.loc[within_cutoff].copy()


def impute_time_series_LEGACY(df, group_col, time_col, value_col, interpolation_range=(9, 18), knn_neighbors=5, min_valid_entries=3):
    """
    Legacy imputation pipeline: per-individual interpolation, KNN imputation, per-time
    mean fill, then removal of individuals with too few imputed values.

    Args:
        df (pd.DataFrame): Input data (copied, not modified).
        group_col (str): Column that defines unique individuals.
        time_col (str): Column that represents time (e.g., age).
        value_col (str): Column with missing values to be imputed.
        interpolation_range (tuple): Inclusive (min, max) time window for interpolation.
        knn_neighbors (int): Number of neighbors for KNN imputation.
        min_valid_entries (int): Minimum number of imputed values required to keep a group.

    Returns:
        pd.DataFrame: Data with `value_col` interpolated inside the window and a new
        `<value_col>_imputed` column. The index is not reset after the final filter.
    """
    imputed_col = f"{value_col}_imputed"
    data = df.copy()

    def _interpolate_person(rows):
        # NOTE(review): pandas' "polynomial" method uses the index labels as x-values
        # (not `time_col`) and "linear" treats rows as equally spaced.
        window = (rows[time_col] >= interpolation_range[0]) & (rows[time_col] <= interpolation_range[1])

        # Fewer than three rows in the window: linear interpolation only.
        if window.sum() < 3:
            rows.loc[window, value_col] = rows.loc[window, value_col].interpolate(method="linear")
            return rows

        try:
            rows.loc[window, value_col] = rows.loc[window, value_col].interpolate(method="polynomial", order=2)
        except ValueError:
            # Polynomial fit rejected by pandas/scipy: fall back to linear.
            rows.loc[window, value_col] = rows.loc[window, value_col].interpolate(method="linear")
        return rows

    data = data.groupby(group_col, group_keys=False).apply(_interpolate_person).reset_index(drop=True)

    # KNN imputation over (time, value) pairs pooled across all individuals.
    imputer = KNNImputer(n_neighbors=knn_neighbors, weights="uniform")
    data[imputed_col] = imputer.fit_transform(data[[time_col, value_col]])[:, 1]

    # Any value still missing is replaced by the mean of its time step.
    per_time_fill = data.groupby(time_col)[imputed_col].transform(lambda values: values.fillna(values.mean()))
    data[imputed_col] = data[imputed_col].fillna(per_time_fill)

    # Keep only individuals with enough imputed observations.
    entries_per_group = data.groupby(group_col)[imputed_col].count()
    kept_groups = entries_per_group.loc[entries_per_group >= min_valid_entries].index
    return data.loc[data[group_col].isin(kept_groups)]

def impute_time_series(df, group_col, time_col, value_col, interpolation_range=(9, 18), knn_neighbors=5, min_valid_entries=3):
    """
    Imputes missing height values using polynomial interpolation, KNN, and mean imputation.

    `value_col` itself is only filled inside `interpolation_range`; the fully imputed
    series is written to a new `<value_col>_imputed` column.

    Args:
        df (pd.DataFrame): Input data (copied, not modified).
        group_col (str): Column that defines unique individuals.
        time_col (str): Column that represents time (e.g., age).
        value_col (str): Column with missing values to be imputed.
        interpolation_range (tuple): Inclusive (min, max) range of ages where interpolation should be applied.
        knn_neighbors (int): Number of neighbors for KNN imputation.
        min_valid_entries (int): Minimum number of valid records to keep a group.

    Returns:
        pd.DataFrame: Data with imputed values and a fresh 0..n-1 index. Rows whose
        `group_col` is missing are excluded.
    """
    imputed_col = f"{value_col}_imputed"
    df = df.copy()

    # Step 1: Polynomial or Linear Interpolation inside the window, one individual at a time.
    # NOTE(review): pandas' "polynomial" method uses the index labels as x-values (not
    # `time_col`) and "linear" treats rows as equally spaced - confirm each individual's
    # rows are evenly spaced in time within the window.
    def _interpolate(values):
        if len(values) >= 3:  # Ensure enough rows for a second-order polynomial
            try:
                return values.interpolate(method="polynomial", order=2)
            except ValueError:  # If polynomial fails, fallback to linear
                pass
        return values.interpolate(method="linear")

    in_range = (df[time_col] >= interpolation_range[0]) & (df[time_col] <= interpolation_range[1])
    if in_range.any():
        # `transform` works on the value column only, so it does not depend on how
        # `DataFrameGroupBy.apply` treats the grouping column.
        interpolated = df.loc[in_range].groupby(group_col)[value_col].transform(_interpolate)
        # Interpolation only fills gaps, so merging with fillna leaves known values untouched.
        df[value_col] = df[value_col].fillna(interpolated)

    # Rows with a missing group id cannot be attributed to an individual, so they are
    # removed before neighbours are searched.
    df = df.loc[df[group_col].notna()].reset_index(drop=True)

    # Step 2: KNN Imputation on (time, value) pairs pooled across all individuals.
    knn_imputer = KNNImputer(n_neighbors=knn_neighbors, weights="uniform")
    knn_imputed_values = knn_imputer.fit_transform(df[[time_col, value_col]])
    # The imputer keeps row order, so a positional assignment is index-safe.
    df[imputed_col] = knn_imputed_values[:, 1]

    # Step 3: Backward then forward fill within each individual.
    # `transform` returns a result indexed like the input, so it can be assigned back
    # directly (a groupby `apply` result carries the group key in its index on pandas >= 2.0).
    df[imputed_col] = df.groupby(group_col)[imputed_col].transform(lambda values: values.bfill().ffill())

    # Step 4: Mean Imputation for Any Remaining NaNs (mean of the same time step).
    df[imputed_col] = df[imputed_col].fillna(df.groupby(time_col)[imputed_col].transform("mean"))

    # Step 5: Drop Groups with Too Few Valid Data Points
    valid_counts = df.groupby(group_col)[imputed_col].count()
    valid_groups = valid_counts[valid_counts >= min_valid_entries].index
    df = df[df[group_col].isin(valid_groups)].reset_index(drop=True)

    return df


def get_features(gen1, gen2):
    """
    Joins every child record to its study parent's record at the same age and adds
    lag, lead, growth-spurt and next-step target columns.

    Parent lags and leads are computed on the parent table before the join, so each
    parent's history is never mixed with rows of another child of the same parent.
    Child lags and the target are computed in age order per child and aligned back
    by index, so `gen2` does not need to be pre-sorted.

    NOTE(review): all height features are derived from `height_cm`, not from
    `height_cm_imputed` - confirm that this is intended.

    Args:
        gen1 (pd.DataFrame): Parent growth data with `person_id`, `AgeGr` and `height_cm`.
        gen2 (pd.DataFrame): Child growth data with `person_id`, `study_parent_id_new`,
            `AgeGr` and `height_cm`.

    Returns:
        pd.DataFrame: One row per child record, in the row order of `gen2`, with
        `_child` / `_parent` suffixed columns, `height_cm_lead_1..3` (parent),
        `height_cm_lag_1..7_child`, `height_cm_lag_1..7_parent`, the two growth-spurt
        flags and `height_cm_target`. Rows without a target are kept.
    """
    lag_steps = range(1, 8)
    parent_lag_cols = [f"height_cm_lag_{lag}_parent" for lag in lag_steps]

    # Step 1: Order parent records by age (`sort_values` returns a new frame,
    # so the caller's table is not modified by the column additions below).
    gen1 = gen1.sort_values(["person_id", "AgeGr"])

    # Step 2: Future and past parent heights, computed BEFORE the merge.
    for lead in range(1, 4):
        gen1[f"height_cm_lead_{lead}"] = gen1.groupby("person_id")["height_cm"].shift(-lead)
    for lag, col in zip(lag_steps, parent_lag_cols):
        gen1[col] = gen1.groupby("person_id")["height_cm"].shift(lag)

    # Step 3: Use a common age column name on both sides of the merge.
    gen1 = gen1.rename(columns={"AgeGr": "AgeGr_child"})
    gen2 = gen2.rename(columns={"AgeGr": "AgeGr_child"})

    # Step 4: Attach the parent's record at the same age to every child record.
    df = gen2.merge(
        gen1,
        how="left",
        left_on=["study_parent_id_new", "AgeGr_child"],
        right_on=["person_id", "AgeGr_child"],
        suffixes=("_child", "_parent")
    )

    # Set the parent lags aside so they can be re-inserted next to the child lags.
    staged_parent_lags = {col: df.pop(col) for col in parent_lag_cols}

    # Age-ordered view for positional shifts; results keep the original row labels,
    # so assigning them to `df` aligns every value with its own row.
    by_age = df.sort_values(["person_id_child", "AgeGr_child"])
    child_heights = by_age.groupby("person_id_child")["height_cm_child"]

    # Step 5: Lagged Height Features (7 for both child and parent)
    for lag, parent_col in zip(lag_steps, parent_lag_cols):
        df[f"height_cm_lag_{lag}_child"] = child_heights.shift(lag)
        df[parent_col] = staged_parent_lags[parent_col]

    # Step 6: Growth Spurt Indicator. Both flags are derived from the shared age column
    # and are therefore identical.
    df["growth_spurt_child"] = df["AgeGr_child"].between(13, 18).astype(int)
    df["growth_spurt_parent"] = df["AgeGr_child"].between(13, 18).astype(int)

    # Step 7: Target Variable (child height at the next recorded age)
    df["height_cm_target"] = child_heights.shift(-1)

    return df


def get_features_fixed(gen1, gen2):
    """
    Builds the same feature table as `get_features`, then removes rows that have no
    target unless they sit at the largest child age in the table.

    Args:
        gen1 (pd.DataFrame): Parent growth data.
        gen2 (pd.DataFrame): Child growth data.

    Returns:
        pd.DataFrame: Filtered feature table with a fresh 0..n-1 index.
    """
    df = get_features(gen1, gen2)

    # Rows at the maximum age have no "next" height by construction; they are kept
    # so that the latest observation of every child remains in the table.
    max_age = df["AgeGr_child"].max()
    keep = df["height_cm_target"].notna() | (df["AgeGr_child"] == max_age)
    return df[keep].reset_index(drop=True)


from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

def safe_knn_impute(df, time_col, value_col, knn_neighbors=5):
    """
    Applies KNN imputation on the (time, value) pairs and stores the imputed values
    in a new `<value_col>_imputed` column.

    The column is added to `df` itself, and the same object is returned.

    Args:
        df (pd.DataFrame): DataFrame containing time series data.
        time_col (str): Column representing time steps (e.g., "AgeGr").
        value_col (str): Column to impute.
        knn_neighbors (int): Number of neighbors for imputation.

    Returns:
        pd.DataFrame: DataFrame with the imputed column added.

    Raises:
        ValueError: If the imputer returns a different shape than it was given.
    """
    features = df[[time_col, value_col]]
    knn_imputer = KNNImputer(n_neighbors=knn_neighbors, weights="uniform")
    imputed = knn_imputer.fit_transform(features)

    # Debugging logs
    print(f"Original Shape: {df.shape}, KNN Imputed Shape: {imputed.shape}")

    # With its default settings the imputer omits columns that contain no observed
    # value at all; fail with a clear message instead of a shape error further down.
    if imputed.shape != features.shape:
        raise ValueError(
            f"KNN imputation returned shape {imputed.shape} for input of shape {features.shape}; "
            f"'{time_col}' or '{value_col}' may contain no observed values."
        )

    # The imputer preserves row order, so a positional assignment keeps rows aligned.
    df[f"{value_col}_imputed"] = imputed[:, 1]

    return df

def polynomial_interpolation(df, time_col, value_col, interpolation_range=(9, 18), group_col="person_id"):
    """
    Applies second-order polynomial interpolation within a specified range, one
    individual at a time, falling back to linear interpolation when pandas rejects
    the polynomial fit.

    NOTE(review): pandas' "polynomial" method uses the index labels as x-values (not
    `time_col`) and "linear" treats rows as equally spaced - confirm each individual's
    rows are evenly spaced in time within the range.

    Args:
        df (pd.DataFrame): DataFrame containing time series data (not modified).
        time_col (str): Column representing time steps (e.g., "AgeGr").
        value_col (str): Column to interpolate.
        interpolation_range (tuple): Inclusive age range to interpolate (min, max).
        group_col (str): Column identifying individuals. Defaults to "person_id".

    Returns:
        pd.DataFrame: Copy of `df` with gaps in `value_col` filled inside the range.
        Row order and index labels are preserved; rows whose `group_col` is missing
        are excluded because they cannot be assigned to an individual.
    """
    def interpolate_values(values):
        try:
            return values.interpolate(method="polynomial", order=2)
        except ValueError:  # If polynomial interpolation fails, fallback to linear
            return values.interpolate(method="linear")

    result = df.loc[df[group_col].notna()].copy()
    in_range = (result[time_col] >= interpolation_range[0]) & (result[time_col] <= interpolation_range[1])

    if in_range.any():
        # `transform` works on the value column only, so the grouping column is never
        # handed to the user function and always survives in the result.
        interpolated = result.loc[in_range].groupby(group_col)[value_col].transform(interpolate_values)
        # Interpolation only fills gaps, so merging with fillna leaves known values
        # and everything outside the range untouched.
        result[value_col] = result[value_col].fillna(interpolated)

    return result

def impute_time_series_NEW(df, group_col, time_col, value_col, interpolation_range=(9, 18), knn_neighbors=5, min_valid_entries=3):
    """
    Fills gaps in a time series in four stages: interpolation inside an age window,
    KNN imputation, per-time-step mean fill, and removal of sparse groups.

    The interpolation stage is delegated to `polynomial_interpolation` without
    passing `group_col`, so that stage groups by the helper's own person-id column.

    Args:
        df (pd.DataFrame): DataFrame containing time series data (copied, not modified).
        group_col (str): Column representing unique groups (e.g., "person_id").
        time_col (str): Column representing time steps (e.g., "AgeGr").
        value_col (str): Column to impute.
        interpolation_range (tuple): Age range for interpolation.
        knn_neighbors (int): Number of neighbors for KNN imputation.
        min_valid_entries (int): Minimum required valid entries for inclusion.

    Returns:
        pd.DataFrame: Data with a `<value_col>_imputed` column and a fresh 0..n-1 index.
    """
    imputed_col = f"{value_col}_imputed"

    # Stage 1: interpolation on a private copy of the input.
    working = polynomial_interpolation(df.copy(), time_col, value_col, interpolation_range)

    # Stage 2: KNN imputation, which adds the imputed column.
    working = safe_knn_impute(working, time_col, value_col, knn_neighbors)

    # Stage 3: whatever is still missing takes the mean of its time step.
    def _fill_with_mean(values):
        return values.fillna(values.mean())

    working[imputed_col] = working.groupby(time_col)[imputed_col].transform(_fill_with_mean)

    # Stage 4: keep only groups with enough imputed entries.
    entries_per_group = working.groupby(group_col)[imputed_col].count()
    sufficient_groups = entries_per_group.loc[entries_per_group >= min_valid_entries].index
    keep_rows = working[group_col].isin(sufficient_groups)
    return working.loc[keep_rows].reset_index(drop=True)

In [7]:
# Reload every raw CSV and normalise it in one step (column names, 0/1 encodings,
# generation tag), so this cell can be re-run on its own.
gen1_train = preprocess_dataframe(
    pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv"), generation=1
)  # parent data (training)
gen2_train = preprocess_dataframe(
    pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv"), generation=2
)  # child data (training)
gen1_test = preprocess_dataframe(
    pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv"), generation=1
)  # parent data (test)
gen2_test = preprocess_dataframe(
    pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv"), generation=2
)  # child data, up to age 9

## Cleaning Train Data

In [8]:
# Keep records up to age 18 for both generations, then impute missing heights.
gen1_train_filtered, gen2_train_filtered = (
    filter_by_age(frame, age_col="AgeGr", age_cutoff=18)
    for frame in (gen1_train, gen2_train)
)

gen1_train_imputed, gen2_train_imputed = (
    impute_time_series_NEW(frame, group_col="person_id", time_col="AgeGr", value_col="height_cm")
    for frame in (gen1_train_filtered, gen2_train_filtered)
)
gen1_train_imputed.head()

Original Shape: (3232, 5), KNN Imputed Shape: (3232, 2)
Original Shape: (4224, 7), KNN Imputed Shape: (4224, 2)


Unnamed: 0,person_id,sex_assigned_at_birth,AgeGr,height_cm,generation,height_cm_imputed
0,774,0,0.1,56.961812,1,56.961812
1,774,0,0.25,64.82619,1,64.82619
2,774,0,0.5,74.340764,1,74.340764
3,774,0,0.75,79.747338,1,79.747338
4,774,0,1.0,84.092569,1,84.092569


In [9]:
# Preview the imputed child training table.
gen2_train_imputed.head()

Unnamed: 0,person_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,height_cm,generation,height_cm_imputed
0,3012,1,1,636,0.1,56.251625,2,56.251625
1,3012,1,1,636,0.25,64.491579,2,64.491579
2,3012,1,1,636,0.5,70.465927,2,70.465927
3,3012,1,1,636,0.75,73.992677,2,73.992677
4,3012,1,1,636,1.0,79.343537,2,79.343537


In [10]:
# Missing values per column after imputation. `height_cm` is only filled inside the
# interpolation window, so it may still contain gaps; `height_cm_imputed` holds the
# fully imputed series.
gen1_train_imputed.isna().sum()

person_id                 0
sex_assigned_at_birth     0
AgeGr                     0
height_cm                29
generation                0
height_cm_imputed         0
dtype: int64

## Cleaning Test Data

In [11]:
# Parents keep records up to age 18; test children are cut at age 9.
gen1_test_filtered, gen2_test_filtered = (
    filter_by_age(frame, age_col="AgeGr", age_cutoff=cutoff)
    for frame, cutoff in ((gen1_test, 18), (gen2_test, 9))
)

gen1_test_imputed, gen2_test_imputed = (
    impute_time_series_NEW(frame, group_col="person_id", time_col="AgeGr", value_col="height_cm")
    for frame in (gen1_test_filtered, gen2_test_filtered)
)
gen1_test_imputed.head()

Original Shape: (1984, 5), KNN Imputed Shape: (1984, 2)
Original Shape: (1232, 7), KNN Imputed Shape: (1232, 2)


Unnamed: 0,person_id,sex_assigned_at_birth,AgeGr,height_cm,generation,height_cm_imputed
0,768,0,0.1,53.822825,1,53.822825
1,768,0,0.25,61.455579,1,61.455579
2,768,0,0.5,69.757527,1,69.757527
3,768,0,0.75,73.385477,1,73.385477
4,768,0,1.0,78.129137,1,78.129137


In [12]:
# Sanity check: the largest age left in the child test table after the age-9 cutoff.
gen2_test_imputed["AgeGr"].max()

np.float64(9.0)

## Feature Engineering

In [13]:
# Build the modelling tables: child rows joined to parent rows, plus lag, lead and target columns.
df_model_train = get_features_fixed(gen1_train_imputed, gen2_train_imputed)
df_model_test = get_features_fixed(gen1_test_imputed, gen2_test_imputed)

# Column pruning is currently disabled, so the ID and raw child-height columns stay in both tables.
# df_model_train.drop(columns=["study_parent_id_new", "person_id_parent", "height_cm_child"], inplace=True)
# df_model_test.drop(columns=["study_parent_id_new", "person_id_parent", "height_cm_child"], inplace=True)

In [14]:
# df_model_test.sample(15).T
# Sanity check: the largest child age present in each modelling table.
df_model_train["AgeGr_child"].max(), df_model_test["AgeGr_child"].max()

(np.float64(18.0), np.float64(9.0))

In [15]:
# Persist both modelling tables as Parquet files under DATA_PATH + "PROCESSED/",
# without writing the row index.
# NOTE(review): assumes the PROCESSED directory already exists - confirm.
df_model_train.to_parquet(DATA_PATH + "PROCESSED/train_data_CLEANED_V4.parquet", index=False)
df_model_test.to_parquet(DATA_PATH + "PROCESSED/test_data_CLEANED_V4.parquet", index=False)