In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

from longitudinal.settings.constants import DATA_PATH

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
is_gen1 = True
df.shape

(3636, 4)

In [4]:
from sklearn.metrics import mean_absolute_error


def preprocess_data(df, is_gen1=True):
    """
    Preprocesses the data for both generation 1 and 2 datasets
    """

    # Remove columns that are completely missing
    df = df.dropna(axis=1, how="all")

    # Encode categorical variables before imputation
    if "sex_assigned_at_birth" in df.columns:
        df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'M': 1, 'F': 0}).astype(float)
    if "study_parent_sex" in df.columns:
        df['study_parent_sex'] = df['study_parent_sex'].map({'mother': 1, 'father': 0}).astype(float)

    # Ensure only numeric columns are passed to imputer
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df_numeric = df[numeric_cols]

    # Apply IterativeImputer
    df_imputer = IterativeImputer(max_iter=10, random_state=0, min_value=0, initial_strategy="mean")
    df_imputed = pd.DataFrame(df_imputer.fit_transform(df_numeric), columns=df_numeric.columns)

    # Restore categorical columns if necessary
    df_final = df.copy()
    df_final[numeric_cols] = df_imputed
    df = df_final

    # Round ages and convert to int
    age_col = 'age' if is_gen1 else 'AgeGr'
    df[age_col] = df[age_col].round().astype(int)

    # Group columns
    group_cols = ['gen1_id', 'sex_assigned_at_birth', 'age'] if is_gen1 else \
                ['gen2_id', 'sex_assigned_at_birth', 'study_parent_sex', 'study_parent_id_new', 'AgeGr']
    
    # Group and calculate mean
    df = df.groupby(group_cols, as_index=False).mean()
    
    # Sort and interpolate
    sort_cols = ['gen1_id', 'age'] if is_gen1 else ['gen2_id', 'AgeGr']
    id_col = 'gen1_id' if is_gen1 else 'gen2_id'
    
    df = df.sort_values(by=sort_cols)
    df_grouped = df.groupby(id_col)
    
    # Interpolate height
    df["SHgt_cm_CLEANED"] = df_grouped["SHgt_cm"].apply(
        lambda x: x.interpolate(method="polynomial", order=2) if x.count() > 2 else x.interpolate(method="linear")
    ).bfill().values
    
    # Interpolate weight for gen2 only
    if not is_gen1:
        df["Wgt_kg_CLEANED"] = df_grouped["Wgt_kg"].apply(
            lambda x: x.interpolate(method="polynomial", order=2) if x.count() > 2 else x.interpolate(method="linear")
        ).bfill().values
    
    return df

def extract_features(df, prefix, ages, age_column):
    """
    Extract features like height at specific ages and growth velocity
    """
    features = {}
    for age in ages:
        height_at_age = df[df[age_column] == age]["SHgt_cm_CLEANED"].values
        if len(height_at_age) > 0:
            features[f"{prefix}_height_age_{age}"] = height_at_age[0]
        else:
            features[f"{prefix}_height_age_{age}"] = np.nan
    
    # Add growth velocity features
    if len(ages) > 1:
        for i in range(len(ages)-1):
            age1, age2 = ages[i], ages[i+1]
            height1 = features.get(f"{prefix}_height_age_{age1}")
            height2 = features.get(f"{prefix}_height_age_{age2}")
            if height1 is not None and height2 is not None and not (np.isnan(height1) or np.isnan(height2)):
                features[f"{prefix}_velocity_{age1}_{age2}"] = (height2 - height1) / (age2 - age1)
    
    return pd.Series(features)

def format_predictions(predictions, gen2_ids):
    """
    Format predictions into the required output format:
    gen2id_age SHgt_cm
    """
    formatted_rows = []
    prediction_ages = [10, 11, 12, 13, 14, 15, 16, 18]  # Excluding age 17
    
    for idx, gen2_id in enumerate(gen2_ids):
        for col_idx, age in enumerate(prediction_ages):
            row_id = f"{gen2_id}_{age}"
            height = predictions[idx, col_idx]
            formatted_rows.append([row_id, height])
    
    formatted_df = pd.DataFrame(formatted_rows, columns=['gen2id_age', 'SHgt_cm'])
    return formatted_df

# Load data
gen1_train = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
gen2_train = pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv")
gen1_test = pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv")
gen2_test = pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv")

# Fix inconsistent study_parent_sex values
# There 'study_parent_sex' not consistent over time
no_match_kids_0 = gen2_train[gen2_train['gen2_id'].isin([1332, 2505])]
no_match_kids_1 = gen2_train[gen2_train['gen2_id'].isin([2517, 3012])] 

gen2_train.loc[gen2_train['gen2_id'].isin([1332, 2505]), 'study_parent_sex'] = 0
gen2_train.loc[gen2_train['gen2_id'].isin([2517, 3012]), 'study_parent_sex'] = 1

# Preprocess data
gen1_train = preprocess_data(gen1_train, is_gen1=True)
gen2_train = preprocess_data(gen2_train, is_gen1=False)
gen1_test = preprocess_data(gen1_test, is_gen1=True)
gen2_test = preprocess_data(gen2_test, is_gen1=False)

# Extract features
child_ages = range(0, 10)

# Convert age ranges to integers and handle edge cases
min_parent_age = int(min(gen1_train['age'].min(), gen1_test['age'].min()))
max_parent_age = int(max(gen1_train['age'].max(), gen1_test['age'].max()))
parent_ages = list(range(min_parent_age, max_parent_age + 1))

# Extract features for children
child_features_train = gen2_train.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

child_features_test = gen2_test.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

# Extract features for parents
parent_features_train = gen1_train.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

parent_features_test = gen1_test.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

# Get parent IDs for linking
parent_ids_train = gen2_train[['gen2_id', 'study_parent_id_new']].drop_duplicates()
parent_ids_test = gen2_test[['gen2_id', 'study_parent_id_new']].drop_duplicates()

# Merge features
train_data = pd.merge(child_features_train, parent_ids_train, on='gen2_id')
train_data = pd.merge(train_data, parent_features_train, 
                     left_on='study_parent_id_new', 
                     right_on='gen1_id', 
                     how='left')
    
train_data.interpolate(method='linear', inplace=True)  
train_data.fillna(method='ffill', inplace=True)
train_data.fillna(method='bfill', inplace=True) 

test_data = pd.merge(child_features_test, parent_ids_test, on='gen2_id')
test_data = pd.merge(test_data, parent_features_test,
                    left_on='study_parent_id_new',
                    right_on='gen1_id',
                    how='left')

test_data.interpolate(method='linear', inplace=True)
test_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='bfill', inplace=True)

# Prepare target variables
target_ages = [10, 11, 12, 13, 14, 15, 16, 18]
target_train = gen2_train[gen2_train["AgeGr"].isin(target_ages)].pivot(
    index="gen2_id", 
    columns="AgeGr", 
    values="SHgt_cm_CLEANED"
)

# Ensure all required ages are present
for age in target_ages:
    if age not in target_train.columns:
        target_train[age] = np.nan

# Sort columns and interpolate
target_train = target_train.reindex(columns=target_ages)
target_train = target_train.interpolate(axis=1, method='linear')
target_train = target_train.fillna(method='ffill').fillna(method='bfill')

# Remove any columns with all NaN values
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

# Align features between train and test sets
common_columns = list(set(train_data.columns) & set(test_data.columns))
train_data = train_data[common_columns]
test_data = test_data[common_columns]

In [8]:
# Split training data
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore'),
    target_train,
    test_size=0.2,
    random_state=42
)

In [18]:
target_train

AgeGr,10,11,12,13,14,15,16,18
gen2_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1332.0,145.262229,150.670668,160.222405,164.623273,171.599576,178.575879,185.552181,199.504787
2330.0,131.799105,146.592755,153.569058,160.545361,167.521664,174.497966,181.474269,195.426874
2331.0,141.524124,150.015631,161.792710,171.815312,169.361521,176.337824,183.314127,197.266732
2505.0,139.257516,144.366273,150.663565,158.903055,165.879454,172.855853,179.832251,193.785048
2507.0,150.337479,156.747461,164.874826,159.046592,166.022894,172.999197,179.975500,193.928105
...,...,...,...,...,...,...,...,...
2825.0,138.702403,145.678706,152.655008,159.631311,166.607614,173.583916,180.560219,194.512824
2827.0,137.226438,147.350383,152.487799,155.343264,165.374217,170.255790,172.711099,174.151818
2829.0,141.468590,150.291349,160.823434,164.816989,166.242011,167.438765,168.077352,168.980405
2830.0,145.422500,152.627457,156.193087,158.433856,159.909045,159.748067,159.855986,192.503685


In [10]:
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of boosting rounds
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    "max_depth": [3, 5, 7, 9],  # Maximum depth of trees
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per tree
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "gamma": [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required to make a split
    "reg_alpha": [0, 0.01, 0.1, 1.0],  # L1 regularization
    "reg_lambda": [1.0, 2.0, 5.0]  # L2 regularization
}

xgboost_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgboost_reg,
    param_distributions=param_grid,
    n_iter=30, 
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best Hyperparameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


xgboost_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **random_search.best_params_)
xgboost_reg.fit(X_train, y_train)

# Evaluate on test set
y_pred = xgboost_reg.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Test MAE: {mae:.2f} cm")
print(f"Test RMSE: {rmse:.2f} cm")

multi_output_model = MultiOutputRegressor(xgboost_reg)
multi_output_model.fit(X_train, y_train)

# Evaluate model
y_pred = multi_output_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

# Make predictions on test data
test_features = test_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore')
test_predictions = multi_output_model.predict(test_features)

# Format and save predictions
formatted_predictions_xgb = format_predictions(
    predictions=test_predictions,
    gen2_ids=test_data['gen2_id']
)

# Round heights to 2 decimal places
formatted_predictions_xgb['SHgt_cm'] = formatted_predictions_xgb['SHgt_cm']

# Save predictions
# formatted_predictions_rf.to_csv("test_predictions.csv", index=False)

print("Process completed successfully!")
print(f"Final predictions shape: {formatted_predictions_xgb.shape}")
print("\nFirst few predictions:")
print(formatted_predictions_xgb.head(10))

Best Hyperparameters: {'subsample': 0.6, 'reg_lambda': 2.0, 'reg_alpha': 0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best MAE: 4.060417652130127
Test MAE: 3.76 cm
Test RMSE: 5.09 cm
MSE: 25.39261817932129
Process completed successfully!
Final predictions shape: (704, 2)

First few predictions:
  gen2id_age     SHgt_cm
0  2332.0_10  131.367004
1  2332.0_11  137.060242
2  2332.0_12  143.428024
3  2332.0_13  151.069717
4  2332.0_14  158.513565
5  2332.0_15  164.357712
6  2332.0_16  167.850662
7  2332.0_18  171.600845
8  2503.0_10  148.784225
9  2503.0_11  154.246597


In [11]:
# Define the parameter grid
param_grid = {
    "n_estimators": np.arange(50, 500, 50),  # Number of trees in the forest
    "max_depth": [None] + list(np.arange(5, 30, 5)),  # Maximum depth of the tree
    "min_samples_split": [2, 5, 10, 20],  # Minimum samples required to split a node
    "min_samples_leaf": [1, 2, 4, 10],  # Minimum samples required at a leaf node
    "max_features": ["sqrt", "log2", None],  # Number of features to consider for best split
    "bootstrap": [True, False],  # Whether to use bootstrap sampling
}

# Initialize the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Randomized Search CV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings to sample
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # Use MSE as evaluation metric
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Fit the model on data (replace X_train, y_train with actual data)
random_search.fit(X_train, y_train)

# Print best parameters (once fitted)
print("Best parameters found: ", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


rf = RandomForestRegressor(random_state=42, **random_search.best_params_)
rf.fit(X_train, y_train)

# Evaluate on test set
y_pred = rf.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Test MAE: {mae:.2f} cm")
print(f"Test RMSE: {rmse:.2f} cm")

multi_output_model = MultiOutputRegressor(rf)
multi_output_model.fit(X_train, y_train)

# Evaluate model
y_pred = multi_output_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

# Make predictions on test data
test_features = test_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore')
test_predictions = multi_output_model.predict(test_features)

# Format and save predictions
formatted_predictions_rf = format_predictions(
    predictions=test_predictions,
    gen2_ids=test_data['gen2_id']
)

# Round heights to 2 decimal places
formatted_predictions_rf['SHgt_cm'] = formatted_predictions_rf['SHgt_cm']

# Save predictions
# formatted_predictions_rf.to_csv("test_predictions.csv", index=False)

print("Process completed successfully!")
print(f"Final predictions shape: {formatted_predictions_rf.shape}")
print("\nFirst few predictions:")
print(formatted_predictions_rf.head(10))

Best parameters found:  {'n_estimators': np.int64(150), 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': np.int64(15), 'bootstrap': False}
Best MAE: 29.839653286611387
Test MAE: 3.83 cm
Test RMSE: 5.02 cm
MSE: 25.073769794460922
Process completed successfully!
Final predictions shape: (704, 2)

First few predictions:
  gen2id_age     SHgt_cm
0  2332.0_10  131.540159
1  2332.0_11  138.075674
2  2332.0_12  144.593711
3  2332.0_13  152.284787
4  2332.0_14  157.331796
5  2332.0_15  161.560191
6  2332.0_16  165.798184
7  2332.0_18  174.678769
8  2503.0_10  146.117488
9  2503.0_11  152.945105


In [17]:
pd.DataFrame(xgboost_reg.predict(X_val))

Unnamed: 0,0,1,2,3,4,5,6,7
0,137.205948,144.588394,149.008423,156.327957,160.506912,166.472946,167.159515,173.995178
1,145.453659,152.103226,161.733063,167.467834,171.079071,175.005981,177.625275,176.726929
2,147.124924,154.164764,160.472229,164.987381,172.974167,178.199951,181.098389,190.249557
3,148.663315,155.290482,162.953278,165.589951,173.170303,174.783829,183.878891,191.332901
4,146.168213,151.520309,155.178238,162.249863,167.367569,171.880432,177.002701,184.372894
5,136.369278,142.968491,149.210297,156.615707,161.572708,171.19075,178.657608,186.0634
6,133.985672,142.145874,147.541061,154.251205,162.534744,164.95549,166.113754,166.128235
7,140.81543,146.470169,152.888535,161.053253,165.840164,170.81456,173.02948,177.531952
8,135.780975,141.692337,145.322708,151.883987,157.832779,164.682434,170.419098,176.967926
9,148.660507,155.181046,161.841309,165.8806,172.997177,176.914398,181.421051,182.867569


In [12]:
formatted_predictions_rf

Unnamed: 0,gen2id_age,SHgt_cm
0,2332.0_10,131.540159
1,2332.0_11,138.075674
2,2332.0_12,144.593711
3,2332.0_13,152.284787
4,2332.0_14,157.331796
...,...,...
699,2831.0_13,156.321027
700,2831.0_14,162.715954
701,2831.0_15,167.383549
702,2831.0_16,171.240835


In [13]:
formatted_predictions_xgb

Unnamed: 0,gen2id_age,SHgt_cm
0,2332.0_10,131.367004
1,2332.0_11,137.060242
2,2332.0_12,143.428024
3,2332.0_13,151.069717
4,2332.0_14,158.513565
...,...,...
699,2831.0_13,155.568329
700,2831.0_14,161.484467
701,2831.0_15,164.783447
702,2831.0_16,170.233261


In [7]:
formatted_predictions.to_csv("wh_xgboost_submission_2.csv", index = False)

In [8]:
formatted_predictions.shape

(704, 2)

In [9]:
submission = pd.read_csv(DATA_PATH + "gen2_test_solution_template.csv")
submission.shape

(589, 2)

In [13]:
submission.merge(formatted_predictions, on="gen2id_age", how="inner").drop(columns=["SHgt_cm_x"]).rename(columns={"SHgt_cm_y": "SHgt_cm"}).to_csv("wh_xgboost_submission_2.csv", index = False)