In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

from longitudinal.settings.constants import DATA_PATH

In [3]:
df = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
is_gen1 = True
df.shape

(3636, 4)

In [4]:
from sklearn.metrics import mean_absolute_error


def preprocess_data(df, is_gen1=True):
    """
    Preprocesses the data for both generation 1 and 2 datasets
    """

    # Remove columns that are completely missing
    df = df.dropna(axis=1, how="all")

    # Ensure only numeric columns are passed to imputer
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df_numeric = df[numeric_cols]

    # Apply IterativeImputer
    df_imputer = IterativeImputer(max_iter=10, random_state=0, min_value=0, initial_strategy="mean")
    df_imputed = pd.DataFrame(df_imputer.fit_transform(df_numeric), columns=df_numeric.columns)

    # Restore categorical columns if necessary
    df_final = df.copy()
    df_final[df_numeric.columns] = df_imputed

    # Round ages and convert to int
    age_col = 'age' if is_gen1 else 'AgeGr'
    df[age_col] = df[age_col].round().astype(int)

    # try:
    #     df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'M': 1, 'F': 0})
    # except:
    #     pass
    # try:
    #     df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'M': 1, 'F': 0})
    # except:
    #     pass
    # try:
    #     df['study_parent_sex'] = df['study_parent_sex'].map({'mother': 1, 'father': 0})
    # except:
    #     pass
    

    # df_imputer = IterativeImputer(max_iter=10, random_state=0, min_value=0, initial_strategy="mean")
    # df = pd.DataFrame(df_imputer.fit_transform(df), columns=df.columns)
    
    # Group columns
    group_cols = ['gen1_id', 'sex_assigned_at_birth', 'age'] if is_gen1 else \
                ['gen2_id', 'sex_assigned_at_birth', 'study_parent_sex', 'study_parent_id_new', 'AgeGr']
    
    # Group and calculate mean
    df = df.groupby(group_cols, as_index=False).mean()
    
    # Sort and interpolate
    sort_cols = ['gen1_id', 'age'] if is_gen1 else ['gen2_id', 'AgeGr']
    id_col = 'gen1_id' if is_gen1 else 'gen2_id'
    
    df = df.sort_values(by=sort_cols)
    df_grouped = df.groupby(id_col)
    
    # Interpolate height
    df["SHgt_cm_CLEANED"] = df_grouped["SHgt_cm"].apply(lambda x: x.interpolate(method="linear")).bfill().values
    
    # Interpolate weight for gen2 only
    if not is_gen1:
        df["Wgt_kg_CLEANED"] = df_grouped["Wgt_kg"].apply(lambda x: x.interpolate(method="linear")).bfill().values
    
    # Encode categorical variables
    df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'F': 0, 'M': 1})
    if not is_gen1:
        df['study_parent_sex'] = df['study_parent_sex'].map({'father': 0, 'mother': 1})
    
    return df

def extract_features(df, prefix, ages, age_column):
    """
    Extract features like height at specific ages and growth velocity
    """
    features = {}
    for age in ages:
        height_at_age = df[df[age_column] == age]["SHgt_cm_CLEANED"].values
        if len(height_at_age) > 0:
            features[f"{prefix}_height_age_{age}"] = height_at_age[0]
        else:
            features[f"{prefix}_height_age_{age}"] = np.nan
    
    # Add growth velocity features
    if len(ages) > 1:
        for i in range(len(ages)-1):
            age1, age2 = ages[i], ages[i+1]
            height1 = features.get(f"{prefix}_height_age_{age1}")
            height2 = features.get(f"{prefix}_height_age_{age2}")
            if height1 is not None and height2 is not None and not (np.isnan(height1) or np.isnan(height2)):
                features[f"{prefix}_velocity_{age1}_{age2}"] = (height2 - height1) / (age2 - age1)
    
    return pd.Series(features)

def format_predictions(predictions, gen2_ids):
    """
    Format predictions into the required output format:
    gen2id_age SHgt_cm
    """
    formatted_rows = []
    prediction_ages = [10, 11, 12, 13, 14, 15, 16, 18]  # Excluding age 17
    
    for idx, gen2_id in enumerate(gen2_ids):
        for col_idx, age in enumerate(prediction_ages):
            row_id = f"{gen2_id}_{age}"
            height = predictions[idx, col_idx]
            formatted_rows.append([row_id, height])
    
    formatted_df = pd.DataFrame(formatted_rows, columns=['gen2id_age', 'SHgt_cm'])
    return formatted_df

# Load data
gen1_train = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
gen2_train = pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv")
gen1_test = pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv")
gen2_test = pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv")

# Fix inconsistent study_parent_sex values
# There 'study_parent_sex' not consistent over time
no_match_kids_0 = gen2_train[gen2_train['gen2_id'].isin([1332, 2505])]
no_match_kids_1 = gen2_train[gen2_train['gen2_id'].isin([2517, 3012])] 

gen2_train.loc[gen2_train['gen2_id'].isin([1332, 2505]), 'study_parent_sex'] = 0
gen2_train.loc[gen2_train['gen2_id'].isin([2517, 3012]), 'study_parent_sex'] = 1

# Preprocess data
gen1_train = preprocess_data(gen1_train, is_gen1=True)
gen2_train = preprocess_data(gen2_train, is_gen1=False)
gen1_test = preprocess_data(gen1_test, is_gen1=True)
gen2_test = preprocess_data(gen2_test, is_gen1=False)

# Extract features
child_ages = range(0, 10)

# Convert age ranges to integers and handle edge cases
min_parent_age = int(min(gen1_train['age'].min(), gen1_test['age'].min()))
max_parent_age = int(max(gen1_train['age'].max(), gen1_test['age'].max()))
parent_ages = list(range(min_parent_age, max_parent_age + 1))

# Extract features for children
child_features_train = gen2_train.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

child_features_test = gen2_test.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

# Extract features for parents
parent_features_train = gen1_train.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

parent_features_test = gen1_test.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

# Get parent IDs for linking
parent_ids_train = gen2_train[['gen2_id', 'study_parent_id_new']].drop_duplicates()
parent_ids_test = gen2_test[['gen2_id', 'study_parent_id_new']].drop_duplicates()

# Merge features
train_data = pd.merge(child_features_train, parent_ids_train, on='gen2_id')
train_data = pd.merge(train_data, parent_features_train, 
                     left_on='study_parent_id_new', 
                     right_on='gen1_id', 
                     how='left')
    
train_data.interpolate(method='linear', inplace=True)  
train_data.fillna(method='ffill', inplace=True)
train_data.fillna(method='bfill', inplace=True) 

test_data = pd.merge(child_features_test, parent_ids_test, on='gen2_id')
test_data = pd.merge(test_data, parent_features_test,
                    left_on='study_parent_id_new',
                    right_on='gen1_id',
                    how='left')

test_data.interpolate(method='linear', inplace=True)
test_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='bfill', inplace=True)

# Prepare target variables
target_ages = [10, 11, 12, 13, 14, 15, 16, 18]
target_train = gen2_train[gen2_train["AgeGr"].isin(target_ages)].pivot(
    index="gen2_id", 
    columns="AgeGr", 
    values="SHgt_cm_CLEANED"
)

# Ensure all required ages are present
for age in target_ages:
    if age not in target_train.columns:
        target_train[age] = np.nan

# Sort columns and interpolate
target_train = target_train.reindex(columns=target_ages)
target_train = target_train.interpolate(axis=1, method='linear')
target_train = target_train.fillna(method='ffill').fillna(method='bfill')

# Remove any columns with all NaN values
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

# Align features between train and test sets
common_columns = list(set(train_data.columns) & set(test_data.columns))
train_data = train_data[common_columns]
test_data = test_data[common_columns]

  child_features_train = gen2_train.groupby("gen2_id").apply(
  child_features_test = gen2_test.groupby("gen2_id").apply(
  parent_features_train = gen1_train.groupby("gen1_id").apply(
  parent_features_test = gen1_test.groupby("gen1_id").apply(
  train_data.fillna(method='ffill', inplace=True)
  train_data.fillna(method='bfill', inplace=True)
  test_data.fillna(method='ffill', inplace=True)
  test_data.fillna(method='bfill', inplace=True)
  target_train = target_train.fillna(method='ffill').fillna(method='bfill')


In [6]:
# Split training data
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore'),
    target_train,
    test_size=0.2,
    random_state=42
)

In [7]:
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of boosting rounds
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    "max_depth": [3, 5, 7, 9],  # Maximum depth of trees
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per tree
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "gamma": [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required to make a split
    "reg_alpha": [0, 0.01, 0.1, 1.0],  # L1 regularization
    "reg_lambda": [1.0, 2.0, 5.0]  # L2 regularization
}

model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=30, 
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best Hyperparameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **random_search.best_params_)
model.fit(X_train, y_train)

# Evaluate on test set
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Test MAE: {mae:.2f} cm")
print(f"Test RMSE: {rmse:.2f} cm")

multi_output_model = MultiOutputRegressor(model)
multi_output_model.fit(X_train, y_train)

# Evaluate model
y_pred = multi_output_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

# Make predictions on test data
test_features = test_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore')
test_predictions = multi_output_model.predict(test_features)

# Format and save predictions
formatted_predictions = format_predictions(
    predictions=test_predictions,
    gen2_ids=test_data['gen2_id']
)

# Round heights to 2 decimal places
formatted_predictions['SHgt_cm'] = formatted_predictions['SHgt_cm']

# Save predictions
# formatted_predictions.to_csv("test_predictions.csv", index=False)

print("Process completed successfully!")
print(f"Final predictions shape: {formatted_predictions.shape}")
print("\nFirst few predictions:")
print(formatted_predictions.head(10))

Best Hyperparameters: {'subsample': 1.0, 'reg_lambda': 1.0, 'reg_alpha': 0.01, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.6}
Best MAE: 4.955441474914551
Test MAE: 4.26 cm
Test RMSE: 6.03 cm
MSE: 36.822608947753906
Process completed successfully!
Final predictions shape: (704, 2)

First few predictions:
  gen2id_age     SHgt_cm
0    2332_10  128.413208
1    2332_11  134.490616
2    2332_12  140.101913
3    2332_13  144.901367
4    2332_14  150.920502
5    2332_15  153.667099
6    2332_16  158.042175
7    2332_18  160.882416
8    2503_10  139.112305
9    2503_11  142.112961


In [7]:
formatted_predictions.to_csv("wh_xgboost_submission_2.csv", index = False)

In [8]:
formatted_predictions.shape

(704, 2)

In [9]:
submission = pd.read_csv(DATA_PATH + "gen2_test_solution_template.csv")
submission.shape

(589, 2)

In [13]:
submission.merge(formatted_predictions, on="gen2id_age", how="inner").drop(columns=["SHgt_cm_x"]).rename(columns={"SHgt_cm_y": "SHgt_cm"}).to_csv("wh_xgboost_submission_2.csv", index = False)