## Kaggle Final Model
#### Import libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

### Load data

In [None]:
# Read the four input tables. gen2 rows reference a gen1 row through
# study_parent_id_new -> gen1_id (see the merge further down).
gen1_train, gen2_train, gen1_test, gen2_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "gen1_train_comp_final.csv",
        "gen2_train_comp_final.csv",
        "gen1_test_comp_final.csv",
        "gen2_test_upto9_comp_final.csv",
    )
)

In [None]:
def _fill_gaps_per_subject(df, id_col, value_col):
    """Return ``df[value_col]`` with NaNs filled independently per ``id_col`` group.

    Gaps are filled by linear interpolation first, then forward fill, then
    backward fill, so a value is never borrowed from another subject.
    ``transform`` keeps the result aligned on ``df``'s index, so the assignment
    at the call site does not depend on row order.
    """
    return df.groupby(id_col)[value_col].transform(
        lambda s: s.interpolate(method="linear").ffill().bfill()
    )


def preprocess_data(df, is_gen1=True):
    """Clean one raw measurement table and return it as a new DataFrame.

    Steps, in order:
      1. Round the age column to whole numbers (``age`` for gen1, ``AgeGr``
         for gen2).
      2. Average all rows that share the same identifying columns and age.
      3. Sort by subject id, then age.
      4. Add ``SHgt_cm_CLEANED`` (and, for gen2 tables that have ``Wgt_kg``,
         ``Wgt_kg_CLEANED``) with missing values filled per subject.
      5. Encode ``sex_assigned_at_birth`` as F -> 0, M -> 1 and, for gen2,
         ``study_parent_sex`` as father -> 0, mother -> 1. Any other value
         becomes NaN.

    Args:
        df: Raw table. It is not modified; a copy is processed instead.
        is_gen1: True for the gen1 schema, False for the gen2 schema.

    Returns:
        The cleaned table, one row per (subject, age) combination.
    """
    # Work on a copy so the caller's frame keeps its original age values/dtype.
    df = df.copy()

    if is_gen1:
        id_col, age_col = 'gen1_id', 'age'
        group_cols = ['gen1_id', 'sex_assigned_at_birth', 'age']
    else:
        id_col, age_col = 'gen2_id', 'AgeGr'
        group_cols = ['gen2_id', 'sex_assigned_at_birth', 'study_parent_sex',
                      'study_parent_id_new', 'AgeGr']

    df[age_col] = df[age_col].round().astype(int)

    # Rounding can make several measurements land on the same age; average them.
    df = df.groupby(group_cols, as_index=False).mean()
    df = df.sort_values(by=[id_col, age_col])

    df["SHgt_cm_CLEANED"] = _fill_gaps_per_subject(df, id_col, "SHgt_cm")
    if not is_gen1 and 'Wgt_kg' in df.columns:
        df["Wgt_kg_CLEANED"] = _fill_gaps_per_subject(df, id_col, "Wgt_kg")

    df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'F': 0, 'M': 1})
    if not is_gen1:
        df['study_parent_sex'] = df['study_parent_sex'].map({'father': 0, 'mother': 1})

    return df


In [None]:
# Clean every raw table; is_gen1 selects which column schema preprocess_data uses.
gen1_train, gen1_test = (
    preprocess_data(parent_frame, is_gen1=True)
    for parent_frame in (gen1_train, gen1_test)
)
gen2_train, gen2_test = (
    preprocess_data(child_frame, is_gen1=False)
    for child_frame in (gen2_train, gen2_test)
)

### Feature engineering function

In [None]:
def create_features(df, is_test=False, is_gen1=False):
    """Return a copy of ``df`` with derived growth features for gen2 tables.

    Gen1 tables are returned as an unchanged copy. For gen2 tables the
    following columns are added:

    * ``height_velocity``: change in ``SHgt_cm_CLEANED`` from the previous row
      of the same ``gen2_id`` (0 for that subject's first row). Rows are taken
      in their current order, so the frame is expected to be age-sorted.
    * ``bmi``: ``Wgt_kg_CLEANED`` divided by height in metres squared, or NaN
      when the table has no ``Wgt_kg_CLEANED`` column.
    * ``height_percentile``: fractional rank of the height among all rows of
      this frame with the same ``AgeGr``.
    * ``is_father``: 1 where ``study_parent_sex`` equals 0, else 0.

    Args:
        df: Preprocessed table; it is not modified.
        is_test: Accepted but not used by this function.
        is_gen1: True to skip feature creation (gen1 schema).

    Returns:
        A new DataFrame.
    """
    enriched = df.copy()
    if is_gen1:
        return enriched

    heights = enriched['SHgt_cm_CLEANED']

    # Growth between consecutive measurements of the same child.
    per_child_delta = enriched.groupby('gen2_id')['SHgt_cm_CLEANED'].diff()
    enriched['height_velocity'] = per_child_delta.fillna(0)

    # Body-mass index; heights are stored in centimetres.
    if 'Wgt_kg_CLEANED' not in enriched.columns:
        enriched['bmi'] = np.nan
    else:
        height_m = heights / 100
        enriched['bmi'] = enriched['Wgt_kg_CLEANED'] / height_m ** 2

    # Where the child stands among same-age rows of this frame.
    same_age = enriched.groupby('AgeGr')
    enriched['height_percentile'] = same_age['SHgt_cm_CLEANED'].rank(pct=True)

    # NOTE(review): is_father is derived only from study_parent_sex, yet it is
    # added only when a 'gen1_id' column is also present -- confirm intended.
    has_parent_info = (
        'gen1_id' in enriched.columns and 'study_parent_sex' in enriched.columns
    )
    if has_parent_info:
        enriched['is_father'] = enriched['study_parent_sex'].eq(0).astype(int)

    return enriched

In [None]:
# Derive features for all four tables, spelling out both flags at every call.
gen1_train = create_features(gen1_train, is_test=False, is_gen1=True)
gen2_train = create_features(gen2_train, is_test=False, is_gen1=False)
gen1_test = create_features(gen1_test, is_test=True, is_gen1=True)
gen2_test = create_features(gen2_test, is_test=True, is_gen1=False)

In [None]:
# Ages whose heights are the prediction targets.
target_ages = [10, 11, 12, 13, 14, 15, 16, 18]

# Predictor rows are strictly the measurements taken before the first target
# age (ages 0-9). Filtering on "not a target age" instead would also let any
# other age >= 10 that is absent from target_ages (e.g. 17) through as a
# predictor, and the later "latest measurement" selection would then pick it.
first_target_age = min(target_ages)
gen2_train_features = gen2_train[gen2_train["AgeGr"] < first_target_age]
gen2_train_targets = gen2_train[gen2_train["AgeGr"].isin(target_ages)]

# gen2_ids that have both predictor rows and at least one target row.
valid_gen2_ids = list(
    set(gen2_train_features['gen2_id']) & set(gen2_train_targets['gen2_id']))


In [None]:

# Keep only the children that have both predictor rows and target rows.
gen2_train_features = gen2_train_features[gen2_train_features['gen2_id'].isin(valid_gen2_ids)]
gen2_train_targets = gen2_train_targets[gen2_train_targets['gen2_id'].isin(valid_gen2_ids)]

# Wide target table: one row per gen2_id, one column per target age.
target_train = gen2_train_targets.pivot(
    index="gen2_id", columns="AgeGr", values="SHgt_cm_CLEANED")

# A child missing a target age takes the height of the nearest younger target
# age in its own row, else the nearest older one. ffill()/bfill() replace the
# deprecated fillna(method=...) spelling, whose warning this file silences.
target_train = target_train.ffill(axis=1).bfill(axis=1)

# Height at age 9 for each child that has an age-9 row.
latest_heights = gen2_train_features[gen2_train_features['AgeGr'] == 9].set_index('gen2_id')['SHgt_cm_CLEANED']


In [None]:

# Attach the study parent's gen1 columns to every child row. Columns that exist
# in both tables keep the child's name; the parent's copy gets "_parent".
# NOTE(review): the gen1 tables hold one row per (gen1_id, age), so this left
# join may repeat each child row once per parent row -- confirm that is intended.
parent_join = dict(
    left_on='study_parent_id_new',
    right_on='gen1_id',
    how='left',
    suffixes=('', '_parent'),
)
train_data = pd.merge(gen2_train_features, gen1_train, **parent_join)
test_data = pd.merge(gen2_test, gen1_test, **parent_join)


In [None]:

# Child-to-parent height ratio. Each frame's own median first replaces missing
# heights so the division cannot yield NaN from a missing operand.
for frame in (train_data, test_data):
    for height_col in ('SHgt_cm_CLEANED', 'SHgt_cm_CLEANED_parent'):
        frame[height_col] = frame[height_col].fillna(frame[height_col].median())
    frame['height_to_parent_ratio'] = (
        frame['SHgt_cm_CLEANED'] / frame['SHgt_cm_CLEANED_parent']
    )


In [None]:

# Collapse train_data to one row per child: the row at that child's highest AgeGr.
# NOTE(review): if the parent merge produced several rows for that child/age,
# idxmax keeps the first of them -- confirm which parent measurement that is.
train_data_latest = train_data.loc[train_data.groupby('gen2_id')['AgeGr'].idxmax()]

# Children present in both the features and the targets. They are taken in
# target_train's existing row order rather than by iterating a set, so the row
# order -- and with it the seeded train/validation split -- is reproducible.
has_features = target_train.index.isin(train_data_latest['gen2_id'])
common_gen2_ids = list(target_train.index[has_features])
target_train = target_train.loc[common_gen2_ids]

# Row i of the features now describes the same child as row i of the targets.
train_data_latest = train_data_latest.set_index('gen2_id').loc[target_train.index].reset_index()
# Explicit copy: later in-place NaN repairs must not write through to target_train.
target_array = target_train.reset_index(drop=True).to_numpy(copy=True)

# For test data, keep each child's most recent measurement. .copy() makes this
# an independent frame, since columns are assigned into it below.
test_data_latest = test_data.loc[test_data.groupby('gen2_id')['AgeGr'].idxmax()].copy()
test_gen2_ids = test_data_latest['gen2_id'].copy()

# Everything except identifiers is a candidate feature.
id_cols = ['gen1_id', 'gen2_id', 'study_parent_id_new']
feature_cols = [col for col in train_data_latest.columns if col not in id_cols]

# Drop candidates that are missing in more than na_threshold of the training rows.
na_threshold = 0.3
na_cols = [col for col in feature_cols if train_data_latest[col].isna().mean() > na_threshold]
print(f"Removing {len(na_cols)} columns with >{na_threshold:.0%} NaN values")
feature_cols = [col for col in feature_cols if col not in na_cols]

# A feature absent from the test frame is added as all-NaN so the imputer
# below can fill it with the training median.
for col in feature_cols:
    if col not in test_data_latest.columns:
        print(f"Missing column in test data: {col}")
        test_data_latest[col] = np.nan

# Median imputation, fitted on the training rows only and applied to both frames.
simple_imputer = SimpleImputer(strategy='median')
train_data_latest[feature_cols] = simple_imputer.fit_transform(train_data_latest[feature_cols])
test_data_latest[feature_cols] = simple_imputer.transform(test_data_latest[feature_cols])

### Apply KNN imputation

In [None]:
# KNN imputation, fitted on the training rows and applied to both frames.
# NOTE(review): the median SimpleImputer in the previous cell has already
# filled feature_cols, so this pass appears to have nothing left to impute --
# confirm, and drop one of the two imputers if so.
imputer = KNNImputer(n_neighbors=5)
train_data_latest[feature_cols] = imputer.fit_transform(train_data_latest[feature_cols])
test_data_latest[feature_cols] = imputer.transform(test_data_latest[feature_cols])

# Model on exactly the vetted feature_cols, in that order. Dropping only the id
# columns instead would bring back the columns rejected for having too many
# NaNs, and building the column list from a set would make the column order
# vary between runs.
common_columns = list(feature_cols)
train_features = train_data_latest[common_columns].copy()
test_features = test_data_latest[common_columns].copy()

# Safety net: any NaN still present in either frame gets the training median.
train_medians = train_features.median()
train_features = train_features.fillna(train_medians)
test_features = test_features.fillna(train_medians)

# Verify no NaN values remain
nan_count_train = train_features.isna().sum().sum()
nan_count_test = test_features.isna().sum().sum()
nan_count_target = np.isnan(target_array).sum()

print(f"NaN values in training features: {nan_count_train}")
print(f"NaN values in test features: {nan_count_test}")
print(f"NaN values in targets: {nan_count_target}")

# Replace any NaN target with the median of its own age column. A new array is
# built, so this also works when target_array is a read-only view.
if nan_count_target > 0:
    target_medians = np.nanmedian(target_array, axis=0)
    target_array = np.where(np.isnan(target_array), target_medians, target_array)


### Train-Validation Split + Model Training

In [None]:
# Hold out 20% of the children for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_features, target_array, test_size=0.2, random_state=42)

# One model is trained per target column and labelled with target_ages[i], so
# the two must line up. Fail here rather than print mislabelled scores.
if target_array.shape[1] != len(target_ages):
    raise ValueError(
        f"Expected {len(target_ages)} target columns (one per target age), "
        f"got {target_array.shape[1]}")

# Base learners of the stacking ensemble. Both are seeded so that repeated runs
# give the same validation scores and predictions.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0
# (its default is 0), so row subsampling is probably inactive here -- confirm
# against the LightGBM parameter docs before changing it.
base_models = [
    ('lgbm', LGBMRegressor(n_estimators=200, max_depth=10, learning_rate=0.05,
                           subsample=0.8, colsample_bytree=0.8, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=2,
                                 random_state=42)),
]
final_estimator = Ridge()

# Wrap every base learner in a pipeline that standardises the features first.
pipelines = [
    (name, Pipeline([('scaler', StandardScaler()), ('model', model)]))
    for name, model in base_models
]

# One stacking ensemble per target age.
stacking_regressors = [
    StackingRegressor(estimators=pipelines, final_estimator=final_estimator)
    for _ in range(target_array.shape[1])
]

# NOTE: these models see only the 80% training split; the same fitted models
# are used later for the test predictions.
for i, model in enumerate(stacking_regressors):
    model.fit(X_train, y_train[:, i])
    val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val[:, i], val_pred))
    print(f"Validation RMSE for age {target_ages[i]}: {val_rmse:.3f}")


### Make predictions on test data

In [None]:
# One prediction column per fitted model, in the same order as the target ages.
y_pred = np.column_stack(
    [age_model.predict(test_features) for age_model in stacking_regressors])

# Long format expected by the submission: one "<gen2_id>_<age>" row per child and age.
prediction_ages = [10, 11, 12, 13, 14, 15, 16, 18]
formatted_rows = [
    [f"{gen2_id}_{age}", y_pred[row_pos, age_pos]]
    for row_pos, gen2_id in enumerate(test_gen2_ids)
    for age_pos, age in enumerate(prediction_ages)
]

formatted_predictions = pd.DataFrame(formatted_rows, columns=['gen2id_age', 'SHgt_cm'])
formatted_predictions.to_csv("test_predictions.csv", index=False)

In [None]:
# Reload the prediction file written by the previous cell for a sanity check.
output = pd.read_csv("test_predictions.csv")
# NOTE: these two bare expressions are not the cell's last statement, so a
# notebook will not display them.
output.head(10)
output.shape

# Unique gen2id_age keys present in our predictions.
distinct_ids_1 = output['gen2id_age'].drop_duplicates()

print(distinct_ids_1)

In [None]:
# Load the solution template; its gen2id_age column is compared against our
# predictions in the next cell.
ex_output = pd.read_csv("gen2_test_solution_template.csv")
# Bare expression: shows the frame when run as a notebook cell.
ex_output

In [None]:
# Unique gen2id_age keys listed in the solution template.
distinct_ids_2 = ex_output['gen2id_age'].drop_duplicates()

# Keys we predicted that the template does not ask for: drop those rows.
difference = distinct_ids_1[~distinct_ids_1.isin(distinct_ids_2)]
output = output[~output['gen2id_age'].isin(difference)]

# The reverse check: template keys for which we have no prediction. These
# cannot be fixed here, so report them instead of writing a silently
# incomplete file.
missing_ids = distinct_ids_2[~distinct_ids_2.isin(output['gen2id_age'])]
if not missing_ids.empty:
    print(f"Warning: {len(missing_ids)} template ids have no prediction, "
          f"e.g. {missing_ids.head().tolist()}")

# NOTE(review): the file name says "catboost", but no CatBoost model appears in
# this file; the name is kept so anything reading this path keeps working.
output.to_csv("test_predictions_catboost.csv", index=False)

output.shape