In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

from longitudinal.settings.constants import DATA_PATH

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
is_gen1 = True
df.shape

(3636, 4)

In [3]:
from sklearn.metrics import mean_absolute_error


def preprocess_data(df, is_gen1=True):
    """
    Preprocesses the data for both generation 1 and 2 datasets
    """

    # Remove columns that are completely missing
    df = df.dropna(axis=1, how="all")

    # Encode categorical variables before imputation
    if "sex_assigned_at_birth" in df.columns:
        df['sex_assigned_at_birth'] = df['sex_assigned_at_birth'].map({'M': 1, 'F': 0}).astype(float)
    if "study_parent_sex" in df.columns:
        df['study_parent_sex'] = df['study_parent_sex'].map({'mother': 1, 'father': 0}).astype(float)

    # Ensure only numeric columns are passed to imputer
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df_numeric = df[numeric_cols]

    # Apply IterativeImputer
    df_imputer = IterativeImputer(max_iter=10, random_state=0, min_value=0, initial_strategy="mean")
    df_imputed = pd.DataFrame(df_imputer.fit_transform(df_numeric), columns=df_numeric.columns)

    # Restore categorical columns if necessary
    df_final = df.copy()
    df_final[numeric_cols] = df_imputed
    df = df_final

    # Round ages and convert to int
    age_col = 'age' if is_gen1 else 'AgeGr'
    df[age_col] = df[age_col].round().astype(int)

    # Group columns
    group_cols = ['gen1_id', 'sex_assigned_at_birth', 'age'] if is_gen1 else \
                ['gen2_id', 'sex_assigned_at_birth', 'study_parent_sex', 'study_parent_id_new', 'AgeGr']
    
    # Group and calculate mean
    df = df.groupby(group_cols, as_index=False).mean()
    
    # Sort and interpolate
    sort_cols = ['gen1_id', 'age'] if is_gen1 else ['gen2_id', 'AgeGr']
    id_col = 'gen1_id' if is_gen1 else 'gen2_id'
    
    df = df.sort_values(by=sort_cols)
    df_grouped = df.groupby(id_col)
    
    # Interpolate height
    df["SHgt_cm_CLEANED"] = df_grouped["SHgt_cm"].apply(
        lambda x: x.interpolate(method="polynomial", order=2) if x.count() > 2 else x.interpolate(method="linear")
    ).bfill().values
    
    # Interpolate weight for gen2 only
    if not is_gen1:
        df["Wgt_kg_CLEANED"] = df_grouped["Wgt_kg"].apply(
            lambda x: x.interpolate(method="polynomial", order=2) if x.count() > 2 else x.interpolate(method="linear")
        ).bfill().values
    
    return df

def extract_features(df, prefix, ages, age_column):
    """
    Extract features like height at specific ages and growth velocity
    """
    features = {}
    for age in ages:
        height_at_age = df[df[age_column] == age]["SHgt_cm_CLEANED"].values
        if len(height_at_age) > 0:
            features[f"{prefix}_height_age_{age}"] = height_at_age[0]
        else:
            features[f"{prefix}_height_age_{age}"] = np.nan
    
    # Add growth velocity features
    if len(ages) > 1:
        for i in range(len(ages)-1):
            age1, age2 = ages[i], ages[i+1]
            height1 = features.get(f"{prefix}_height_age_{age1}")
            height2 = features.get(f"{prefix}_height_age_{age2}")
            if height1 is not None and height2 is not None and not (np.isnan(height1) or np.isnan(height2)):
                features[f"{prefix}_velocity_{age1}_{age2}"] = (height2 - height1) / (age2 - age1)
    
    return pd.Series(features)

def format_predictions(predictions, gen2_ids):
    """
    Format predictions into the required output format:
    gen2id_age SHgt_cm
    """
    formatted_rows = []
    prediction_ages = [10, 11, 12, 13, 14, 15, 16, 17, 18]  # Excluding age 17
    
    for idx, gen2_id in enumerate(gen2_ids):
        for col_idx, age in enumerate(prediction_ages):
            row_id = f"{gen2_id}_{age}"
            height = predictions[idx, col_idx]
            formatted_rows.append([row_id, height])
    
    formatted_df = pd.DataFrame(formatted_rows, columns=['gen2id_age', 'SHgt_cm'])
    return formatted_df

# Load data
gen1_train = pd.read_csv(DATA_PATH + "gen1_train_comp_final.csv")
gen2_train = pd.read_csv(DATA_PATH + "gen2_train_comp_final.csv")
gen1_test = pd.read_csv(DATA_PATH + "gen1_test_comp_final.csv")
gen2_test = pd.read_csv(DATA_PATH + "gen2_test_upto9_comp_final.csv")

# Fix inconsistent study_parent_sex values
# There 'study_parent_sex' not consistent over time
no_match_kids_0 = gen2_train[gen2_train['gen2_id'].isin([1332, 2505])]
no_match_kids_1 = gen2_train[gen2_train['gen2_id'].isin([2517, 3012])] 

gen2_train.loc[gen2_train['gen2_id'].isin([1332, 2505]), 'study_parent_sex'] = 0
gen2_train.loc[gen2_train['gen2_id'].isin([2517, 3012]), 'study_parent_sex'] = 1

# Preprocess data
gen1_train = preprocess_data(gen1_train, is_gen1=True)
gen2_train = preprocess_data(gen2_train, is_gen1=False)
gen1_test = preprocess_data(gen1_test, is_gen1=True)
gen2_test = preprocess_data(gen2_test, is_gen1=False)

# Extract features
child_ages = range(0, 10)

# Convert age ranges to integers and handle edge cases
min_parent_age = int(min(gen1_train['age'].min(), gen1_test['age'].min()))
max_parent_age = int(max(gen1_train['age'].max(), gen1_test['age'].max()))
parent_ages = list(range(min_parent_age, max_parent_age + 1))

# Extract features for children
child_features_train = gen2_train.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

child_features_test = gen2_test.groupby("gen2_id").apply(
    extract_features, prefix="child", ages=child_ages, age_column="AgeGr"
).reset_index()

# Extract features for parents
parent_features_train = gen1_train.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

parent_features_test = gen1_test.groupby("gen1_id").apply(
    extract_features, prefix="parent", ages=parent_ages, age_column="age"
).reset_index()

# Get parent IDs for linking
parent_ids_train = gen2_train[['gen2_id', 'study_parent_id_new']].drop_duplicates()
parent_ids_test = gen2_test[['gen2_id', 'study_parent_id_new']].drop_duplicates()

# Merge features
train_data = pd.merge(child_features_train, parent_ids_train, on='gen2_id')
train_data = pd.merge(train_data, parent_features_train, 
                     left_on='study_parent_id_new', 
                     right_on='gen1_id', 
                     how='left')
    
train_data.interpolate(method='linear', inplace=True)  
train_data.fillna(method='ffill', inplace=True)
train_data.fillna(method='bfill', inplace=True) 

test_data = pd.merge(child_features_test, parent_ids_test, on='gen2_id')
test_data = pd.merge(test_data, parent_features_test,
                    left_on='study_parent_id_new',
                    right_on='gen1_id',
                    how='left')

test_data.interpolate(method='linear', inplace=True)
test_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='bfill', inplace=True)

# Prepare target variables
target_ages = [10, 11, 12, 13, 14, 15, 16, 17, 18]
target_train = gen2_train[gen2_train["AgeGr"].isin(target_ages)].pivot(
    index="gen2_id", 
    columns="AgeGr", 
    values="SHgt_cm_CLEANED"
)

# Ensure all required ages are present
for age in target_ages:
    if age not in target_train.columns:
        target_train[age] = np.nan

# Sort columns and interpolate
target_train = target_train.reindex(columns=target_ages)
target_train = target_train.interpolate(axis=1, method='linear')
target_train = target_train.fillna(method='ffill').fillna(method='bfill')

# Remove any columns with all NaN values
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

# Align features between train and test sets
common_columns = list(set(train_data.columns) & set(test_data.columns))
train_data = train_data[common_columns]
test_data = test_data[common_columns]

In [4]:
# Split training data
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore'),
    target_train,
    test_size=0.2,
    random_state=42
)

In [32]:
target_train

AgeGr,10,11,12,13,14,15,16,17,18
gen2_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1332.0,145.262229,150.670668,160.222405,164.623273,171.599576,178.575879,185.552181,192.528484,199.504787
2330.0,131.799105,146.592755,153.569058,160.545361,167.521664,174.497966,181.474269,188.450572,195.426874
2331.0,141.524124,150.015631,161.792710,171.815312,169.361521,176.337824,183.314127,190.290430,197.266732
2505.0,139.257516,144.366273,150.663565,158.903055,165.879454,172.855853,179.832251,186.808650,193.785048
2507.0,150.337479,156.747461,164.874826,159.046592,166.022894,172.999197,179.975500,186.951802,193.928105
...,...,...,...,...,...,...,...,...,...
2825.0,138.702403,145.678706,152.655008,159.631311,166.607614,173.583916,180.560219,187.536522,194.512824
2827.0,137.226438,147.350383,152.487799,155.343264,165.374217,170.255790,172.711099,173.431458,174.151818
2829.0,141.468590,150.291349,160.823434,164.816989,166.242011,167.438765,168.077352,168.528878,168.980405
2830.0,145.422500,152.627457,156.193087,158.433856,159.909045,159.748067,159.855986,176.179835,192.503685


In [33]:
y_train

AgeGr,10,11,12,13,14,15,16,17,18
gen2_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2631.0,143.042022,150.180173,156.728915,162.670357,165.603767,172.580070,179.556373,186.532676,193.508978
2679.0,145.353045,151.626757,156.851685,161.437572,165.326802,172.028609,177.635524,180.352143,183.068762
2777.0,148.446154,153.857966,159.071699,166.760890,177.072832,183.116858,186.548369,188.272532,189.996695
2553.0,155.031660,160.923434,164.355868,160.538018,167.514321,174.490623,181.466926,188.443229,195.419531
2722.0,150.893923,156.345451,163.344689,167.772201,171.969990,172.564448,173.506096,173.448711,173.391326
...,...,...,...,...,...,...,...,...,...
2672.0,134.294750,139.437493,146.936546,154.292560,157.517112,160.917344,161.660344,162.903967,164.147590
2521.0,139.330904,143.768080,147.265142,153.612438,162.362470,170.830936,178.194150,186.932985,195.671821
2645.0,142.349893,146.435034,152.011726,156.993556,164.132367,172.120912,181.228489,183.214650,185.200811
2806.0,137.414478,145.978641,153.401218,158.907505,161.047309,162.218524,162.408216,162.694316,162.980415


In [31]:
train_data

Unnamed: 0,parent_height_age_18,parent_velocity_4_5,parent_height_age_11,parent_height_age_16,child_height_age_0,parent_height_age_8,parent_velocity_15_16,parent_velocity_3_4,parent_height_age_4,parent_height_age_10,...,parent_height_age_13,parent_velocity_10_11,parent_velocity_16_17,parent_height_age_19,parent_height_age_1,parent_height_age_3,parent_height_age_20,parent_velocity_6_7,parent_velocity_18_19,child_velocity_3_4
0,187.478546,7.707125,152.345734,185.229461,60.161104,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,9.313594
1,187.478546,7.707125,152.345734,185.229461,72.615222,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,4.225584
2,187.478546,7.707125,152.345734,185.229461,68.754287,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,5.055503
3,187.478546,7.707125,152.345734,185.229461,58.894508,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,5.664513
4,187.478546,7.707125,152.345734,185.229461,61.609136,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,8.721616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,182.656130,5.997087,146.848890,170.672888,63.062708,131.094769,1.137305,7.850813,106.763902,141.510714,...,163.232130,5.338175,-1.139493,169.941494,72.197089,98.913089,180.627368,5.393826,-12.714636,6.532380
188,182.674533,5.541563,144.362457,173.282151,62.050382,129.250975,1.687749,7.352158,105.690654,139.634804,...,161.504147,4.727653,-0.035486,173.512205,73.863204,98.338496,183.263540,5.568030,-9.162328,7.803524
189,182.692935,5.086040,141.876024,175.891414,58.368564,127.407181,2.238194,6.853503,104.617406,137.758894,...,159.776165,4.117130,1.068521,177.082916,75.529320,97.763903,185.899711,5.742234,-5.610020,6.716232
190,176.571363,5.777911,147.926120,168.269721,60.459271,127.413688,2.265626,12.768845,104.578596,138.105427,...,160.624364,9.820694,-2.238160,166.097378,75.965851,91.809751,178.513879,5.733060,-10.473984,10.341753


In [6]:
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of boosting rounds
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    "max_depth": [3, 5, 7, 9],  # Maximum depth of trees
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per tree
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "gamma": [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required to make a split
    "reg_alpha": [0, 0.01, 0.1, 1.0],  # L1 regularization
    "reg_lambda": [1.0, 2.0, 5.0]  # L2 regularization
}

xgboost_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgboost_reg,
    param_distributions=param_grid,
    n_iter=30, 
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best Hyperparameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


xgboost_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **random_search.best_params_)
xgboost_reg.fit(X_train, y_train)

# Evaluate on test set
y_pred = xgboost_reg.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Test MAE: {mae:.2f} cm")
print(f"Test RMSE: {rmse:.2f} cm")

multi_output_model = MultiOutputRegressor(xgboost_reg)
multi_output_model.fit(X_train, y_train)

# Evaluate model
y_pred = multi_output_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

# Make predictions on test data
test_features = test_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore')
test_predictions = multi_output_model.predict(test_features)

# Format and save predictions
formatted_predictions_xgb = format_predictions(
    predictions=test_predictions,
    gen2_ids=test_data['gen2_id']
)

# Round heights to 2 decimal places
formatted_predictions_xgb['SHgt_cm'] = formatted_predictions_xgb['SHgt_cm']

# Save predictions
# formatted_predictions_rf.to_csv("test_predictions.csv", index=False)

print("Process completed successfully!")
print(f"Final predictions shape: {formatted_predictions_xgb.shape}")
print("\nFirst few predictions:")
print(formatted_predictions_xgb.head(10))

Best Hyperparameters: {'subsample': 0.6, 'reg_lambda': 2.0, 'reg_alpha': 0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best MAE: 4.3154161930084225
Test MAE: 3.96 cm
Test RMSE: 5.20 cm
MSE: 26.40380859375
Process completed successfully!
Final predictions shape: (792, 2)

First few predictions:
  gen2id_age     SHgt_cm
0  2332.0_10  131.341980
1  2332.0_11  136.605377
2  2332.0_12  143.343384
3  2332.0_13  150.244293
4  2332.0_14  158.064255
5  2332.0_15  163.737595
6  2332.0_16  167.174713
7  2332.0_17  170.819489
8  2332.0_18  172.557083
9  2503.0_10  148.003342


In [7]:
# Define the parameter grid
param_grid = {
    "n_estimators": np.arange(50, 500, 50),  # Number of trees in the forest
    "max_depth": [None] + list(np.arange(5, 30, 5)),  # Maximum depth of the tree
    "min_samples_split": [2, 5, 10, 20],  # Minimum samples required to split a node
    "min_samples_leaf": [1, 2, 4, 10],  # Minimum samples required at a leaf node
    "max_features": ["sqrt", "log2", None],  # Number of features to consider for best split
    "bootstrap": [True, False],  # Whether to use bootstrap sampling
}

# Initialize the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Randomized Search CV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings to sample
    cv=5,  # 5-fold cross-validation
    scoring="neg_mean_squared_error",  # Use MSE as evaluation metric
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Fit the model on data (replace X_train, y_train with actual data)
random_search.fit(X_train, y_train)

# Print best parameters (once fitted)
print("Best parameters found: ", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


rf = RandomForestRegressor(random_state=42, **random_search.best_params_)
rf.fit(X_train, y_train)

# Evaluate on test set
y_pred = rf.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Test MAE: {mae:.2f} cm")
print(f"Test RMSE: {rmse:.2f} cm")

multi_output_model = MultiOutputRegressor(rf)
multi_output_model.fit(X_train, y_train)

# Evaluate model
y_pred = multi_output_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"MSE: {mse}")

# Make predictions on test data
test_features = test_data.drop(['gen2_id', 'study_parent_id_new', 'gen1_id'], axis=1, errors='ignore')
test_predictions = multi_output_model.predict(test_features)

# Format and save predictions
formatted_predictions_rf = format_predictions(
    predictions=test_predictions,
    gen2_ids=test_data['gen2_id']
)

# Round heights to 2 decimal places
formatted_predictions_rf['SHgt_cm'] = formatted_predictions_rf['SHgt_cm']

# Save predictions
# formatted_predictions_rf.to_csv("test_predictions.csv", index=False)

print("Process completed successfully!")
print(f"Final predictions shape: {formatted_predictions_rf.shape}")
print("\nFirst few predictions:")
print(formatted_predictions_rf.head(10))

Best parameters found:  {'n_estimators': np.int64(150), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
Best MAE: 33.59234736019692
Test MAE: 4.03 cm
Test RMSE: 5.26 cm
MSE: 28.676473515803796
Process completed successfully!
Final predictions shape: (792, 2)

First few predictions:
  gen2id_age     SHgt_cm
0  2332.0_10  132.215608
1  2332.0_11  139.470143
2  2332.0_12  146.414216
3  2332.0_13  152.246598
4  2332.0_14  157.231683
5  2332.0_15  162.867164
6  2332.0_16  167.380629
7  2332.0_17  172.193925
8  2332.0_18  174.333919
9  2503.0_10  145.442336


In [8]:
pd.DataFrame(xgboost_reg.predict(X_val))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,137.015182,145.168976,149.956436,155.75708,159.418243,164.916687,166.753616,171.061829,174.246124
1,145.927567,152.414551,161.36676,166.810013,171.580963,174.572861,175.904846,177.290649,179.150223
2,147.338654,155.422562,161.067093,164.07962,172.345001,179.942078,182.200531,185.326767,188.054764
3,149.480087,155.753189,163.248871,164.251465,173.6315,175.994354,182.823532,185.989105,188.068588
4,145.125534,149.952225,155.653656,161.407181,166.916245,173.598709,179.004761,181.326416,184.83667
5,135.722534,143.263687,148.650558,157.047653,164.122589,172.206131,175.599274,180.619949,186.426193
6,134.313629,141.048111,146.383026,153.993866,159.306839,166.099991,165.403534,167.099655,165.667679
7,140.737762,146.546936,152.330795,160.720505,165.069153,169.928879,172.001465,175.056641,173.320312
8,135.292435,140.989975,147.276657,153.681168,157.585938,164.95929,168.497452,171.690536,175.678741
9,149.384247,155.308609,160.47551,164.615585,173.863724,178.03418,181.951385,181.992279,186.196198


In [20]:
formatted_predictions_rf["gen2id_age"] = formatted_predictions_rf["gen2id_age"].str.replace(".0", "")
formatted_predictions_rf

Unnamed: 0,gen2id_age,SHgt_cm
0,2332_10,132.215608
1,2332_11,139.470143
2,2332_12,146.414216
3,2332_13,152.246598
4,2332_14,157.231683
...,...,...
787,2831_14,162.746191
788,2831_15,166.848577
789,2831_16,170.423711
790,2831_17,173.485272


In [21]:
formatted_predictions_xgb["gen2id_age"] = formatted_predictions_xgb["gen2id_age"].str.replace(".0", "")
formatted_predictions_xgb

Unnamed: 0,gen2id_age,SHgt_cm
0,2332_10,131.341980
1,2332_11,136.605377
2,2332_12,143.343384
3,2332_13,150.244293
4,2332_14,158.064255
...,...,...
787,2831_14,161.648361
788,2831_15,164.460205
789,2831_16,169.642487
790,2831_17,170.824326


In [17]:
# formatted_predictions.to_csv("wh_xgboost_submission_2.csv", index = False)
# formatted_predictions.shape

In [22]:
submission = pd.read_csv(DATA_PATH + "gen2_test_solution_template.csv")
submission.shape

(589, 2)

In [25]:
sub_rf = submission.merge(formatted_predictions_rf, on="gen2id_age", how="inner").drop(columns=["SHgt_cm_x"]).rename(columns={"SHgt_cm_y": "SHgt_cm"})# .to_csv("wh_xgboost_submission_2.csv", index = False)
sub_xgb = submission.merge(formatted_predictions_xgb, on="gen2id_age", how="inner").drop(columns=["SHgt_cm_x"]).rename(columns={"SHgt_cm_y": "SHgt_cm"})# .to_csv("wh_xgboost_submission_2.csv", index = False)

In [30]:
final = sub_rf.copy()
final["SHgt_cm"] = 0.5 * (final["SHgt_cm"] + sub_xgb["SHgt_cm"])
final.to_csv("wh_stacked_submission.csv", index = False)

In [34]:
train_data

Unnamed: 0,parent_height_age_18,parent_velocity_4_5,parent_height_age_11,parent_height_age_16,child_height_age_0,parent_height_age_8,parent_velocity_15_16,parent_velocity_3_4,parent_height_age_4,parent_height_age_10,...,parent_height_age_13,parent_velocity_10_11,parent_velocity_16_17,parent_height_age_19,parent_height_age_1,parent_height_age_3,parent_height_age_20,parent_velocity_6_7,parent_velocity_18_19,child_velocity_3_4
0,187.478546,7.707125,152.345734,185.229461,60.161104,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,9.313594
1,187.478546,7.707125,152.345734,185.229461,72.615222,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,4.225584
2,187.478546,7.707125,152.345734,185.229461,68.754287,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,5.055503
3,187.478546,7.707125,152.345734,185.229461,58.894508,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,5.664513
4,187.478546,7.707125,152.345734,185.229461,61.609136,135.152449,1.913254,8.263564,106.702972,143.754477,...,168.923287,8.591257,1.336591,187.292054,77.759349,98.439409,191.110969,7.953222,-0.186491,8.721616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,182.656130,5.997087,146.848890,170.672888,63.062708,131.094769,1.137305,7.850813,106.763902,141.510714,...,163.232130,5.338175,-1.139493,169.941494,72.197089,98.913089,180.627368,5.393826,-12.714636,6.532380
188,182.674533,5.541563,144.362457,173.282151,62.050382,129.250975,1.687749,7.352158,105.690654,139.634804,...,161.504147,4.727653,-0.035486,173.512205,73.863204,98.338496,183.263540,5.568030,-9.162328,7.803524
189,182.692935,5.086040,141.876024,175.891414,58.368564,127.407181,2.238194,6.853503,104.617406,137.758894,...,159.776165,4.117130,1.068521,177.082916,75.529320,97.763903,185.899711,5.742234,-5.610020,6.716232
190,176.571363,5.777911,147.926120,168.269721,60.459271,127.413688,2.265626,12.768845,104.578596,138.105427,...,160.624364,9.820694,-2.238160,166.097378,75.965851,91.809751,178.513879,5.733060,-10.473984,10.341753
