<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/Baby_birthweight_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install lime
%pip install shap

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=d21112f2deae08eb773ce9ab94f6df4c4878053fd0fb9fe1ba66d51986673f89
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [19]:
import pandas as pd
import numpy as np

# Load Data
df = pd.read_csv('cleaned_data(1).csv')
print("Dataset loaded for final feature engineering push.")

# 1. Advanced BMI-related features
height_m = df['height'] * 0.0254
weight_kg = df['weight'] * 0.453592
df['bmi'] = weight_kg / (height_m**2)

# 2. High-Order Interaction Features
df['age_x_gestation'] = df['age'] * df['gestation']
df['weight_x_gestation'] = df['weight'] * df['gestation']

# 3. Create a 'Health Score'
# We'll create a simple composite score. This is a common feature engineering technique.
# We'll scale the inputs to this score to give them equal weighting.
df['scaled_age'] = (df['age'] - df['age'].min()) / (df['age'].max() - df['age'].min())
df['scaled_bmi'] = (df['bmi'] - df['bmi'].min()) / (df['bmi'].max() - df['bmi'].min())
df['health_score'] = df['scaled_age'] + (1 - df['scaled_bmi']) + (1 - df['smoke'])

# Prepare Final Dataset
X_engineered = df.drop(['case', 'bwt', 'scaled_age', 'scaled_bmi'], axis=1) # Drop the scaled columns after use
y = df['bwt']
MAX_BWT = y.max() # For PSNR calculation

# One-hot encode the original categorical features
X_engineered = pd.get_dummies(X_engineered, columns=['parity', 'smoke'], drop_first=True)

print("Final feature engineering complete.")
print(f"Shape of the new feature set: {X_engineered.shape}")

Dataset loaded for final feature engineering push.
Final feature engineering complete.
Shape of the new feature set: (1236, 11)


In [20]:
!pip install catboost lightgbm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tqdm.notebook import tqdm

# Split and Scale the Engineered Data
numerical_features = X_engineered.columns[~X_engineered.columns.str.contains('parity_|smoke_')]

X_train, X_test, y_train, y_test = train_test_split(X_engineered, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Define All Models
models = {
    "LGBM Regressor": LGBMRegressor(random_state=42),
    "XGBoost Regressor": XGBRegressor(random_state=42),
    "CatBoost Regressor": CatBoostRegressor(random_state=42, verbose=0),
    "Random Forest": RandomForestRegressor(random_state=42),
    "DNN": Sequential([Dense(128, activation='relu', input_shape=[X_train.shape[1]]), Dropout(0.2), Dense(64, 'relu'), Dense(1)]),
    "MLP": Sequential([Dense(64, activation='relu', input_shape=[X_train.shape[1]]), Dropout(0.3), Dense(32, 'relu'), Dense(1)])
}
results = {}

# Train and Evaluate All Models
print("\nTraining all models on the final, feature-engineered dataset...")
for name, model in tqdm(models.items(), desc="Training All Models"):
    if name in ["DNN", "MLP"]:
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)])
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test).flatten()
    results[name] = (y_test.values, y_pred)

print("\nAll models trained successfully.")


Training all models on the final, feature-engineered dataset...


Training All Models:   0%|          | 0/6 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1224
[LightGBM] [Info] Number of data points in the train set: 988, number of used features: 10
[LightGBM] [Info] Start training from score 120.060729
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

All models trained successfully.


In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Helper functions for custom metrics
def calculate_psnr(y_true, y_pred, max_val):
    mse = np.mean((y_true - y_pred) ** 2)
    return 10 * np.log10((max_val**2) / mse) if mse > 0 else float('inf')

def calculate_snr(y_true, y_pred):
    signal_variance = np.var(y_true)
    noise_variance = np.var(y_true - y_pred)
    return 10 * np.log10(signal_variance / noise_variance) if noise_variance > 0 else float('inf')

# Calculate Final Metrics for All Models
final_metrics = {}
for name, (y_true, y_pred) in results.items():
    final_metrics[name] = {
        'R2 Score': r2_score(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mean_squared_error(y_true, y_pred),
        'PSNR': calculate_psnr(y_true, y_pred, MAX_BWT),
        'SNR': calculate_snr(y_true, y_pred)
    }

# Final Comparison
results_df = pd.DataFrame(final_metrics).T
print("\n--- Final Model Performance Comparison (with Final Feature Engineering) ---")
print(results_df.sort_values(by='R2 Score', ascending=False))


--- Final Model Performance Comparison (with Final Feature Engineering) ---
                    R2 Score        MAE         MSE       PSNR       SNR
DNN                 0.297069  13.267066  277.205505  20.482235  1.674343
MLP                 0.296505  13.341362  277.427979  20.478751  1.613214
Random Forest       0.275746  13.477903  285.614342  20.352453  1.501623
CatBoost Regressor  0.261009  13.730483  291.426287  20.264966  1.391220
LGBM Regressor      0.194490  14.124388  317.658298  19.890651  1.005325
XGBoost Regressor   0.115701  14.573599  348.729309  19.485369  0.632713
