<a href="https://colab.research.google.com/github/tousifo/ml_notebooks/blob/main/Birth_Weight_Prediction_using_DL_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading & Comprehensive EDA

In [1]:
# ============================================
# SNIPPET 1: DATA LOADING & COMPREHENSIVE EDA
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import skew, kurtosis
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

# Configure visualization settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# ============================================
# 1. LOAD DATA
# ============================================

# Load the dataset
df = pd.read_csv('CBWDB.csv')

print("="*80)
print("BIRTH WEIGHT PREDICTION - DEEP LEARNING APPROACH")
print("="*80)
print(f"\n📊 Dataset Shape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"🎯 Target Variable: BWt(kg) - Birth Weight in Kilograms")

# ============================================
# 2. INITIAL DATA INSPECTION
# ============================================

print("\n" + "="*80)
print("DATASET OVERVIEW")
print("="*80)

# Basic info
print("\n📋 Dataset Info:")
print(df.info())

print("\n📊 First 5 Rows:")
print(df.head())

print("\n📈 Statistical Summary:")
print(df.describe(include='all'))

# ============================================
# 3. TARGET VARIABLE ANALYSIS
# ============================================

print("\n" + "="*80)
print("TARGET VARIABLE ANALYSIS: BWt(kg)")
print("="*80)

target_stats = {
    'Mean': df['BWt(kg)'].mean(),
    'Median': df['BWt(kg)'].median(),
    'Std Dev': df['BWt(kg)'].std(),
    'Min': df['BWt(kg)'].min(),
    'Max': df['BWt(kg)'].max(),
    'Skewness': skew(df['BWt(kg)'].dropna()),
    'Kurtosis': kurtosis(df['BWt(kg)'].dropna())
}

for key, value in target_stats.items():
    print(f"{key:12}: {value:.4f}")

# Categorize birth weight for analysis (WHO standards)
def categorize_birth_weight(weight):
    if pd.isna(weight):
        return 'Unknown'
    elif weight < 2.5:
        return 'Low'
    elif weight <= 4.0:
        return 'Normal'
    else:
        return 'High'

df['BW_Category'] = df['BWt(kg)'].apply(categorize_birth_weight)
print("\n🏷️ Birth Weight Distribution:")
print(df['BW_Category'].value_counts())
print(df['BW_Category'].value_counts(normalize=True) * 100)

# ============================================
# 4. MISSING VALUES ANALYSIS
# ============================================

print("\n" + "="*80)
print("MISSING VALUES ANALYSIS")
print("="*80)

missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df)) * 100
}).sort_values('Missing_Percentage', ascending=False)

print("\n🔍 Missing Values Summary:")
print(missing_df[missing_df['Missing_Count'] > 0])

# Missing patterns analysis
def analyze_missing_patterns(df):
    """Analyze patterns in missing data"""
    missing_cols = df.columns[df.isnull().any()].tolist()

    if len(missing_cols) > 0:
        print("\n📊 Missing Value Patterns:")

        # Check if missingness is related to other variables
        for col in missing_cols[:5]:  # Analyze top 5 columns with missing values
            print(f"\n  {col}:")
            # Check correlation with categorical variables
            if 'Term/Preterm' in df.columns:
                term_missing = df.groupby('Term/Preterm')[col].apply(lambda x: x.isnull().sum())
                print(f"    By Term/Preterm: {dict(term_missing)}")

analyze_missing_patterns(df)

# ============================================
# 5. DATA TYPE ANALYSIS
# ============================================

print("\n" + "="*80)
print("DATA TYPE ANALYSIS")
print("="*80)

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

print(f"\n📝 Categorical Variables ({len(categorical_cols)}): {categorical_cols}")
print(f"\n📊 Numerical Variables ({len(numerical_cols)}): {numerical_cols}")

# Unique values in categorical columns
print("\n🔤 Unique Values in Categorical Columns:")
for col in categorical_cols:
    unique_vals = df[col].nunique()
    print(f"  {col:15}: {unique_vals:3} unique values - {df[col].unique()[:5].tolist()}")

# ============================================
# 6. CORRELATION ANALYSIS
# ============================================

print("\n" + "="*80)
print("CORRELATION ANALYSIS WITH TARGET")
print("="*80)

# Calculate correlations with target
correlations = df[numerical_cols].corr()['BWt(kg)'].sort_values(ascending=False)
print("\n🔗 Feature Correlations with Birth Weight:")
for feature, corr in correlations.items():
    if feature != 'BWt(kg)':
        print(f"  {feature:15}: {corr:+.4f}")

# ============================================
# 7. VISUALIZATIONS
# ============================================

# Create comprehensive visualization dashboard
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=(
        'Birth Weight Distribution', 'Birth Weight by Category', 'Missing Values Heatmap',
        'Correlation Matrix', 'Birth Weight vs Mother Weight', 'Birth Weight vs Age',
        'Blood Pressure Relationship', 'Hemoglobin Levels', 'Feature Distributions'
    ),
    specs=[
        [{'type': 'histogram'}, {'type': 'box'}, {'type': 'heatmap'}],
        [{'type': 'heatmap'}, {'type': 'scatter'}, {'type': 'scatter'}],
        [{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'violin'}]
    ]
)

# 1. Birth Weight Distribution
fig.add_trace(
    go.Histogram(x=df['BWt(kg)'].dropna(), nbinsx=30, name='Birth Weight'),
    row=1, col=1
)

# 2. Birth Weight by Category
for category in df['BW_Category'].unique():
    if category != 'Unknown':
        fig.add_trace(
            go.Box(y=df[df['BW_Category']==category]['BWt(kg)'], name=category),
            row=1, col=2
        )

# 3. Missing Values Heatmap
missing_matrix = df.isnull().astype(int)
fig.add_trace(
    go.Heatmap(z=missing_matrix.T, colorscale='RdBu', showscale=False),
    row=1, col=3
)

# 4. Correlation Matrix
corr_matrix = df[numerical_cols].corr()
fig.add_trace(
    go.Heatmap(z=corr_matrix, colorscale='RdBu', zmid=0),
    row=2, col=1
)

# 5. Birth Weight vs Mother's Final Weight
fig.add_trace(
    go.Scatter(x=df['FWt(kg)'], y=df['BWt(kg)'], mode='markers',
               marker=dict(size=5, opacity=0.6)),
    row=2, col=2
)

# 6. Birth Weight vs Age
fig.add_trace(
    go.Scatter(x=df['Age(years)'], y=df['BWt(kg)'], mode='markers',
               marker=dict(size=5, opacity=0.6)),
    row=2, col=3
)

# 7. Blood Pressure Relationship
fig.add_trace(
    go.Scatter(x=df['FBP_sys'], y=df['BWt(kg)'], mode='markers',
               marker=dict(size=5, opacity=0.6)),
    row=3, col=1
)

# 8. Hemoglobin Levels
fig.add_trace(
    go.Scatter(x=df['FHb(gm%)'], y=df['BWt(kg)'], mode='markers',
               marker=dict(size=5, opacity=0.6)),
    row=3, col=2
)

# Update layout
fig.update_layout(height=1200, showlegend=False, title_text="Comprehensive EDA Dashboard")
fig.show()

# ============================================
# 8. STATISTICAL TESTS
# ============================================

print("\n" + "="*80)
print("STATISTICAL TESTS")
print("="*80)

# Normality test for target variable
statistic, p_value = stats.shapiro(df['BWt(kg)'].dropna())
print(f"\n📊 Shapiro-Wilk Test for Birth Weight Normality:")
print(f"  Statistic: {statistic:.6f}")
print(f"  P-value: {p_value:.6f}")
print(f"  Result: {'Normal' if p_value > 0.05 else 'Not Normal'} distribution")

# ============================================
# 9. FEATURE INSIGHTS
# ============================================

print("\n" + "="*80)
print("KEY INSIGHTS FOR MODEL BUILDING")
print("="*80)

insights = """
🔍 Key Findings:
1. Target Distribution: Birth weight shows {'normal' if p_value > 0.05 else 'skewed'} distribution
2. Missing Values: Significant missing data in some features requiring intelligent imputation
3. Strong Correlations: Mother's weight and hemoglobin levels show strong correlation with birth weight
4. Categorical Features: Need encoding for SEC, Bgroup, Term/Preterm, Sex
5. Outliers: Some extreme values in birth weight that need investigation
6. Feature Engineering Opportunities:
   - BMI calculation from height and weight
   - Weight gain during pregnancy (FWt - Iwt)
   - Blood pressure ratios and changes
   - Interaction features between key predictors
"""
print(insights)

# Save processed insights
insights_dict = {
    'shape': df.shape,
    'missing_columns': missing_df[missing_df['Missing_Count'] > 0]['Column'].tolist(),
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'target_stats': target_stats,
    'top_correlations': correlations.head(6).to_dict(),
    'birth_weight_categories': df['BW_Category'].value_counts().to_dict()
}

print("\n✅ EDA Complete! Ready for preprocessing...")
print(f"📊 Dataset has {len(df)} samples with {len(numerical_cols)} numerical and {len(categorical_cols)} categorical features")
print(f"🎯 Target variable (BWt) ranges from {df['BWt(kg)'].min():.2f} to {df['BWt(kg)'].max():.2f} kg")

# Display final summary
print("\n" + "="*80)
print("NEXT STEPS")
print("="*80)
print("""
📋 Preprocessing Requirements:
1. Handle missing values with intelligent imputation
2. Engineer new features (BMI, weight gain, BP changes)
3. Encode categorical variables
4. Scale numerical features
5. Handle outliers appropriately
6. Create robust train/val/test splits
""")

BIRTH WEIGHT PREDICTION - DEEP LEARNING APPROACH

📊 Dataset Shape: 1800 rows × 19 columns
🎯 Target Variable: BWt(kg) - Birth Weight in Kilograms

DATASET OVERVIEW

📋 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SEC           1150 non-null   object 
 1   Age(years)    1786 non-null   float64
 2   Height(cm)    1527 non-null   float64
 3   Bgroup        798 non-null    object 
 4   Parity        1767 non-null   float64
 5   ANC           1781 non-null   float64
 6   Iwt(kg)       1751 non-null   float64
 7   FWt(kg)       1739 non-null   float64
 8   IBP_sys       1760 non-null   float64
 9   IBP_dias      1760 non-null   float64
 10  FBP_sys       1749 non-null   float64
 11  FBP_dias      1748 non-null   float64
 12  IHb(gm%)      1395 non-null   float64
 13  FHb(gm%)      1393 non-null   float64
 14  BS(RBS)       608 no


STATISTICAL TESTS

📊 Shapiro-Wilk Test for Birth Weight Normality:
  Statistic: 0.942249
  P-value: 0.000000
  Result: Not Normal distribution

KEY INSIGHTS FOR MODEL BUILDING

🔍 Key Findings:
1. Target Distribution: Birth weight shows {'normal' if p_value > 0.05 else 'skewed'} distribution
2. Missing Values: Significant missing data in some features requiring intelligent imputation
3. Strong Correlations: Mother's weight and hemoglobin levels show strong correlation with birth weight
4. Categorical Features: Need encoding for SEC, Bgroup, Term/Preterm, Sex
5. Outliers: Some extreme values in birth weight that need investigation
6. Feature Engineering Opportunities:
   - BMI calculation from height and weight
   - Weight gain during pregnancy (FWt - Iwt)
   - Blood pressure ratios and changes
   - Interaction features between key predictors


✅ EDA Complete! Ready for preprocessing...
📊 Dataset has 1800 samples with 15 numerical and 5 categorical features
🎯 Target variable (BWt) range

# Advanced Data Preprocessing

In [4]:
# ============================================
# SNIPPET 2 (REVISED): ADVANCED DATA UTILIZATION & PREPROCESSING
# ============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("INNOVATIVE DATA UTILIZATION STRATEGY")
print("="*80)

# Load the data
df = pd.read_csv('CBWDB.csv')
print(f"\n📊 Initial Dataset: {df.shape[0]} samples")

# ============================================
# 1. UNDERSTAND THE MISSING PATTERN
# ============================================

print("\n" + "="*60)
print("1. ANALYZING MISSING DATA PATTERNS")
print("="*60)

# Analyze BWt and LNH relationship
has_target = df['BWt(kg)'].notna()
print(f"\n🎯 Target (BWt) available: {has_target.sum()} samples")
print(f"🎯 Target (BWt) missing: {(~has_target).sum()} samples")

# Check LNH pattern
print(f"\n🔍 LNH Analysis:")
print(f"   LNH available: {df['LNH'].notna().sum()}")
print(f"   LNH missing: {df['LNH'].isna().sum()}")
print(f"   BWt and LNH both present: {(df['BWt(kg)'].notna() & df['LNH'].notna()).sum()}")
print(f"   BWt and LNH both missing: {(df['BWt(kg)'].isna() & df['LNH'].isna()).sum()}")

# Correlation between BWt and LNH for available data
if df[['BWt(kg)', 'LNH']].notna().all().any():
    corr_bwt_lnh = df[['BWt(kg)', 'LNH']].corr().iloc[0, 1]
    print(f"   Correlation BWt-LNH: {corr_bwt_lnh:.3f}")

# ============================================
# 2. MULTI-STRATEGY APPROACH
# ============================================

print("\n" + "="*60)
print("2. IMPLEMENTING MULTI-STRATEGY DATA UTILIZATION")
print("="*60)

# Fix data quality issues first
df['Term/Preterm'] = df['Term/Preterm'].replace({'t': 'T', 'PT': 'PT', 'T': 'T'})
df['Sex'] = df['Sex'].replace({'T': np.nan})
df['Bgroup'] = df['Bgroup'].replace('NIL', np.nan)

# Separate data into labeled and unlabeled
df_labeled = df[df['BWt(kg)'].notna()].copy()
df_unlabeled = df[df['BWt(kg)'].isna()].copy()

print(f"\n📊 Data Split:")
print(f"   Labeled (with BWt): {len(df_labeled)} samples")
print(f"   Unlabeled (no BWt): {len(df_unlabeled)} samples")

# ============================================
# 3. FEATURE ENGINEERING FOR ALL DATA
# ============================================

print("\n" + "="*60)
print("3. FEATURE ENGINEERING FOR ALL SAMPLES")
print("="*60)

def create_all_features(df):
    """Create features for all samples"""
    df = df.copy()

    # Basic features - handle missing values gracefully
    df['BMI_initial'] = df['Iwt(kg)'] / ((df['Height(cm)'] / 100) ** 2)
    df['BMI_final'] = df['FWt(kg)'] / ((df['Height(cm)'] / 100) ** 2)
    df['Weight_Gain'] = df['FWt(kg)'] - df['Iwt(kg)']
    df['Weight_Gain_Percent'] = (df['Weight_Gain'] / df['Iwt(kg)']) * 100

    # BP features
    df['BP_Mean_Initial'] = (df['IBP_sys'] + 2 * df['IBP_dias']) / 3
    df['BP_Mean_Final'] = (df['FBP_sys'] + 2 * df['FBP_dias']) / 3
    df['BP_Change_Sys'] = df['FBP_sys'] - df['IBP_sys']
    df['BP_Change_Dias'] = df['FBP_dias'] - df['IBP_dias']
    df['Pulse_Pressure_Initial'] = df['IBP_sys'] - df['IBP_dias']
    df['Pulse_Pressure_Final'] = df['FBP_sys'] - df['FBP_dias']

    # Hemoglobin
    df['Hb_Change'] = df['FHb(gm%)'] - df['IHb(gm%)']

    # Risk indicators
    df['Young_Mother'] = (df['Age(years)'] < 20).astype(float)
    df['Old_Mother'] = (df['Age(years)'] > 35).astype(float)
    df['First_Pregnancy'] = (df['Parity'] == 0).astype(float)
    df['Multiple_Pregnancy'] = (df['Parity'] > 2).astype(float)
    df['Adequate_ANC'] = (df['ANC'] >= 4).astype(float)

    # Handle missing values in risk indicators
    for col in ['Young_Mother', 'Old_Mother', 'First_Pregnancy', 'Multiple_Pregnancy', 'Adequate_ANC']:
        df[col] = df[col].fillna(0)

    return df

# Apply feature engineering to all data
df_all = create_all_features(df)
print(f"✅ Created features for all {len(df_all)} samples")

# ============================================
# 4. INTELLIGENT IMPUTATION FOR FEATURES
# ============================================

print("\n" + "="*60)
print("4. INTELLIGENT FEATURE IMPUTATION")
print("="*60)

# Select features with reasonable completeness
base_features = ['Age(years)', 'Height(cm)', 'Parity', 'ANC', 'Iwt(kg)', 'FWt(kg)',
                 'IBP_sys', 'IBP_dias', 'FBP_sys', 'FBP_dias', 'IHb(gm%)', 'FHb(gm%)']

engineered_features = ['Weight_Gain', 'BMI_initial', 'BMI_final', 'BP_Mean_Final',
                       'BP_Change_Sys', 'BP_Change_Dias', 'Pulse_Pressure_Final',
                       'Young_Mother', 'Old_Mother', 'First_Pregnancy', 'Adequate_ANC']

all_features = base_features + engineered_features

# Check missing percentages
print("\n📊 Missing percentages for key features:")
cols_to_use = []
for col in all_features:
    if col in df_all.columns:
        missing_pct = df_all[col].isna().sum() / len(df_all) * 100
        if missing_pct < 50:  # Only use features with <50% missing
            cols_to_use.append(col)
            if missing_pct > 0:
                print(f"   {col:25}: {missing_pct:.1f}% missing")

print(f"\n✅ Using {len(cols_to_use)} features with <50% missing")

# Handle categorical variables
categorical_cols = ['SEC', 'Term/Preterm', 'Sex']
for col in categorical_cols:
    if col in df_all.columns:
        # Fill missing with 'Unknown'
        df_all[col] = df_all[col].fillna('Unknown')
        # Convert to numeric
        df_all[col + '_encoded'] = pd.Categorical(df_all[col]).codes
        cols_to_use.append(col + '_encoded')

# Use IterativeImputer for sophisticated imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

print("\n🔧 Applying iterative imputation...")
imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,
    random_state=42,
    verbose=0
)

# Impute features
df_imputed = df_all.copy()
df_imputed[cols_to_use] = imputer.fit_transform(df_all[cols_to_use])

print("✅ Feature imputation complete")

# ============================================
# 5. PSEUDO-LABELING WITH CONFIDENCE
# ============================================

print("\n" + "="*60)
print("5. CONFIDENCE-BASED PSEUDO-LABELING")
print("="*60)

# Prepare data
X_labeled = df_imputed[has_target][cols_to_use]
y_labeled = df_imputed[has_target]['BWt(kg)']
X_unlabeled = df_imputed[~has_target][cols_to_use]

print(f"\n📊 Data for pseudo-labeling:")
print(f"   Labeled samples: {len(X_labeled)}")
print(f"   Unlabeled samples: {len(X_unlabeled)}")

# Split labeled data for validation
X_train_init, X_val_init, y_train_init, y_val_init = train_test_split(
    X_labeled, y_labeled, test_size=0.2, random_state=42
)

# Train ensemble of models
print("\n🔧 Training ensemble for pseudo-labeling...")
models = {
    'rf': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    'gbm': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
}

predictions = []
val_scores = []

for name, model in models.items():
    model.fit(X_train_init, y_train_init)

    # Validation score
    val_pred = model.predict(X_val_init)
    val_mse = np.mean((val_pred - y_val_init) ** 2)
    val_scores.append(val_mse)
    print(f"   {name}: Validation RMSE = {np.sqrt(val_mse):.4f}")

    # Predict on unlabeled
    if len(X_unlabeled) > 0:
        pred = model.predict(X_unlabeled)
        predictions.append(pred)

# Generate pseudo-labels with confidence
pseudo_labels = None
high_confidence_indices = []

if len(predictions) > 0 and len(X_unlabeled) > 0:
    # Average predictions
    pseudo_labels = np.mean(predictions, axis=0)

    # Calculate prediction variance as uncertainty
    if len(predictions) > 1:
        prediction_std = np.std(predictions, axis=0)

        # Select high-confidence predictions (low std)
        confidence_threshold = np.percentile(prediction_std, 25)  # Top 25% most confident
        high_confidence_mask = prediction_std < confidence_threshold

        # Also ensure predictions are in reasonable range
        reasonable_mask = (pseudo_labels >= 0.5) & (pseudo_labels <= 4.5)
        high_confidence_mask = high_confidence_mask & reasonable_mask

        high_confidence_indices = df_imputed[~has_target].index[high_confidence_mask].tolist()

        print(f"\n✅ Pseudo-labeling results:")
        print(f"   High confidence samples: {len(high_confidence_indices)}")
        print(f"   Confidence threshold (std): {confidence_threshold:.4f}")

# ============================================
# 6. CREATE ENHANCED DATASET
# ============================================

print("\n" + "="*60)
print("6. CREATING ENHANCED DATASET")
print("="*60)

# Add pseudo-labels to dataset
df_enhanced = df_imputed.copy()
df_enhanced['BWt_final'] = df_enhanced['BWt(kg)'].copy()
df_enhanced['is_pseudo'] = 0
df_enhanced['sample_weight'] = 1.0

# Add high-confidence pseudo-labels
if len(high_confidence_indices) > 0:
    df_enhanced.loc[high_confidence_indices, 'BWt_final'] = pseudo_labels[high_confidence_mask]
    df_enhanced.loc[high_confidence_indices, 'is_pseudo'] = 1
    df_enhanced.loc[high_confidence_indices, 'sample_weight'] = 0.8  # Slightly lower weight

# Final statistics
final_samples = df_enhanced['BWt_final'].notna().sum()
original_samples = has_target.sum()
added_samples = final_samples - original_samples

print(f"\n📊 Enhanced Dataset Statistics:")
print(f"   Original labeled: {original_samples} samples")
print(f"   Pseudo-labeled: {added_samples} samples")
print(f"   Total usable: {final_samples} samples")
print(f"   Data utilization: {final_samples/len(df)*100:.1f}% (vs original {original_samples/len(df)*100:.1f}%)")

# ============================================
# 7. FINAL PREPROCESSING AND SPLITTING
# ============================================

print("\n" + "="*60)
print("7. FINAL PREPROCESSING")
print("="*60)

# Select final samples with targets
df_final = df_enhanced[df_enhanced['BWt_final'].notna()].copy()

# Prepare features and target
X = df_final[cols_to_use]
y = df_final['BWt_final']
sample_weights = df_final['sample_weight']
is_pseudo = df_final['is_pseudo']

print(f"\n📊 Final dataset shape:")
print(f"   Samples: {len(X)}")
print(f"   Features: {X.shape[1]}")
print(f"   Target range: [{y.min():.2f}, {y.max():.2f}]")
print(f"   Mean target: {y.mean():.3f} ± {y.std():.3f}")

# Create stratified splits
# First split: separate test set
X_temp, X_test, y_temp, y_test, w_temp, w_test, pseudo_temp, pseudo_test = train_test_split(
    X, y, sample_weights, is_pseudo,
    test_size=0.15,
    random_state=42,
    stratify=is_pseudo if is_pseudo.sum() > 0 else None
)

# Second split: train and validation
X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(
    X_temp, y_temp, w_temp,
    test_size=0.176,
    random_state=42
)

print(f"\n📊 Data splits:")
print(f"   Train: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"   Val: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"   Test: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Save enhanced dataset
np.savez_compressed('enhanced_data.npz',
                   X_train=X_train_scaled,
                   X_val=X_val_scaled,
                   X_test=X_test_scaled,
                   y_train=y_train.values,
                   y_val=y_val.values,
                   y_test=y_test.values,
                   w_train=w_train.values,
                   w_val=w_val.values,
                   w_test=w_test.values,
                   feature_names=cols_to_use)

print("\n" + "="*80)
print("PREPROCESSING COMPLETE!")
print("="*80)
print(f"\n✅ Successfully enhanced dataset using pseudo-labeling")
print(f"📊 Final: {final_samples} samples with {len(cols_to_use)} features")
print(f"📈 Improvement: {(final_samples-original_samples)/original_samples*100:.1f}% more data utilized")
print(f"💾 Data saved to 'enhanced_data.npz'")

INNOVATIVE DATA UTILIZATION STRATEGY

📊 Initial Dataset: 1800 samples

1. ANALYZING MISSING DATA PATTERNS

🎯 Target (BWt) available: 1072 samples
🎯 Target (BWt) missing: 728 samples

🔍 LNH Analysis:
   LNH available: 1072
   LNH missing: 728
   BWt and LNH both present: 1072
   BWt and LNH both missing: 728

2. IMPLEMENTING MULTI-STRATEGY DATA UTILIZATION

📊 Data Split:
   Labeled (with BWt): 1072 samples
   Unlabeled (no BWt): 728 samples

3. FEATURE ENGINEERING FOR ALL SAMPLES
✅ Created features for all 1800 samples

4. INTELLIGENT FEATURE IMPUTATION

📊 Missing percentages for key features:
   Age(years)               : 0.8% missing
   Height(cm)               : 15.2% missing
   Parity                   : 1.8% missing
   ANC                      : 1.1% missing
   Iwt(kg)                  : 2.7% missing
   FWt(kg)                  : 3.4% missing
   IBP_sys                  : 2.2% missing
   IBP_dias                 : 2.2% missing
   FBP_sys                  : 2.8% missing
   FBP_dias 