# Baseline Model

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling   import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline

In [2]:
# Read the input CSV into the working frame `df` used by the cells below.
DATA_PATH = 'u5mr_clean.csv'
df = pd.read_csv(DATA_PATH)

In [15]:
# Column groups, declared up front so the casts further down read as one step each.

# Continuous measurements, stored as float64.
float_vars = [
    'Respondents current age',
    'Birth weight in kilograms (3 decimals)',
    'Childs weight in kilograms (1 decimal)',
    'Childs height in centimeters (1 decimal)',
    'Months of breastfeeding',
    'When child put to breast',
    'Height/Age standard deviation (new WHO)',
    'Weight/Age standard deviation (new WHO)',
    'Weight/Height standard deviation (new WHO)',
]

# Counts and durations, stored with pandas' nullable integer dtype so that
# missing entries can coexist with whole numbers.
int_vars = [
    'Number of household members (listed)',
    'Birth order number',
    'Preceding birth interval (months)',
    'Succeeding birth interval (months)',
    'Duration of pregnancy in months',
    'Timing of 1st antenatal check (months)',
    'Number of antenatal visits during pregnancy',
    'Entries in pregnancy and postnatal care roster',
    'Minutes to nearest healthcare facility',
    'Number of tetanus injections before birth',
    'Number of tetanus injections before pregnancy',
]

# Nominal / ordinal codes, stored as pandas categoricals.
cat_vars = [
    'Region',
    'Type of place of residence',
    'Highest educational level',
    'Religion',
    'Ethnicity',
    'Wealth index combined',
    'Type of cooking fuel (smoke exposure, indoor air pollution)',
    'Sex of child',
    'Place of delivery',
    'Size of child at birth',
    'Has health card and or other vaccination document',
    'Received BCG',
    'Received POLIO 0',
    'Received POLIO 1',
    'Received POLIO 2',
    'Received POLIO 3',
    'Received MEASLES 1',
    'Received MEASLES 2',
    'Received inactivated polio (IPV)',
    'Received Pentavalent 1',
    'Received Pentavalent 2',
    'Received Pentavalent 3',
    'Received Pneumococcal 1',
    'Received Pneumococcal 2',
    'Received Pneumococcal 3',
    'Received Rotavirus 1',
    'Received Rotavirus 2',
    'Place where most vaccinations were received',
    'Yellow fever vaccine',
    'Currently breastfeeding',
    'Given child anything other than breast milk',
    'In contact with someone with cough or TB',
    'Source of drinking water',
    'Main floor material',
    'Visited health facility last 12 months',
    'Getting medical help for self: distance to health facility',
    'Mode of transportation to nearest healthcare facility',
]


def _parse_numeric_columns(frame, columns, dtype):
    """Return ``frame[columns]`` parsed as numbers and cast to ``dtype``.

    Entries that cannot be parsed become missing values (``errors="coerce"``)
    rather than raising.
    """
    parsed = frame[columns].apply(pd.to_numeric, errors="coerce")
    return parsed.astype(dtype)


# Rebind `df` to a copy before changing any dtypes.
df = df.copy()

# CASEID is an identifier: keep it as an object column.
df['CASEID'] = df['CASEID'].astype('object')

df[float_vars] = _parse_numeric_columns(df, float_vars, 'float64')
df[int_vars] = _parse_numeric_columns(df, int_vars, 'Int64')

categorical_block = df[cat_vars].astype('category')
df[cat_vars] = categorical_block

In [16]:
# Columns kept out of the feature matrix: the row identifier plus the three
# mortality columns that are used as prediction targets further down.
excluded_cols = ['CASEID', 'under5_mortality', 'infant_mortality', 'neonatal_mortality']
X = df.drop(columns=excluded_cols)

# Partition the remaining features by dtype: object/category columns on one
# side, everything else on the other.
categorical_dtypes = ['object', 'category']
cat_cols = X.select_dtypes(include=categorical_dtypes).columns
num_cols = X.select_dtypes(exclude=categorical_dtypes).columns

In [18]:

# Show the columns selected as categorical (the ones passed to the one-hot encoder).
cat_cols

Index(['Region', 'Type of place of residence', 'Highest educational level',
       'Religion', 'Ethnicity', 'Wealth index combined',
       'Type of cooking fuel (smoke exposure, indoor air pollution)',
       'Sex of child', 'Place of delivery', 'Size of child at birth',
       'Has health card and or other vaccination document', 'Received BCG',
       'Received POLIO 1', 'Received POLIO 2', 'Received POLIO 3',
       'Received POLIO 0', 'Received MEASLES 1', 'Received MEASLES 2',
       'Received Pentavalent 1', 'Received Pentavalent 2',
       'Received Pentavalent 3', 'Received Pneumococcal 1',
       'Received Pneumococcal 2', 'Received Pneumococcal 3',
       'Received Rotavirus 1', 'Received inactivated polio (IPV)',
       'Received Rotavirus 2', 'Place where most vaccinations were received',
       'Yellow fever vaccine', 'Currently breastfeeding',
       'Given child anything other than breast milk',
       'In contact with someone with cough or TB', 'Source of drinking water

In [11]:
# Print the per-column dtype listing for every column.
# The first positional argument of DataFrame.info is `verbose`, so the former
# `df.info(50)` only worked because 50 is truthy; the keyword states the intent.
# NOTE(review): the recorded output below was produced at execution count 11,
# before the dtype casts (execution count 15), so it still lists the
# categorical columns as float64. Re-run via Restart & Run All to refresh it.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19530 entries, 0 to 19529
Data columns (total 102 columns):
 #    Column                                                              Dtype  
---   ------                                                              -----  
 0    CASEID                                                              object 
 1    Respondents current age                                             float64
 2    Region                                                              float64
 3    Type of place of residence                                          float64
 4    Highest educational level                                           float64
 5    Religion                                                            float64
 6    Ethnicity                                                           float64
 7    Number of household members (listed)                                int64  
 8    Wealth index combined                                           

In [19]:
# Feature preparation by column group: standardise the numeric columns and
# expand each categorical column into indicator columns. Categories not seen
# during fitting are ignored at transform time instead of raising.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Baseline model: the preprocessing above followed by a logistic-regression
# classifier whose solver is allowed up to 1000 iterations.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000)),
]
baseline_pipe = Pipeline(pipeline_steps)

In [20]:
def baseline_logistic(X, y, target_name, test_size=0.2, random_state=42, pipeline=None):
    """Fit a logistic-regression baseline for one target and print its scores.

    The data is split into stratified train/test partitions, the pipeline is
    fitted on the training partition, and accuracy plus the per-class
    classification report for the held-out partition are printed.

    Parameters
    ----------
    X : feature table
        Passed to ``train_test_split`` and then to the pipeline.
    y : array-like
        Target labels aligned with ``X``; also used to stratify the split.
    target_name : str
        Label shown in the printed header.
    test_size : float, default 0.2
        Held-out fraction forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    pipeline : estimator, optional
        Object exposing ``fit`` and ``predict``. When omitted, the notebook-level
        ``baseline_pipe`` is used.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The chosen pipeline is fitted in place. With the default, ``baseline_pipe``
    therefore holds the model for whichever target was evaluated last.
    """
    # Resolve the estimator at call time so the default follows the current
    # notebook-level pipeline.
    model = baseline_pipe if pipeline is None else pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n--- Baseline Logistic Regression: {target_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # zero_division=0 keeps the reported values for a class that is never
    # predicted at 0 without emitting an undefined-metric warning per call.
    print(classification_report(y_test, y_pred, zero_division=0))

In [21]:
# Evaluate the same baseline once per mortality outcome.
# NOTE(review): the recorded output below reports accuracy 1.0 with precision,
# recall and F1 of 1.00 on both classes for every target. Scores this perfect
# on a heavily imbalanced outcome usually mean a feature left in X encodes the
# outcome. TODO: audit X for columns recorded after or derived from the
# child's death before trusting these numbers.
for target_col, target_label in (
    ('under5_mortality', 'Under-5 Mortality'),
    ('infant_mortality', 'Infant Mortality'),
    ('neonatal_mortality', 'Neonatal Mortality'),
):
    baseline_logistic(X, df[target_col], target_label)


--- Baseline Logistic Regression: Under-5 Mortality ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3767
           1       1.00      1.00      1.00       139

    accuracy                           1.00      3906
   macro avg       1.00      1.00      1.00      3906
weighted avg       1.00      1.00      1.00      3906


--- Baseline Logistic Regression: Infant Mortality ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3870
           1       1.00      1.00      1.00        36

    accuracy                           1.00      3906
   macro avg       1.00      1.00      1.00      3906
weighted avg       1.00      1.00      1.00      3906


--- Baseline Logistic Regression: Neonatal Mortality ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3819
           1       1.00 