# Setup

In [None]:
!pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.3.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Downloading pyreadstat-1.3.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.3.2


In [None]:
import pandas as pd
import pyreadstat
import zipfile
import os
import numpy as np
import warnings
import re

warnings.filterwarnings('ignore')


# Extract Main Indicators

Purpose: Process DHS (Demographic and Health Survey) data to create regional-level indicators for maternal and child health services.

## A. Data DHS (2017)

### 1. Load Data



In [None]:
print("Preprocessing Data SDKI 2017...")

ibu_dta_path = 'IDIR71FL.DTA' # File ibu
anak_dta_path = 'IDKR71FL.DTA' # File anak
rt_dta_path = 'IDHR71FL.DTA'   # File Rumah Tangga
lahir_dta_path = 'IDBR71FL.DTA' # File Kelahiran

print("\nMembaca file data Stata (.DTA)...")

try:
    df_ibu, meta_ibu = pyreadstat.read_dta(ibu_dta_path)
    df_anak, meta_anak = pyreadstat.read_dta(anak_dta_path)
    df_rt, meta_rt = pyreadstat.read_dta(rt_dta_path)
    df_lahir, meta_lahir = pyreadstat.read_dta(lahir_dta_path)
    print("File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.")
except Exception as e:
    print(f"Error saat membaca file: {e}")
    exit()

df_ibu.columns = df_ibu.columns.str.lower()
df_anak.columns = df_anak.columns.str.lower()
df_rt.columns = df_rt.columns.str.lower()
df_lahir.columns = df_lahir.columns.str.lower()

print("\n--- Check Dataset ---")
print("Jumlah kolom di df_ibu:", len(df_ibu.columns))
print("Jumlah kolom di df_anak:", len(df_anak.columns))
print("Jumlah kolom di df_rt:", len(df_rt.columns))
print("Jumlah kolom di df_lahir:", len(df_lahir.columns))
print("-------------------------------------------------------")

Preprocessing Data SDKI 2017...

Membaca file data Stata (.DTA)...
File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.

--- Check Dataset ---
Jumlah kolom di df_ibu: 5491
Jumlah kolom di df_anak: 1678
Jumlah kolom di df_rt: 1283
Jumlah kolom di df_lahir: 1678
-------------------------------------------------------


### 2. Check Variable Name

In [None]:
print("\n--- Mencari Variabel Pendidikan ---")
kata_kunci = 'educa'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci in var.lower() or kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pendidikan ---
Nama Variabel: v106, Label: highest educational level
Nama Variabel: v107, Label: highest year of education
Nama Variabel: v133, Label: education in single years
Nama Variabel: v149, Label: educational attainment
Nama Variabel: awfacte, Label: all woman factor - educational
Nama Variabel: v701, Label: husband/partner's education level
Nama Variabel: v702, Label: husband/partner's highest year of education (at level in v701)
Nama Variabel: v715, Label: husband/partner's total number of years of education
Nama Variabel: v729, Label: husband/partner's educational attainment
Nama Variabel: s108, Label: highest educational level
Nama Variabel: s904, Label: partner's level of education
---------------------------------


In [None]:
print("\n--- Mencari Variabel Ukuran Bayi Lahir ---")

kata_kunci_1 = 'size'  # Kata kunci: ukuran
kata_kunci_2 = 'weight' # Kata kunci: berat

for var, label in meta_anak.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Ukuran Bayi Lahir ---
Nama Variabel: v005, Label: women's individual sample weight (6 decimals)
Nama Variabel: v419, Label: entries in height/weight roster
Nama Variabel: v437, Label: na - respondent's weight in kilograms (1 decimal)
Nama Variabel: v442, Label: na - weight/height percent ref. median (dhs)
Nama Variabel: v443, Label: na - weight/height percent ref. median (fog)
Nama Variabel: v444, Label: na - weight/height percent ref. median (who)
Nama Variabel: v444a, Label: weight/height standard deviation (dhs)
Nama Variabel: v447, Label: na - result of measurement - height/weight
Nama Variabel: m18, Label: size of child at birth
Nama Variabel: m19, Label: birth weight in kilograms (3 decimals)
Nama Variabel: m19a, Label: weight at birth/recall
Nama Variabel: hw2, Label: na - child's weight in kilograms (1 decimal)
Nama Variabel: hw7, Label: na - weight/age percentile
Nama Variabel: hw8, Label: na - weight/age standard deviation
Nama Variabel: hw9, Label: na -

In [None]:
print("\n--- Mencari Variabel Penolong Persalinan ---")
kata_kunci_1 = 'delivery'
kata_kunci_2 = 'birth'
kata_kunci_3 = 'attendant'

for var, label in meta_lahir.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Penolong Persalinan ---
Nama Variabel: bidx, Label: birth column number
Nama Variabel: v009, Label: respondent's month of birth
Nama Variabel: v010, Label: respondent's year of birth
Nama Variabel: v011, Label: date of birth (cmc)
Nama Variabel: v208, Label: births in last five years
Nama Variabel: v209, Label: births in past year
Nama Variabel: v210, Label: births in month of interview
Nama Variabel: v211, Label: date of first birth (cmc)
Nama Variabel: v212, Label: age of respondent at 1st birth
Nama Variabel: v221, Label: marriage to first birth interval (months)
Nama Variabel: v222, Label: last birth to interview (months)
Nama Variabel: v224, Label: entries in birth history
Nama Variabel: v237, Label: birth between last and interview
Nama Variabel: v238, Label: births in last three years
Nama Variabel: v244, Label: can women get pregnant after birth and before period
Nama Variabel: v401, Label: last birth a caesarean section
Nama Variabel: v468, Label: record 

In [None]:
print("\n--- Mencari Variabel Penolong Persalinan ---")
kata_kunci_1 = 'visits'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Penolong Persalinan ---
Nama Variabel: v027, Label: number of visits
Nama Variabel: m14_1, Label: number of antenatal visits during pregnancy
Nama Variabel: m14_2, Label: number of antenatal visits during pregnancy
Nama Variabel: m14_3, Label: number of antenatal visits during pregnancy
Nama Variabel: m14_4, Label: number of antenatal visits during pregnancy
Nama Variabel: m14_5, Label: number of antenatal visits during pregnancy
Nama Variabel: m14_6, Label: number of antenatal visits during pregnancy
Nama Variabel: v743d, Label: person who usually decides on visits to family or relatives
Nama Variabel: v804, Label: number of visits
Nama Variabel: s412ba_1, Label: number of antenatal visits - first 3 months
Nama Variabel: s412ba_2, Label: number of antenatal visits - first 3 months
Nama Variabel: s412ba_3, Label: number of antenatal visits - first 3 months
Nama Variabel: s412ba_4, Label: number of antenatal visits - first 3 months
Nama Variabel: s412ba_5, Label: n

In [None]:
print("\n--- Mencari Semua Variabel Bantuan Persalinan ---")
kata_kunci = 'assistance'

for var, label in meta_ibu.column_names_to_labels.items():
    # hanya cari di variabel yang berawalan 'm3' untuk menjaga relevansi
    if var.startswith('m3') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Semua Variabel Bantuan Persalinan ---
Nama Variabel: m3a_1, Label: assistance: general practitioner
Nama Variabel: m3a_2, Label: assistance: general practitioner
Nama Variabel: m3a_3, Label: assistance: general practitioner
Nama Variabel: m3a_4, Label: assistance: general practitioner
Nama Variabel: m3a_5, Label: assistance: general practitioner
Nama Variabel: m3a_6, Label: assistance: general practitioner
Nama Variabel: m3b_1, Label: assistance: obstetrician
Nama Variabel: m3b_2, Label: assistance: obstetrician
Nama Variabel: m3b_3, Label: assistance: obstetrician
Nama Variabel: m3b_4, Label: assistance: obstetrician
Nama Variabel: m3b_5, Label: assistance: obstetrician
Nama Variabel: m3b_6, Label: assistance: obstetrician
Nama Variabel: m3c_1, Label: assistance:  nurse
Nama Variabel: m3c_2, Label: assistance:  nurse
Nama Variabel: m3c_3, Label: assistance:  nurse
Nama Variabel: m3c_4, Label: assistance:  nurse
Nama Variabel: m3c_5, Label: assistance:  nurse
Nama Variabel

In [None]:
print("\n--- Mencari Variabel Pemeriksaan Nifas Ibu ---")
kata_kunci_1 = 'postnatal'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Nifas Ibu ---
Nama Variabel: m70_1, Label: baby postnatal check within 2 months
Nama Variabel: m70_2, Label: baby postnatal check within 2 months
Nama Variabel: m70_3, Label: baby postnatal check within 2 months
Nama Variabel: m70_4, Label: baby postnatal check within 2 months
Nama Variabel: m70_5, Label: baby postnatal check within 2 months
Nama Variabel: m70_6, Label: baby postnatal check within 2 months
Nama Variabel: m71_1, Label: time after delivery postnatal check took place
Nama Variabel: m71_2, Label: time after delivery postnatal check took place
Nama Variabel: m71_3, Label: time after delivery postnatal check took place
Nama Variabel: m71_4, Label: time after delivery postnatal check took place
Nama Variabel: m71_5, Label: time after delivery postnatal check took place
Nama Variabel: m71_6, Label: time after delivery postnatal check took place
Nama Variabel: m72_1, Label: person who performed postnatal checkup
Nama Variabel: m72_2, Label: per

In [None]:
print("\n--- Mencari Variabel Pemeriksaan Ibu (checkup) ---")
kata_kunci = 'checkup'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita cari di variabel 'm' (maternity)
    if var.startswith('m') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Ibu (checkup) ---
Nama Variabel: m72_1, Label: person who performed postnatal checkup
Nama Variabel: m72_2, Label: person who performed postnatal checkup
Nama Variabel: m72_3, Label: person who performed postnatal checkup
Nama Variabel: m72_4, Label: person who performed postnatal checkup
Nama Variabel: m72_5, Label: person who performed postnatal checkup
Nama Variabel: m72_6, Label: person who performed postnatal checkup
---------------------------------


In [None]:
print("\n--- Mencari Variabel Kunci di Data Rumah Tangga ---")

# Cari ID Klaster dan Nomor Rumah Tangga
print("\n--> Mencari ID Klaster & RT:")
for var, label in meta_rt.column_names_to_labels.items():
    if 'cluster' in label.lower() or 'household number' in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

# Cari variabel Asuransi
print("\n--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---")

kata_kunci_list = [ 'jaminan', 'bpjs', 'insurance', 'asuransi'] # Kita coba beberapa kata kunci

for var, label in meta_ibu.column_names_to_labels.items():
    for kata_kunci in kata_kunci_list:
        if kata_kunci in label.lower():
            print(f"Nama Variabel: {var}, Label: {label}")
            break # Hentikan jika sudah ketemu agar tidak duplikat

print("---------------------------------")


--- Mencari Variabel Kunci di Data Rumah Tangga ---

--> Mencari ID Klaster & RT:
Nama Variabel: hv001, Label: cluster number
Nama Variabel: hv002, Label: household number
Nama Variabel: hv040, Label: na - cluster altitude in meters

--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---
Nama Variabel: v481, Label: covered by health insurance
Nama Variabel: v481a, Label: health insurance type: mutual/community organization
Nama Variabel: v481b, Label: health insurance type: through employer
Nama Variabel: v481c, Label: health insurance type: social security -kartu jkn/bp non pbi
Nama Variabel: v481d, Label: health insurance type: private health insurance
Nama Variabel: v481e, Label: health insurance type: health security-non contribution
Nama Variabel: v481f, Label: na - health insurance type: cs
Nama Variabel: v481g, Label: na - health insurance type: cs
Nama Variabel: v481h, Label: na - health insurance type: cs
Nama Variabel: v481x, Label: health insurance type: other
Nama Variabel

### 3. Extracting

#### 3.1 Define Required Variables

In [None]:
print("Defining Required Variables")
print("="*70)

# Mother-level variables
ibu_vars_needed = [
    'caseid',           # Unique case identifier
    'v001',             # Cluster number
    'v002',             # Household number
    'v024',             # Region/Province
    'v025',             # Type of residence (1=Urban, 2=Rural)
    'v106',             # Highest education level
    'v201',             # Total children ever born (v201)
    'v212',             # Age of respondent
    'm14_1',            # Number of ANC visits for last birth
    'm15_1',            # Place of delivery (codes 20+ = health facility)
    'v481',             # Health insurance coverage (1=Yes)
    'v208',             # Births in last five years
    'v005' ,            # Weighted

] + [f'm3{c}_1' for c in 'abcdef']

# Child-level variables
anak_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'm18',              # Size of child at birth (1-3=Normal/Large, 4-5=Small)
    'h2',               # BCG vaccination (1=Yes)
    'h4',               # DPT1 vaccination (1=Yes)
    'h6',               # Polio1 vaccination (1=Yes)
    'h8',               # Measles vaccination (1=Yes)
    'h0',                # Child's age in months
    'm70',              # Baby postnatal check within 2 months
    'm71',               # Time after delivery postnatal check took place
    'v005' ,            # Weighted
]

# Birth history variables
lahir_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'b11'            # Preceding birth interval
]

print(f"Mother variables needed: {len(ibu_vars_needed)}")
print(f"Child variables needed: {len(anak_vars_needed)}")
print(f"Birth history variables needed: {len(lahir_vars_needed)}")

Defining Required Variables
Mother variables needed: 19
Child variables needed: 11
Birth history variables needed: 3


#### 3.2 Check Data Availability & Quality

In [None]:
# ============================================================================
# VALIDATE DATASET AVAILABILITY
# ============================================================================
print("Validating Variable Availability")
print("-"*70)

def check_missing_variables(df, var_list, dataset_name):
    """Check if required variables exist in dataset"""
    missing = [v for v in var_list if v not in df.columns]
    if missing:
        print(f"\nWARNING - {dataset_name}:")
        print(f"   Missing {len(missing)} variables: {missing[:10]}")  # Show first 10
        if len(missing) > 10:
            print(f"   ... and {len(missing)-10} more")
        return False
    else:
        print(f"✓ {dataset_name}: All {len(var_list)} variables found")
        return True

# Validate each dataset
ibu_valid = check_missing_variables(df_ibu, ibu_vars_needed, "Mother Dataset")
anak_valid = check_missing_variables(df_anak, anak_vars_needed, "Child Dataset")
lahir_valid = check_missing_variables(df_lahir, lahir_vars_needed, "Birth Dataset")

if not all([ibu_valid, anak_valid, lahir_valid]):
    print("\nSome variables are missing. Please check your dataset!")

Validating Variable Availability
----------------------------------------------------------------------
✓ Mother Dataset: All 19 variables found
✓ Child Dataset: All 11 variables found
✓ Birth Dataset: All 3 variables found


In [None]:
# ============================================================================
# SELECT AND COPY VARIABLES
# ============================================================================
print("Selecting Required Variables")
print("="*70)

df_ibu_clean = df_ibu[ibu_vars_needed].copy()
df_anak_clean = df_anak[anak_vars_needed].copy()
df_lahir_clean = df_lahir[lahir_vars_needed].copy()

print(f"✓ Mother data: {df_ibu_clean.shape[0]:,} rows, {df_ibu_clean.shape[1]} columns")
print(f"✓ Child data: {df_anak_clean.shape[0]:,} rows, {df_anak_clean.shape[1]} columns")
print(f"✓ Birth data: {df_lahir_clean.shape[0]:,} rows, {df_lahir_clean.shape[1]} columns")

Selecting Required Variables
✓ Mother data: 49,627 rows, 19 columns
✓ Child data: 17,848 rows, 11 columns
✓ Birth data: 86,265 rows, 3 columns


In [None]:
# ============================================================================
# DATA CLEANING
# ============================================================================
print("Data Quality Checks & Cleaning")
print("="*70)

def handle_dhs_special_codes(df, var, missing_codes=[98, 99], method='nan'):
    """
    Handle DHS special codes (Don't Know, Missing, Not Applicable)

    Parameters:
    - df: DataFrame
    - var: Variable name
    - missing_codes: List of codes to treat as missing
    - method: 'nan' or 'zero'
    """
    if var in df.columns:
        original_missing = df[var].isna().sum()
        df[var] = df[var].replace(missing_codes, np.nan)
        new_missing = df[var].isna().sum()
        if new_missing > original_missing:
            print(f"  • {var}: Cleaned {new_missing - original_missing:,} special codes → NaN")
    return df

# Clean Mother data
print("\nCleaning Mother Dataset:")
# Corrected calls for handle_dhs_special_codes
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm14_1', missing_codes=[97, 98, 99])  # ANC visits
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm15_1', missing_codes=[97, 98, 99])  # Delivery Places
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v212', missing_codes=[98, 99])  # Age
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v106', missing_codes=[8, 9]) # Education
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v481', missing_codes=[8, 9]) # Insurance
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'b11_01', missing_codes=[98, 99]) # Birth weight


# Clean Child data
print("\nCleaning Child Dataset:")
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm18', missing_codes=[8, 9, 98, 99])  # Birth size
# Assuming immunization variables h2, h4, h6, h8 also need cleaning for 8, 9
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h2', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h4', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h6', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h8', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm70', [8,9]) # Postnatal Check
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm71', [998, 999]) # Time after delivery

# Clean Birth data
print("\nCleaning Birth Dataset:")
df_lahir_clean = handle_dhs_special_codes(df_lahir_clean, 'b11', missing_codes=[98, 99])  # Birth interval


Data Quality Checks & Cleaning

Cleaning Mother Dataset:
  • m14_1: Cleaned 79 special codes → NaN

Cleaning Child Dataset:
  • m18: Cleaned 298 special codes → NaN
  • h2: Cleaned 31 special codes → NaN
  • h4: Cleaned 28 special codes → NaN
  • h6: Cleaned 28 special codes → NaN
  • h8: Cleaned 28 special codes → NaN
  • m70: Cleaned 80 special codes → NaN
  • m71: Cleaned 123 special codes → NaN

Cleaning Birth Dataset:
  • b11: Cleaned 373 special codes → NaN


In [None]:
# ============================================================================
# FILTER TO APPROPRIATE SAMPLE
# ============================================================================
print("="*70)
print("Sample Selection")
print("="*70)

print(f"Original sample: {len(df_ibu_clean):,} mothers")

# m14_1 is only asked for most recent birth in last 5 years
df_ibu_analysis = df_ibu_clean[df_ibu_clean['v208'] > 0].copy()
print(f"Analysis sample: {len(df_ibu_analysis):,} mothers with recent births")
print(f"Filtered out: {len(df_ibu_clean) - len(df_ibu_analysis):,} mothers without recent births")

df_anak_analysis = df_anak_clean[df_anak_clean['h0']<=23].copy()
print(f"Analysis child sample: {len(df_anak_analysis):,} children aged 0–23 months")
print(f"Filtered out: {len(df_anak_clean) - len(df_anak_analysis):,} children age > 23 months")

Sample Selection
Original sample: 49,627 mothers
Analysis sample: 15,357 mothers with recent births
Filtered out: 34,270 mothers without recent births
Analysis child sample: 10,325 children aged 0–23 months
Filtered out: 7,523 children age > 23 months


In [None]:
# Show missing data summary
print("\nMissing Data Summary:")
print("-" * 50)
key_vars = ['m14_1', 'm15_1', 'v212', 'v481', 'm18', 'v220']
for var in key_vars:
    for df, name in [(df_ibu_analysis, 'ibu'), (df_anak_clean, 'anak'), (df_lahir_clean, 'lahir')]:
        if var in df.columns:
            missing_pct = (df[var].isna().sum() / len(df)) * 100
            print(f"  {name}.{var}: {missing_pct:.1f}% missing")


Missing Data Summary:
--------------------------------------------------
  ibu.m14_1: 0.6% missing
  ibu.m15_1: 0.2% missing
  ibu.v212: 0.0% missing
  ibu.v481: 0.0% missing
  anak.m18: 2.3% missing


#### 3.3 Feature Engineering

In [None]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
print("Feature Engineering")
print("-"*70)

# -----------------------------------------------------------------------------
# A. MATERNAL HEALTH SERVICE INDICATORS
# -----------------------------------------------------------------------------
print("\nA. Creating Maternal Health Service Indicators:")

# Ensure `df_ibu_analysis` is used as the base for feature engineering
df_ibu_analysis = df_ibu_analysis[df_ibu_analysis['m14_1'].notna()].copy()

# 1. ANC 4+ visits (m14_1 >= 4)
df_ibu_analysis['anc4_pct'] = (df_ibu_analysis['m14_1'] >= 4).astype(int)
print(f"4+ ANC visits: {df_ibu_analysis['anc4_pct'].mean()*100:.1f}%")

# 2. Facility delivery (m15_1: 20-36 = health facilities)
df_ibu_analysis['facility_delivery_pct'] = (
    (df_ibu_analysis['m15_1'] >= 20) & (df_ibu_analysis['m15_1'] <= 36)
).astype(int)
print(f"Facility delivery: {df_ibu_analysis['facility_delivery_pct'].mean()*100:.1f}%")

# 3. Skilled Birth Attendant
# m3a_1 = doctor, m3b_1 = nurse/midwife, m3c_1 = auxiliary midwife
sba_cols = [col for col in df_ibu_analysis.columns if col.startswith('m3') and col.endswith('_1')]
excluded_tba = ['m3f_1'] # Not Skilled
sba_cols_filtered = [c for c in sba_cols if c not in excluded_tba]

df_ibu_analysis['sba_pct'] = df_ibu_analysis[sba_cols_filtered].eq(1).any(axis=1).astype(int)
print(f"Skilled Birth Attendant: {df_ibu_analysis['sba_pct'].mean()*100:.1f}%")

# 4. Any PNC (m70 = 1 → dapat pemeriksaan nifas bayi dalam 2 bulan)
df_pnc = df_anak_analysis[df_anak_analysis['m70'].notna()].copy()
df_pnc['pnc_any_flag'] = (df_pnc['m70'] == 1).astype(int)
print(f"Any PNC (age<=23mo): {df_pnc['pnc_any_flag'].mean()*100:.1f}%")

# 5. PNC within 2 days (m71: hours/days after delivery)
m71 = df_pnc['m71']

mask_hours = m71.between(100, 171)          # 100–171: jam
mask_hours_dk = m71.isin([198, 199])        # DK in hours → tetap dianggap <= 48h
mask_days = m71.between(200, 202)           # 200–202: 0–2 hari

df_pnc['pnc48h_flag'] = (mask_hours | mask_hours_dk | mask_days).astype(int)
print(f"PNC within 2 days (age<=23mo): {df_pnc['pnc48h_flag'].mean()*100:.1f}%")

# 6. Parity
df_ibu_analysis.rename(columns={'v201': 'parity'}, inplace=True)

# -----------------------------------------------------------------------------
# B. SOCIO-DEMOGRAPHIC INDICATORS
# -----------------------------------------------------------------------------
print("\nB. Creating Socio-Demographic Indicators:")

df_ibu_analysis['insured_pct'] = (df_ibu_analysis['v481'] == 1).astype(int)
print(f"Health insurance: {df_ibu_analysis['insured_pct'].mean()*100:.1f}%")

df_ibu_analysis['urban_share_pct'] = (df_ibu_analysis['v025'] == 1).astype(int)
print(f"Urban residence: {df_ibu_analysis['urban_share_pct'].mean()*100:.1f}%")

df_ibu_analysis['low_education_pct'] = (df_ibu_analysis['v106'] <= 1).astype(int)
print(f"Low education (\u2264Primary): {df_ibu_analysis['low_education_pct'].mean()*100:.1f}%")

# -----------------------------------------------------------------------------
# C. RISK FACTORS
# -----------------------------------------------------------------------------
print("\nC. Creating Risk Factor Indicators:")

df_ibu_analysis['maternal_age_risky_pct'] = (
    (df_ibu_analysis['v212'] < 20) | (df_ibu_analysis['v212'] >= 35)
).astype(int)
print(f"Risky maternal age: {df_ibu_analysis['maternal_age_risky_pct'].mean()*100:.1f}%")

# Birth interval (from birth recode)
df_lahir_clean['short_interval_flag'] = (
    df_lahir_clean['b11'] < 24  # Less than 24 months
).astype(int)
print(f"Short birth interval (<24mo): {df_lahir_clean['short_interval_flag'].mean()*100:.1f}%")

# Child variables
df_anak_analysis['small_birth_size_pct'] = (
    df_anak_analysis['m18'].isin([4, 5])  # 4=small, 5=very small
).astype(int)
print(f"Small birth size: {df_anak_analysis['small_birth_size_pct'].mean()*100:.1f}%")

# Full immunization (h2=BCG, h3=DPT1, h5=Polio1, h9=Measles)
# Code 1 = Yes (from card or recall), 0 = No
immun_vars = ['h2', 'h4', 'h6', 'h8']
for var in immun_vars:
    if var in df_anak_analysis.columns:
        coverage = (df_anak_analysis[var] == 1).mean() * 100
        print(f"    \u2022 {var}: {coverage:.1f}%")

for var in immun_vars:
    if var in df_anak_analysis.columns:
        df_anak_analysis[f'{var}_binary'] = (df_anak_analysis[var] == 1).astype(int)

binary_vars = [f'{v}_binary' for v in immun_vars if f'{v}_binary' in df_anak_analysis.columns]
df_anak_analysis['full_immun_pct'] = df_anak_analysis[binary_vars].all(axis=1).astype(int)
print(f"\nFull immunization (all 4 vaccines): {df_anak_analysis['full_immun_pct'].mean()*100:.1f}%")

Feature Engineering
----------------------------------------------------------------------

A. Creating Maternal Health Service Indicators:
4+ ANC visits: 88.6%
Facility delivery: 75.1%
Skilled Birth Attendant: 90.2%
Any PNC (age<=23mo): 66.8%
PNC within 2 days (age<=23mo): 20.7%

B. Creating Socio-Demographic Indicators:
Health insurance: 62.0%
Urban residence: 49.4%
Low education (≤Primary): 26.3%

C. Creating Risk Factor Indicators:
Risky maternal age: 30.2%
Short birth interval (<24mo): 10.8%
Small birth size: 12.8%
    • h2: 46.7%
    • h4: 42.4%
    • h6: 39.1%
    • h8: 34.3%

Full immunization (all 4 vaccines): 33.4%


#### 3.4 Merging

In [None]:
# Mother-level indicators

df_ibu_2017 = df_ibu_analysis.copy()

# ANC 4+
df_ibu_2017['anc4_flag'] = (df_ibu_2017['m14_1'] >= 4).astype(int)

# Facility delivery (20–89 = health facility)
df_ibu_2017['facility_flag'] = df_ibu_2017['m15_1'].between(20, 89).astype(int)

# SBA (pakai loop semua m3*_1 kecuali TBA, misal m3f_1)
sba_cols = [c for c in df_ibu_2017.columns if c.startswith('m3') and c.endswith('_1')]
excluded_tba = ['m3f_1']  # sesuaikan kalau TBA di kode lain
sba_cols = [c for c in sba_cols if c not in excluded_tba]
df_ibu_2017['sba_flag'] = df_ibu_2017[sba_cols].eq(1).any(axis=1).astype(int)

# Insurance
df_ibu_2017['insured_flag'] = (df_ibu_2017['v481'] == 1).astype(int)

# Urban
df_ibu_2017['urban_flag'] = (df_ibu_2017['v025'] == 1).astype(int)

# Low education (≤ primary)
df_ibu_2017['lowedu_flag'] = df_ibu_2017['v106'].isin([0, 1]).astype(int)  # 0 = no edu, 1 = primary

# Risky maternal age (<20 or >=35)
df_ibu_2017['risky_age_flag'] = ((df_ibu_2017['v212'] < 20) | (df_ibu_2017['v212'] >= 35)).astype(int)

# Parity (bisa dipakai langsung mean)
df_ibu_2017['parity'] = df_ibu_2017['parity']

# Weighting
df_ibu_2017['weight'] = df_ibu_2017['v005'] / 1_000_000

In [None]:
# Merge Birth interval with df_ibu

df_birth = df_lahir_clean.copy()

# Flag short birth interval <24 bulan
df_birth['short_interval_flag'] = (df_birth['b11'] < 24).astype(int)

# Aggregate ke level ibu: kalau ada kelahiran dengan interval pendek → ibu dianggap punya risiko
df_interval_mother = (
    df_birth
    .groupby('caseid')['short_interval_flag']
    .max()
    .reset_index()
)

# Merge ke df_ibu (1-1, aman)
df_ibu_2017 = df_ibu_2017.merge(df_interval_mother, on='caseid', how='left')
df_ibu_2017['short_interval_flag'] = df_ibu_2017['short_interval_flag'].fillna(0).astype(int)

In [None]:
# Child-level indicators

df_child_2017 = df_anak_analysis.copy()

# LBW proxy: small_birth_size (m18: 4/5 = small/very small, 1–3 = average/large)
df_child_2017['small_birth_flag'] = df_child_2017['m18'].isin([4, 5]).astype(int)

# Immunization flags (anggap 1 = received, 2/3 juga treated as received kalau ada)
for col in ['h2', 'h4', 'h6', 'h8']:
    df_child_2017[col + '_flag'] = df_child_2017[col].isin([1, 2, 3]).astype(int)

# Full immunization (semua 4 vaksin diterima)
imm_cols = ['h2_flag', 'h4_flag', 'h6_flag', 'h8_flag']
df_child_2017['full_immun_flag'] = df_child_2017[imm_cols].all(axis=1).astype(int)

# Weighting
df_child_2017['weight'] = df_child_2017['v005'] / 1_000_000

In [None]:
# PNC indicators

df_pnc = df_child_2017[df_child_2017['m70'].notna()].copy()

# Any PNC
df_pnc['pnc_any_flag'] = (df_pnc['m70'] == 1).astype(int)

# PNC ≤48h (pakai coding numeric)
m71 = df_pnc['m71']
mask_hours = m71.between(100, 171)
mask_hours_dk = m71.isin([198, 199])
mask_days = m71.between(200, 202)
df_pnc['pnc48h_flag'] = (mask_hours | mask_hours_dk | mask_days).astype(int)


In [None]:
group_cols = ['v024']  # v024 = province

# Aggregate Mother
prov_mother = (
    df_ibu_2017
    .groupby(group_cols)
    .agg(
        anc4_pct=('anc4_flag', 'mean'),
        facility_delivery_pct=('facility_flag', 'mean'),
        sba_pct=('sba_flag', 'mean'),
        insured_pct=('insured_flag', 'mean'),
        urban_share_pct=('urban_flag', 'mean'),
        low_education_pct=('lowedu_flag', 'mean'),
        risky_maternal_age_pct=('risky_age_flag', 'mean'),
        birth_interval_short_pct=('short_interval_flag', 'mean'),
        avg_parity=('parity', 'mean')
    )
    .reset_index()
)

# Aggregate Child
prov_child = (
    df_child_2017
    .groupby(group_cols)
    .agg(
        lbw_pct=('small_birth_flag', 'mean'),
        full_immun_pct=('full_immun_flag', 'mean')
    )
    .reset_index()
)

# Aggregate PNC
prov_pnc = (
    df_pnc
    .groupby(group_cols)
    .agg(
        pnc_any_pct=('pnc_any_flag', 'mean'),
        pnc48h_pct=('pnc48h_flag', 'mean')
    )
    .reset_index()
)

In [None]:
df_2017 = prov_mother.merge(prov_child, on=['v024'], how='left')
df_2017 = df_2017.merge(prov_pnc, on=['v024'], how='left')

df_2017.rename(columns={'v024': 'region'}, inplace=True)


In [None]:
prov_map = {
    11: "Aceh",
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kepulauan Bangka Belitung",
    21: "Kepulauan Riau",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    65: "Kalimantan Utara",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
    76: "Sulawesi Barat",
    81: "Maluku",
    82: "Maluku Utara",
    91: "Papua",
    92: "Papua Barat",
    94: "Papua Barat"
}

df_2017['province'] = df_2017['region'].map(prov_map)

# cek apakah ada kode yang belum ter-map
unmapped = df_2017[df_2017['province'].isna()]['region'].unique()
print("Unmapped province codes:", unmapped)

Unmapped province codes: []


In [None]:
cols = df_2017.columns.tolist()

new_order = ['region', 'province'] + [c for c in cols if c not in ['region', 'province']]

# Reorder dataframe
df_2012 = df_2017[new_order]

In [None]:
# ============================================================================
# FINAL DATA QUALITY SUMMARY
# ============================================================================
print("="*80)
print("FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS")
print("="*80)

# 1. Basic dataset info
print(f"\nTotal observations        : {len(df_2017):,}")
print(f"Unique provinces          : {df_2017['region'].nunique()}")
print(f"Total indicators          : {df_2017.shape[1] - 2} (excluding region/year)")

# 2. Missing check
missing = df_2017.isna().sum()
if missing.sum() == 0:
    print("\nMissing values            : 0")
else:
    print("\nMissing values:")
    print(missing[missing > 0])

# 3. Indicator ranges (auto-detect pct columns)
print("\nIndicator Ranges by Province:")
print("-" * 80)

indicator_cols = [
    col for col in df_2017.columns
    if col not in ['region', 'province'] # Exclude 'province' column from numerical checks
]

for col in indicator_cols:
    col_min = df_2017[col].min()
    col_max = df_2017[col].max()

    # convert to percentage if value seems 0-1
    if col_max <= 1.5:
        print(f"{col:30s}: {col_min*100:6.1f}% – {col_max*100:6.1f}%")
    else:
        print(f"{col:30s}: {col_min:6.2f} – {col_max:6.2f}")

# 4. Quick descriptive stats
print("\nQuick Descriptive Stats:")
print("-" * 80)
print(df_2017[indicator_cols].describe().T[['mean','std','min','max']])

print("\nData preview:")
print(df_2017.head())

print("\nPROCESSING COMPLETE ✔")
print("="*80)


FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS

Total observations        : 34
Unique provinces          : 34
Total indicators          : 13 (excluding region/year)

Missing values            : 0

Indicator Ranges by Province:
--------------------------------------------------------------------------------
anc4_pct                      :   70.3% –   98.1%
facility_delivery_pct         :   34.1% –   99.4%
sba_pct                       :   72.0% –   99.3%
insured_pct                   :   40.5% –   89.7%
urban_share_pct               :   18.9% –  100.0%
low_education_pct             :    5.0% –   43.8%
risky_maternal_age_pct        :   14.3% –   46.0%
birth_interval_short_pct      :    5.0% –   39.8%
avg_parity                    :   1.89 –   3.28
lbw_pct                       :    6.5% –   26.9%
full_immun_pct                :   26.7% –   77.4%
pnc_any_pct                   :   37.7% –   94.1%
pnc48h_pct                    :    4.0% –   52.6%

Quick Descriptive Stats:
--------------

In [None]:
print("Saving Results")

df_2017.to_csv('dhs_regional_2017.csv', index=False)
print("Files saved successfully!")

Saving Results
Files saved successfully!


## B. Data DHS (2012)

### 1. Load Data

In [None]:
print("Preprocessing Data SDKI 2012...")

ibu_dta_path = 'IDIR63FL.DTA' # File ibu
anak_dta_path = 'IDKR63FL.DTA' # File anak
lahir_dta_path = 'IDBR63FL.DTA' # File Kelahiran

print("\nMembaca file data Stata (.DTA)...")

try:
    df_ibu, meta_ibu = pyreadstat.read_dta(ibu_dta_path)
    df_anak, meta_anak = pyreadstat.read_dta(anak_dta_path)
    df_lahir, meta_lahir = pyreadstat.read_dta(lahir_dta_path)
    print("File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.")
except Exception as e:
    print(f"Error saat membaca file: {e}")
    exit()

df_ibu.columns = df_ibu.columns.str.lower()
df_anak.columns = df_anak.columns.str.lower()
df_lahir.columns = df_lahir.columns.str.lower()

print("\n--- Check Dataset ---")
print("Jumlah kolom di df_ibu:", len(df_ibu.columns))
print("Jumlah kolom di df_anak:", len(df_anak.columns))
print("Jumlah kolom di df_lahir:", len(df_lahir.columns))
print("-------------------------------------------------------")

Preprocessing Data SDKI 2012...

Membaca file data Stata (.DTA)...
File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.

--- Check Dataset ---
Jumlah kolom di df_ibu: 4167
Jumlah kolom di df_anak: 1136
Jumlah kolom di df_lahir: 1136
-------------------------------------------------------


### 2. Check Variable Name

In [None]:
print("\n--- Mencari Variabel Pendidikan ---")
kata_kunci = 'educa'

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci in var.lower() or kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pendidikan ---
Nama Variabel: v106, Label: highest educational level
Nama Variabel: v107, Label: highest year of education
Nama Variabel: v133, Label: education in single years
Nama Variabel: v149, Label: educational attainment
Nama Variabel: awfacte, Label: all woman factor - educational
Nama Variabel: v701, Label: husband/partner's education level
Nama Variabel: v702, Label: husband/partner's highest year of education (at level in v701)
Nama Variabel: v715, Label: husband/partner's total number of years of education
Nama Variabel: v729, Label: husband/partner's educational attainment
Nama Variabel: s105, Label: highest educational level
Nama Variabel: s804, Label: partner's level of education
---------------------------------


In [None]:
print("\n--- Mencari Variabel Ukuran Bayi Lahir ---")
# meta_anak berisi "kamus" untuk df_anak
kata_kunci_1 = 'size'  # Kata kunci: ukuran
kata_kunci_2 = 'weight' # Kata kunci: berat

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_anak.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Ukuran Bayi Lahir ---
Nama Variabel: v005, Label: women's individual sample weight (6 decimals)
Nama Variabel: v419, Label: entries in height/weight table
Nama Variabel: v437, Label: na - respondent's weight in kilograms (1 decimal)
Nama Variabel: v442, Label: na - weight/height percent ref. median (dhs)
Nama Variabel: v443, Label: na - weight/height percent ref. median (fog)
Nama Variabel: v444, Label: na - weight/height percent ref. median (who)
Nama Variabel: v444a, Label: weight/height standard deviation (dhs)
Nama Variabel: v447, Label: na - result of measurement - height/weight
Nama Variabel: m18, Label: size of child at birth
Nama Variabel: m19, Label: birth weight in kilograms (3 decimals)
Nama Variabel: m19a, Label: weight at birth/recall
Nama Variabel: hw2, Label: na - child's weight in kilograms (1 decimal)
Nama Variabel: hw7, Label: na - weight/age percentile
Nama Variabel: hw8, Label: weight/age standard deviation
Nama Variabel: hw9, Label: na - weigh

In [None]:
print("\n--- Mencari Variabel Penolong Persalinan ---")
kata_kunci_1 = 'order'
kata_kunci_2 = 'birth'

for var, label in meta_lahir.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Penolong Persalinan ---
Nama Variabel: bidx, Label: birth column number
Nama Variabel: v009, Label: respondent's month of birth
Nama Variabel: v010, Label: respondent's year of birth
Nama Variabel: v011, Label: date of birth (cmc)
Nama Variabel: v208, Label: births in last five years
Nama Variabel: v209, Label: births in past year
Nama Variabel: v210, Label: births in month of interview
Nama Variabel: v211, Label: date of first birth (cmc)
Nama Variabel: v212, Label: age of respondent at 1st birth
Nama Variabel: v221, Label: marriage to first birth interval (months)
Nama Variabel: v222, Label: last birth to interview (months)
Nama Variabel: v224, Label: entries in birth history
Nama Variabel: v237, Label: birth between last and interview
Nama Variabel: v238, Label: births in last three years
Nama Variabel: v401, Label: last birth a caesarean section
Nama Variabel: v468, Label: record for last birth
Nama Variabel: v603, Label: preferred waiting time for birth of a/

In [None]:
print("\n--- Mencari Semua Variabel Bantuan Persalinan ---")
kata_kunci = 'assistance'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita hanya cari di variabel yang berawalan 'm3' untuk menjaga relevansi
    if var.startswith('m3') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Semua Variabel Bantuan Persalinan ---
Nama Variabel: m3a_1, Label: assistance: doctor
Nama Variabel: m3a_2, Label: assistance: doctor
Nama Variabel: m3a_3, Label: assistance: doctor
Nama Variabel: m3a_4, Label: assistance: doctor
Nama Variabel: m3a_5, Label: assistance: doctor
Nama Variabel: m3a_6, Label: assistance: doctor
Nama Variabel: m3b_1, Label: assistance: obstetrician
Nama Variabel: m3b_2, Label: assistance: obstetrician
Nama Variabel: m3b_3, Label: assistance: obstetrician
Nama Variabel: m3b_4, Label: assistance: obstetrician
Nama Variabel: m3b_5, Label: assistance: obstetrician
Nama Variabel: m3b_6, Label: assistance: obstetrician
Nama Variabel: m3c_1, Label: assistance: nurse
Nama Variabel: m3c_2, Label: assistance: nurse
Nama Variabel: m3c_3, Label: assistance: nurse
Nama Variabel: m3c_4, Label: assistance: nurse
Nama Variabel: m3c_5, Label: assistance: nurse
Nama Variabel: m3c_6, Label: assistance: nurse
Nama Variabel: m3d_1, Label: assistance: midwife
Nama V

In [None]:
print("\n--- Mencari Variabel Pemeriksaan Nifas Ibu ---")
kata_kunci_1 = 'postnatal'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Nifas Ibu ---
Nama Variabel: m70_1, Label: baby postnatal check within 2 months
Nama Variabel: m70_2, Label: baby postnatal check within 2 months
Nama Variabel: m70_3, Label: baby postnatal check within 2 months
Nama Variabel: m70_4, Label: baby postnatal check within 2 months
Nama Variabel: m70_5, Label: baby postnatal check within 2 months
Nama Variabel: m70_6, Label: baby postnatal check within 2 months
Nama Variabel: m71_1, Label: time after delivery postnatal check took place
Nama Variabel: m71_2, Label: time after delivery postnatal check took place
Nama Variabel: m71_3, Label: time after delivery postnatal check took place
Nama Variabel: m71_4, Label: time after delivery postnatal check took place
Nama Variabel: m71_5, Label: time after delivery postnatal check took place
Nama Variabel: m71_6, Label: time after delivery postnatal check took place
Nama Variabel: m72_1, Label: person who performed postnatal checkup
Nama Variabel: m72_2, Label: per

In [None]:
# --- KODE DETEKTIF: Cari kata kunci "checkup" ---
print("\n--- Mencari Variabel Pemeriksaan Ibu (checkup) ---")
kata_kunci = 'checkup'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita cari di variabel 'm' (maternity)
    if var.startswith('m') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Ibu (checkup) ---
Nama Variabel: m51_1, Label: respondent's checkup after deliver timing
Nama Variabel: m51_2, Label: respondent's checkup after deliver timing
Nama Variabel: m51_3, Label: respondent's checkup after deliver timing
Nama Variabel: m51_4, Label: respondent's checkup after deliver timing
Nama Variabel: m51_5, Label: respondent's checkup after deliver timing
Nama Variabel: m51_6, Label: respondent's checkup after deliver timing
Nama Variabel: m72_1, Label: person who performed postnatal checkup
Nama Variabel: m72_2, Label: person who performed postnatal checkup
Nama Variabel: m72_3, Label: person who performed postnatal checkup
Nama Variabel: m72_4, Label: person who performed postnatal checkup
Nama Variabel: m72_5, Label: person who performed postnatal checkup
Nama Variabel: m72_6, Label: person who performed postnatal checkup
---------------------------------


In [None]:
# Cari variabel Asuransi
# --- KODE DETEKTIF: Cari variabel Asuransi dengan kata kunci baru ---
print("\n--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---")

kata_kunci_list = [ 'jaminan', 'bpjs', 'insurance', 'asuransi'] # Kita coba beberapa kata kunci

for var, label in meta_ibu.column_names_to_labels.items():
    for kata_kunci in kata_kunci_list:
        if kata_kunci in label.lower():
            print(f"Nama Variabel: {var}, Label: {label}")
            break # Hentikan jika sudah ketemu agar tidak duplikat

print("---------------------------------")


--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---
Nama Variabel: v481, Label: covered by health insurance
Nama Variabel: v481a, Label: na - health insurance type: mutual/community organization
Nama Variabel: v481b, Label: health insurance type: provided by employer
Nama Variabel: v481c, Label: health insurance type: social security
Nama Variabel: v481d, Label: health insurance type: private/commercially purchased
Nama Variabel: v481e, Label: na - health insurance type: cs
Nama Variabel: v481f, Label: na - health insurance type: cs
Nama Variabel: v481g, Label: na - health insurance type: cs
Nama Variabel: v481h, Label: na - health insurance type: cs
Nama Variabel: v481x, Label: health insurance type: other
Nama Variabel: s1010a, Label: covered by health insurance: health donation
Nama Variabel: s1010b, Label: covered by health insurance: jpk pns/ veteran/ pensiun
Nama Variabel: s1010c, Label: covered by health insurance: jpk jamsostek
Nama Variabel: s1010d, Label: covered by healt

In [None]:
# Cari variabel Asuransi
# --- KODE DETEKTIF: Cari variabel Asuransi dengan kata kunci baru ---
print("\n--- Mencari Variabel Age ---")

kata_kunci_list = [ 'ages', 'age'] # Kita coba beberapa kata kunci

for var, label in meta_lahir.column_names_to_labels.items():
    for kata_kunci in kata_kunci_list:
        if kata_kunci in label.lower():
            print(f"Nama Variabel: {var}, Label: {label}")
            break # Hentikan jika sudah ketemu agar tidak duplikat

print("---------------------------------")


--- Mencari Variabel Age ---
Nama Variabel: v012, Label: respondent's current age
Nama Variabel: v013, Label: age in 5-year groups
Nama Variabel: v014, Label: completeness of age information
Nama Variabel: v152, Label: age of household head
Nama Variabel: v212, Label: age of respondent at 1st birth
Nama Variabel: v221, Label: marriage to first birth interval (months)
Nama Variabel: v320, Label: age at sterilization
Nama Variabel: v372, Label: na - shown pill package
Nama Variabel: v372a, Label: na - shown condom package
Nama Variabel: v3a00f, Label: source of family planning for non-users: village health post
Nama Variabel: v3a00s, Label: source of family planning for non-users: private village midwife
Nama Variabel: v439, Label: na - height/age percentile
Nama Variabel: v440, Label: height/age standard deviation
Nama Variabel: v441, Label: na - height/age percent ref. median
Nama Variabel: v447a, Label: na - women's age in years (from household questionnaire)
Nama Variabel: v452a, La

### 3. Extracting

#### 3.1 Define Required Variables

In [None]:
print("Defining Required Variables")
print("="*70)

# Mother-level variables
ibu_vars_needed = [
    'caseid',           # Unique case identifier
    'v001',             # Cluster number
    'v002',             # Household number
    'v024',             # Region/Province
    'v025',             # Type of residence (1=Urban, 2=Rural)
    'v106',             # Highest education level
    'v201',             # Total children ever born (v201)
    'v212',             # Age of respondent
    'm14_1',            # Number of ANC visits for last birth
    'm15_1',            # Place of delivery (codes 20+ = health facility)
    'v481',             # Health insurance coverage (1=Yes)
    'v208',             # Births in last five years
    'v005',             # Weighted
] + [f'm3{c}_1' for c in 'abcdef']

# Child-level variables
anak_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'm18',              # Size of child at birth (1-3=Normal/Large, 4-5=Small)
    'h2',               # BCG vaccination (1=Yes)
    'h4',               # DPT1 vaccination (1=Yes)
    'h6',               # Polio1 vaccination (1=Yes)
    'h8',               # Measles vaccination (1=Yes)
    'v008',             # Interview Date (to calculate age)
    'b3',               # Date of birth of child (to calculate age)
    'm70',              # Baby postnatal check within 2 months
    'm71',              # Time after delivery postnatal check took place
    'v005',             # Weighted
]

# Birth history variables
lahir_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'b11',               # Preceding birth interval
    'bord'              # birth order
]

print(f"Mother variables needed: {len(ibu_vars_needed)}")
print(f"Child variables needed: {len(anak_vars_needed)}")
print(f"Birth history variables needed: {len(lahir_vars_needed)}")

Defining Required Variables
Mother variables needed: 19
Child variables needed: 12
Birth history variables needed: 4


#### 3.2 Check Data Availability & Quality

In [None]:
# ============================================================================
# VALIDATE DATASET AVAILABILITY
# ============================================================================
print("Validating Variable Availability")
print("-"*70)

def check_missing_variables(df, var_list, dataset_name):
    """Check if required variables exist in dataset"""
    missing = [v for v in var_list if v not in df.columns]
    if missing:
        print(f"\nWARNING - {dataset_name}:")
        print(f"   Missing {len(missing)} variables: {missing[:10]}")  # Show first 10
        if len(missing) > 10:
            print(f"   ... and {len(missing)-10} more")
        return False
    else:
        print(f"✓ {dataset_name}: All {len(var_list)} variables found")
        return True

# Validate each dataset
ibu_valid = check_missing_variables(df_ibu, ibu_vars_needed, "Mother Dataset")
anak_valid = check_missing_variables(df_anak, anak_vars_needed, "Child Dataset")
lahir_valid = check_missing_variables(df_lahir, lahir_vars_needed, "Birth Dataset")

if not all([ibu_valid, anak_valid, lahir_valid]):
    print("\nSome variables are missing. Please check your dataset!")

Validating Variable Availability
----------------------------------------------------------------------
✓ Mother Dataset: All 19 variables found
✓ Child Dataset: All 12 variables found
✓ Birth Dataset: All 4 variables found


In [None]:
# ============================================================================
# SELECT AND COPY VARIABLES
# ============================================================================
print("Selecting Required Variables")
print("="*70)

df_ibu_clean = df_ibu[ibu_vars_needed].copy()
df_anak_clean = df_anak[anak_vars_needed].copy()
df_lahir_clean = df_lahir[lahir_vars_needed].copy()

print(f"✓ Mother data: {df_ibu_clean.shape[0]:,} rows, {df_ibu_clean.shape[1]} columns")
print(f"✓ Child data: {df_anak_clean.shape[0]:,} rows, {df_anak_clean.shape[1]} columns")
print(f"✓ Birth data: {df_lahir_clean.shape[0]:,} rows, {df_lahir_clean.shape[1]} columns")

Selecting Required Variables
✓ Mother data: 45,607 rows, 19 columns
✓ Child data: 18,021 rows, 12 columns
✓ Birth data: 83,650 rows, 4 columns


In [None]:
# ============================================================================
# DATA CLEANING
# ============================================================================
print("Data Quality Checks & Cleaning")
print("="*70)

def handle_dhs_special_codes(df, var, missing_codes=[98, 99], method='nan'):
    """
    Handle DHS special codes (Don't Know, Missing, Not Applicable)

    Parameters:
    - df: DataFrame
    - var: Variable name
    - missing_codes: List of codes to treat as missing
    - method: 'nan' or 'zero'
    """
    if var in df.columns:
        original_missing = df[var].isna().sum()
        df[var] = df[var].replace(missing_codes, np.nan)
        new_missing = df[var].isna().sum()
        if new_missing > original_missing:
            print(f"  • {var}: Cleaned {new_missing - original_missing:,} special codes → NaN")
    return df

# Clean Mother data
print("\nCleaning Mother Dataset:")
# Corrected calls for handle_dhs_special_codes
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm14_1', missing_codes=[97, 98, 99])  # ANC visits
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm15_1', missing_codes=[97, 98, 99])  # Delivery Places
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v212', missing_codes=[98, 99])  # Age
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v106', missing_codes=[8, 9]) # Education
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v481', missing_codes=[8, 9]) # Insurance
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'b11_01', missing_codes=[98, 99]) # Birth weight


# Clean Child data
print("\nCleaning Child Dataset:")
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm18', missing_codes=[8, 9, 98, 99])  # Birth size
# Assuming immunization variables h2, h4, h6, h8 also need cleaning for 8, 9
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h2', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h4', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h6', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h8', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm70', [8,9]) # Postnatal Check
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm71', [998, 999]) # Time after delivery
required_cols = ['v008', 'b3']
missing_cols = [c for c in required_cols if c not in df_anak_clean.columns]
if missing_cols:
    raise ValueError(f"Missing required columns for age calculation: {missing_cols}")

# Hitung umur dalam bulan
df_anak_clean['age_months'] = df_anak_clean['v008'] - df_anak_clean['b3']

# Bersihkan nilai-nilai tidak valid
df_anak_clean.loc[df_anak_clean['age_months'] < 0, 'age_months'] = None  # impossible
df_anak_clean.loc[df_anak_clean['age_months'] > 200, 'age_months'] = None  # upper sanity bound

print("=== AGE MONTHS CHECK ===")
print(df_anak_clean['age_months'].describe())
print("Invalid ages:", df_anak_clean['age_months'].isna().sum())

# Clean Birth data
print("\nCleaning Birth Dataset:")
df_lahir_clean = handle_dhs_special_codes(df_lahir_clean, 'b11', missing_codes=[98, 99])  # Birth interval


Data Quality Checks & Cleaning

Cleaning Mother Dataset:
  • m14_1: Cleaned 139 special codes → NaN

Cleaning Child Dataset:
  • m18: Cleaned 570 special codes → NaN
  • h2: Cleaned 73 special codes → NaN
  • h4: Cleaned 72 special codes → NaN
  • h6: Cleaned 72 special codes → NaN
  • h8: Cleaned 72 special codes → NaN
  • m70: Cleaned 86 special codes → NaN
  • m71: Cleaned 337 special codes → NaN
=== AGE MONTHS CHECK ===
count    18021.000000
mean        29.267965
std         17.289612
min          0.000000
25%         14.000000
50%         29.000000
75%         44.000000
max         59.000000
Name: age_months, dtype: float64
Invalid ages: 0

Cleaning Birth Dataset:
  • b11: Cleaned 297 special codes → NaN


In [None]:
# ============================================================================
# FILTER TO APPROPRIATE SAMPLE
# ============================================================================
print("="*70)
print("Sample Selection")
print("="*70)

print(f"Original sample: {len(df_ibu_clean):,} mothers")

# m14_1 is only asked for most recent birth in last 5 years
df_ibu_analysis = df_ibu_clean[df_ibu_clean['v208'] > 0].copy()
print(f"Analysis sample: {len(df_ibu_analysis):,} mothers with recent births")
print(f"Filtered out: {len(df_ibu_clean) - len(df_ibu_analysis):,} mothers without recent births")

df_anak_analysis = df_anak_clean[df_anak_clean['age_months'].between(0, 23)].copy()
print(f"Analysis child sample: {len(df_anak_analysis):,}")
print(f"Filtered out: {len(df_anak_clean) - len(df_anak_analysis):,}")

Sample Selection
Original sample: 45,607 mothers
Analysis sample: 15,262 mothers with recent births
Filtered out: 30,345 mothers without recent births
Analysis child sample: 7,334
Filtered out: 10,687


In [None]:
# Show missing data summary
print("\nMissing Data Summary:")
print("-" * 50)
key_vars = ['m14_1', 'm15_1', 'v212', 'v481', 'm18', 'v220']
for var in key_vars:
    for df, name in [(df_ibu_clean, 'ibu'), (df_anak_clean, 'anak'), (df_lahir_clean, 'lahir')]:
        if var in df.columns:
            missing_pct = (df[var].isna().sum() / len(df)) * 100
            print(f"  {name}.{var}: {missing_pct:.1f}% missing")


Missing Data Summary:
--------------------------------------------------
  ibu.m14_1: 66.8% missing
  ibu.m15_1: 66.6% missing
  ibu.v212: 29.6% missing
  ibu.v481: 0.1% missing
  anak.m18: 4.0% missing


#### 3.3 Feature Engineering

In [None]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
print("Feature Engineering")
print("-"*70)

# -----------------------------------------------------------------------------
# A. MATERNAL HEALTH SERVICE INDICATORS
# -----------------------------------------------------------------------------
print("\nA. Creating Maternal Health Service Indicators:")

# Ensure `df_ibu_analysis` is used as the base for feature engineering
df_ibu_analysis = df_ibu_analysis[df_ibu_analysis['m14_1'].notna()].copy()

# 1. ANC 4+ visits (m14_1 >= 4)
df_ibu_analysis['anc4_pct'] = (df_ibu_analysis['m14_1'] >= 4).astype(int)
print(f"4+ ANC visits: {df_ibu_analysis['anc4_pct'].mean()*100:.1f}%")

# 2. Facility delivery (m15_1: 20-36 = health facilities)
df_ibu_analysis['facility_delivery_pct'] = (
    ((df_ibu_analysis['m15_1'] >= 21) & (df_ibu_analysis['m15_1'] <= 27)) |
    ((df_ibu_analysis['m15_1'] >= 31) & (df_ibu_analysis['m15_1'] <= 37))
).astype(int)
print(f"Facility delivery: {df_ibu_analysis['facility_delivery_pct'].mean()*100:.1f}%")

# 3. Skilled Birth Attendant
# m3a_1 = doctor, m3b_1 = nurse/midwife, m3c_1 = auxiliary midwife
sba_cols = [col for col in df_ibu_analysis.columns if col.startswith('m3') and col.endswith('_1')]
excluded_tba = ['m3f_1'] # Not Skilled
sba_cols_filtered = [c for c in sba_cols if c not in excluded_tba]

df_ibu_analysis['sba_pct'] = df_ibu_analysis[sba_cols_filtered].eq(1).any(axis=1).astype(int)
print(f"Skilled Birth Attendant: {df_ibu_analysis['sba_pct'].mean()*100:.1f}%")

# 4. Any PNC (m70 = 1 → dapat pemeriksaan nifas bayi dalam 2 bulan)
df_pnc = df_anak_analysis[df_anak_analysis['m70'].notna()].copy()
df_pnc['pnc_any_flag'] = (df_pnc['m70'] == 1).astype(int)
print(f"Any PNC (age<=23mo): {df_pnc['pnc_any_flag'].mean()*100:.1f}%")

# 5. PNC within 2 days (m71: hours/days after delivery)
m71 = df_pnc['m71']

mask_hours = m71.between(100, 171)          # 100–171: jam
mask_hours_dk = m71.isin([198, 199])        # DK in hours → tetap dianggap <= 48h
mask_days = m71.between(200, 202)           # 200–202: 0–2 hari

df_pnc['pnc48h_flag'] = (mask_hours | mask_hours_dk | mask_days).astype(int)
print(f"PNC within 2 days (age<=23mo): {df_pnc['pnc48h_flag'].mean()*100:.1f}%")

# 6. Parity
df_ibu_analysis.rename(columns={'v201': 'parity'}, inplace=True)

# -----------------------------------------------------------------------------
# B. SOCIO-DEMOGRAPHIC INDICATORS
# -----------------------------------------------------------------------------
print("\nB. Creating Socio-Demographic Indicators:")

df_ibu_analysis['insured_pct'] = (df_ibu_analysis['v481'] == 1).astype(int)
print(f"Health insurance: {df_ibu_analysis['insured_pct'].mean()*100:.1f}%")

df_ibu_analysis['urban_share_pct'] = (df_ibu_analysis['v025'] == 1).astype(int)
print(f"Urban residence: {df_ibu_analysis['urban_share_pct'].mean()*100:.1f}%")

df_ibu_analysis['low_education_pct'] = (df_ibu_analysis['v106'] <= 1).astype(int)
print(f"Low education (\u2264Primary): {df_ibu_analysis['low_education_pct'].mean()*100:.1f}%")

# -----------------------------------------------------------------------------
# C. RISK FACTORS
# -----------------------------------------------------------------------------
print("\nC. Creating Risk Factor Indicators:")

df_ibu_analysis['maternal_age_risky_pct'] = (
    (df_ibu_analysis['v212'] < 20) | (df_ibu_analysis['v212'] >= 35)
).astype(int)
print(f"Risky maternal age: {df_ibu_analysis['maternal_age_risky_pct'].mean()*100:.1f}%")

# Birth interval (from birth recode)
mask_multibirth = df_lahir_clean['bord'] > 1

df_lahir_clean['short_interval_flag'] = np.nan
df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'] = (
    df_lahir_clean.loc[mask_multibirth, 'b11'] < 24
).astype(int)

print(f"Short birth interval (<24mo): {df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'].mean()*100:.1f}%")

# Child variables
df_anak_analysis['small_birth_size_pct'] = (
    df_anak_analysis['m18'].isin([4, 5])  # 4=small, 5=very small
).astype(int)
print(f"Small birth size: {df_anak_analysis['small_birth_size_pct'].mean()*100:.1f}%")

valid_yes = [1, 2, 3]
mask_12_23 = df_anak_clean['age_months'].between(12, 23)

# Default: NaN (anak di luar 12–23 bulan tidak ikut denominator)
# Full immunization (h2=BCG, h4=DPT1, h6=Polio3, h8=Measles)
# Code 1/2/3 = Yes (card/recall/marked), 0 = No, 8 = Don't know
df_anak_clean['full_immun_binary'] = np.nan

df_anak_clean.loc[mask_12_23, 'full_immun_binary'] = (
    df_anak_clean.loc[mask_12_23, 'h2'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h4'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h6'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h8'].isin(valid_yes)
).astype(int)

print(
    "Full immunization (12–23 mo, all 4 vaccines): "
    f"{df_anak_clean.loc[mask_12_23, 'full_immun_binary'].mean()*100:.1f}%"
)

Feature Engineering
----------------------------------------------------------------------

A. Creating Maternal Health Service Indicators:
4+ ANC visits: 84.4%
Facility delivery: 52.8%
Skilled Birth Attendant: 80.8%
Any PNC (age<=23mo): 63.6%
PNC within 2 days (age<=23mo): 46.0%

B. Creating Socio-Demographic Indicators:
Health insurance: 40.0%
Urban residence: 45.9%
Low education (≤Primary): 33.5%

C. Creating Risk Factor Indicators:
Risky maternal age: 33.2%
Short birth interval (<24mo): 21.2%
Small birth size: 13.8%
Full immunization (12–23 mo, all 4 vaccines): 69.5%


#### 3.4 Merging

In [None]:
# Mother-level indicators

df_ibu_2012 = df_ibu_analysis.copy()

# ANC 4+
df_ibu_2012['anc4_flag'] = (df_ibu_2012['m14_1'] >= 4).astype(int)

# Facility delivery (20–89 = health facility)
df_ibu_2012['facility_flag'] = df_ibu_2012['m15_1'].between(20, 89).astype(int)

# SBA (pakai loop semua m3*_1 kecuali TBA, misal m3f_1)
sba_cols = [c for c in df_ibu_2012.columns if c.startswith('m3') and c.endswith('_1')]
excluded_tba = ['m3f_1']  # sesuaikan kalau TBA di kode lain
sba_cols = [c for c in sba_cols if c not in excluded_tba]
df_ibu_2012['sba_flag'] = df_ibu_2012[sba_cols].eq(1).any(axis=1).astype(int)

# Insurance
df_ibu_2012['insured_flag'] = (df_ibu_2012['v481'] == 1).astype(int)

# Urban
df_ibu_2012['urban_flag'] = (df_ibu_2012['v025'] == 1).astype(int)

# Low education (≤ primary)
df_ibu_2012['lowedu_flag'] = df_ibu_2012['v106'].isin([0, 1]).astype(int)  # 0 = no edu, 1 = primary

# Risky maternal age (<20 or >=35)
df_ibu_2012['risky_age_flag'] = ((df_ibu_2012['v212'] < 20) | (df_ibu['v212'] >= 35)).astype(int)

# Parity (bisa dipakai langsung mean)
df_ibu_2012['parity'] = df_ibu_2012['parity']

# Weighting
df_ibu_2012['weight'] = df_ibu_2012['v005'] / 1_000_000

In [None]:
df_birth = df_lahir_clean.copy()

# Hanya kelahiran dengan older sibling yang punya interval
df_birth_valid = df_birth[df_birth['bord'] > 1].copy()

df_birth_valid['short_interval_flag'] = (df_birth_valid['b11'] < 24).astype(int)

# Aggregate ke ibu: jika ada short interval pada salah satu kelahiran → ibu berisiko
df_interval_mother = (
    df_birth_valid
    .groupby('caseid')['short_interval_flag']
    .max()
    .reset_index()
)

# Merge ke ibu
df_ibu_2012 = df_ibu_2012.merge(df_interval_mother, on='caseid', how='left')

# Ibu yang hanya punya 1 anak (tidak punya interval) = NA → bukan risiko, assign 0
df_ibu_2012['short_interval_flag'] = df_ibu_2012['short_interval_flag'].fillna(0).astype(int)

In [None]:
# Child-level indicators

df_child_2012 = df_anak_analysis.copy()

# LBW proxy: small_birth_size (m18: 4/5 = small/very small, 1–3 = average/large)
df_child_2012['small_birth_flag'] = df_child_2012['m18'].isin([4, 5]).astype(int)

# Immunization flags (anggap 1 = received, 2/3 juga treated as received kalau ada)
for col in ['h2', 'h4', 'h6', 'h8']:
    df_child_2012[col + '_flag'] = df_child_2012[col].isin([1, 2, 3]).astype(int)

# Full immunization (semua 4 vaksin diterima)
imm_cols = ['h2_flag', 'h4_flag', 'h6_flag', 'h8_flag']
df_child_2012['full_immun_flag'] = df_child_2012[imm_cols].all(axis=1).astype(int)

# Weighting
df_child_2012['weight'] = df_child_2012['v005'] / 1_000_000

In [None]:
# PNC indicators

df_pnc = df_child_2012[df_child_2012['m70'].notna()].copy()

# Any PNC
df_pnc['pnc_any_flag'] = (df_pnc['m70'] == 1).astype(int)

# PNC ≤48h (pakai coding numeric)
m71 = df_pnc['m71']
mask_hours = m71.between(100, 171)
mask_hours_dk = m71.isin([198, 199])
mask_days = m71.between(200, 202)
df_pnc['pnc48h_flag'] = (mask_hours | mask_hours_dk | mask_days).astype(int)


In [None]:
group_cols = ['v024']  # v024 = province

# Aggregate Mother
prov_mother = (
    df_ibu_2012
    .groupby(group_cols)
    .agg(
        anc4_pct=('anc4_flag', 'mean'),
        facility_delivery_pct=('facility_flag', 'mean'),
        sba_pct=('sba_flag', 'mean'),
        insured_pct=('insured_flag', 'mean'),
        urban_share_pct=('urban_flag', 'mean'),
        low_education_pct=('lowedu_flag', 'mean'),
        risky_maternal_age_pct=('risky_age_flag', 'mean'),
        birth_interval_short_pct=('short_interval_flag', 'mean'),
        avg_parity=('parity', 'mean')
    )
    .reset_index()
)

# Aggregate Child
prov_child = (
    df_child_2012
    .groupby(group_cols)
    .agg(
        lbw_pct=('small_birth_flag', 'mean'),
        full_immun_pct=('full_immun_flag', 'mean')
    )
    .reset_index()
)

# Aggregate PNC
prov_pnc = (
    df_pnc
    .groupby(group_cols)
    .agg(
        pnc_any_pct=('pnc_any_flag', 'mean'),
        pnc48h_pct=('pnc48h_flag', 'mean')
    )
    .reset_index()
)

In [None]:
df_2012 = prov_mother.merge(prov_child, on=['v024'], how='left')
df_2012 = df_2012.merge(prov_pnc, on=['v024'], how='left')

df_2012.rename(columns={'v024': 'region'}, inplace=True)


In [None]:
prov_map = {
    11: "Aceh",
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kepulauan Bangka Belitung",
    21: "Kepulauan Riau",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    65: "Kalimantan Utara",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
    76: "Sulawesi Barat",
    81: "Maluku",
    82: "Maluku Utara",
    91: "Papua",
    92: "Papua Barat",
    94: "Papua Barat"
}

df_2012['province'] = df_2012['region'].map(prov_map)

# cek apakah ada kode yang belum ter-map
unmapped = df_2012[df_2012['province'].isna()]['region'].unique()
print("Unmapped province codes:", unmapped)

Unmapped province codes: []


In [None]:
cols = df_2012.columns.tolist()

new_order = ['region', 'province'] + [c for c in cols if c not in ['region', 'province']]

# Reorder dataframe
df_2012 = df_2012[new_order]

In [None]:
# ============================================================================
# FINAL DATA QUALITY SUMMARY
# ============================================================================
print("="*80)
print("FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS")
print("="*80)

# 1. Basic dataset info
print(f"\nTotal observations        : {len(df_2012):,}")
print(f"Unique provinces          : {df_2012['region'].nunique()}")
print(f"Total indicators          : {df_2012.shape[1] - 2} (excluding region/year)")

# 2. Missing check
missing = df_2012.isna().sum()
if missing.sum() == 0:
    print("\nMissing values            : 0")
else:
    print("\nMissing values:")
    print(missing[missing > 0])

# 3. Indicator ranges (auto-detect pct columns)
print("\nIndicator Ranges by Province:")
print("-" * 80)

indicator_cols = [
    col for col in df_2012.columns
    if col not in ['region', 'province'] # Exclude 'province' column from numerical checks
]

for col in indicator_cols:
    col_min = df_2012[col].min()
    col_max = df_2012[col].max()

    # convert to percentage if value seems 0-1
    if col_max <= 1.5:
        print(f"{col:30s}: {col_min*100:6.1f}% – {col_max*100:6.1f}%")
    else:
        print(f"{col:30s}: {col_min:6.2f} – {col_max:6.2f}")

# 4. Quick descriptive stats
print("\nQuick Descriptive Stats:")
print("-" * 80)
print(df_2012[indicator_cols].describe().T[['mean','std','min','max']])

print("\nData preview:")
print(df_2012.head())

print("\nPROCESSING COMPLETE ✔")
print("="*80)


FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS

Total observations        : 33
Unique provinces          : 33
Total indicators          : 13 (excluding region/year)

Missing values            : 0

Indicator Ranges by Province:
--------------------------------------------------------------------------------
anc4_pct                      :   41.2% –   97.2%
facility_delivery_pct         :   20.9% –   98.4%
sba_pct                       :   46.3% –   98.7%
insured_pct                   :   23.2% –   75.9%
urban_share_pct               :   22.3% –  100.0%
low_education_pct             :   10.4% –   55.3%
risky_maternal_age_pct        :   19.2% –   48.2%
birth_interval_short_pct      :    7.2% –   40.3%
avg_parity                    :   1.80 –   3.23
lbw_pct                       :    5.2% –   24.9%
full_immun_pct                :   34.0% –   79.0%
pnc_any_pct                   :   26.7% –   94.5%
pnc48h_pct                    :   12.7% –   89.0%

Quick Descriptive Stats:
--------------

In [None]:
print("Saving Results")

df_2012.to_csv('dhs_regional_2012.csv', index=False)
print("Files saved successfully!")

Saving Results
Files saved successfully!


## C. Data DHS (2007)



### 1. Load Data

In [None]:
print("Preprocessing Data SDKI 2007...")

ibu_dta_path = 'IDIR51FL.DTA' # File ibu
anak_dta_path = 'IDKR51FL.DTA' # File anak
lahir_dta_path = 'IDBR51FL.DTA' # File Kelahiran

print("\nMembaca file data Stata (.DTA)...")

try:
    df_ibu, meta_ibu = pyreadstat.read_dta(ibu_dta_path)
    df_anak, meta_anak = pyreadstat.read_dta(anak_dta_path)
    df_lahir, meta_lahir = pyreadstat.read_dta(lahir_dta_path)
    print("File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.")
except Exception as e:
    print(f"Error saat membaca file: {e}")
    exit()

df_ibu.columns = df_ibu.columns.str.lower()
df_anak.columns = df_anak.columns.str.lower()
df_lahir.columns = df_lahir.columns.str.lower()

print("\n--- Check Dataset ---")
print("Jumlah kolom di df_ibu:", len(df_ibu.columns))
print("Jumlah kolom di df_anak:", len(df_anak.columns))
print("Jumlah kolom di df_lahir:", len(df_lahir.columns))
print("-------------------------------------------------------")

Preprocessing Data SDKI 2007...

Membaca file data Stata (.DTA)...
File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.

--- Check Dataset ---
Jumlah kolom di df_ibu: 4920
Jumlah kolom di df_anak: 1346
Jumlah kolom di df_lahir: 1346
-------------------------------------------------------


### 2. Check Variable Name

In [None]:
print("\n--- Mencari Variabel Pendidikan ---")
kata_kunci = 'educa'

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci in var.lower() or kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pendidikan ---
Nama Variabel: v106, Label: highest educational level
Nama Variabel: v107, Label: highest year of education
Nama Variabel: v133, Label: education in single years
Nama Variabel: v149, Label: educational attainment
Nama Variabel: awfacte, Label: all woman factor - educational (as used in report)
Nama Variabel: v701, Label: partner's education level
Nama Variabel: v702, Label: highest year of education
Nama Variabel: v715, Label: husband's education-single years
Nama Variabel: v729, Label: partner's educational attainment
Nama Variabel: swfacte, Label: all woman factor - educational (standard dhs breakdown)
---------------------------------


In [None]:
print("\n--- Mencari Variabel Ukuran Bayi Lahir ---")
# meta_anak berisi "kamus" untuk df_anak
kata_kunci_1 = 'size'  # Kata kunci: ukuran
kata_kunci_2 = 'weight' # Kata kunci: berat

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_anak.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Ukuran Bayi Lahir ---
Nama Variabel: v005, Label: sample weight
Nama Variabel: v419, Label: entries in height/weight table
Nama Variabel: v437, Label: na-respondent's weight (kilos-1d)
Nama Variabel: m18, Label: size of child at birth
Nama Variabel: m19, Label: birth weight (kilos - 3 dec.)
Nama Variabel: m19a, Label: weight at birth recall
Nama Variabel: hw2, Label: na-weight in kilograms (1 dec.)
Nama Variabel: s818k, Label: male sti symptoms: loss of weight
Nama Variabel: s819k, Label: female sti symptoms: loss of weight
---------------------------------


In [None]:
print("\n--- Mencari Variabel Penolong Persalinan ---")
kata_kunci_1 = 'delivery'
kata_kunci_2 = 'birth'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Penolong Persalinan ---
Nama Variabel: v009, Label: respondent's month of birth
Nama Variabel: v010, Label: respondent's year of birth
Nama Variabel: v011, Label: date of birth (cmc)
Nama Variabel: bidx_01, Label: birth column number
Nama Variabel: bidx_02, Label: birth column number
Nama Variabel: bidx_03, Label: birth column number
Nama Variabel: bidx_04, Label: birth column number
Nama Variabel: bidx_05, Label: birth column number
Nama Variabel: bidx_06, Label: birth column number
Nama Variabel: bidx_07, Label: birth column number
Nama Variabel: bidx_08, Label: birth column number
Nama Variabel: bidx_09, Label: birth column number
Nama Variabel: bidx_10, Label: birth column number
Nama Variabel: bidx_11, Label: birth column number
Nama Variabel: bidx_12, Label: birth column number
Nama Variabel: bidx_13, Label: birth column number
Nama Variabel: bidx_14, Label: birth column number
Nama Variabel: bidx_15, Label: birth column number
Nama Variabel: bidx_16, Label:

In [None]:
print("\n--- Mencari Semua Variabel Bantuan Persalinan ---")
kata_kunci = 'assistance'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita hanya cari di variabel yang berawalan 'm3' untuk menjaga relevansi
    if var.startswith('m3') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Semua Variabel Bantuan Persalinan ---
Nama Variabel: m3a_1, Label: assistance: doctor
Nama Variabel: m3a_2, Label: assistance: doctor
Nama Variabel: m3a_3, Label: assistance: doctor
Nama Variabel: m3a_4, Label: assistance: doctor
Nama Variabel: m3a_5, Label: assistance: doctor
Nama Variabel: m3a_6, Label: assistance: doctor
Nama Variabel: m3b_1, Label: assistance: nurse
Nama Variabel: m3b_2, Label: assistance: nurse
Nama Variabel: m3b_3, Label: assistance: nurse
Nama Variabel: m3b_4, Label: assistance: nurse
Nama Variabel: m3b_5, Label: assistance: nurse
Nama Variabel: m3b_6, Label: assistance: nurse
Nama Variabel: m3c_1, Label: assistance: village midwife
Nama Variabel: m3c_2, Label: assistance: village midwife
Nama Variabel: m3c_3, Label: assistance: village midwife
Nama Variabel: m3c_4, Label: assistance: village midwife
Nama Variabel: m3c_5, Label: assistance: village midwife
Nama Variabel: m3c_6, Label: assistance: village midwife
Nama Variabel: m3d_1, Label: assistan

In [None]:
print("\n--- Mencari Variabel Pemeriksaan Nifas Ibu ---")
kata_kunci_1 = 'postnatal'

for var, label in meta_anak.column_names_to_labels.items():
    if kata_kunci_1 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Nifas Ibu ---
Nama Variabel: m70, Label: baby postnatal check within 2 months
Nama Variabel: m71, Label: how long after delivery postnatal check took place
Nama Variabel: m72, Label: who performed postnatal checkup
Nama Variabel: s429, Label: child received postnatal check by health professional or tba
Nama Variabel: s429a, Label: timing of first postnatal check
Nama Variabel: s431, Label: person who did first postnatal check
Nama Variabel: s432, Label: place of first postnatal check
---------------------------------


In [None]:
# --- KODE DETEKTIF: Cari kata kunci "checkup" ---
print("\n--- Mencari Variabel Pemeriksaan Ibu (checkup) ---")
kata_kunci = 'checkup'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita cari di variabel 'm' (maternity)
    if var.startswith('m') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Ibu (checkup) ---
Nama Variabel: m51a_1, Label: na-respondent checkup after deliver timing
Nama Variabel: m51a_2, Label: na-respondent checkup after deliver timing
Nama Variabel: m51a_3, Label: na-respondent checkup after deliver timing
Nama Variabel: m51a_4, Label: na-respondent checkup after deliver timing
Nama Variabel: m51a_5, Label: na-respondent checkup after deliver timing
Nama Variabel: m51a_6, Label: na-respondent checkup after deliver timing
Nama Variabel: m72_1, Label: who performed postnatal checkup
Nama Variabel: m72_2, Label: who performed postnatal checkup
Nama Variabel: m72_3, Label: who performed postnatal checkup
Nama Variabel: m72_4, Label: who performed postnatal checkup
Nama Variabel: m72_5, Label: who performed postnatal checkup
Nama Variabel: m72_6, Label: who performed postnatal checkup
---------------------------------


In [None]:
# Cari variabel Asuransi
# --- KODE DETEKTIF: Cari variabel Asuransi dengan kata kunci baru ---
print("\n--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---")

kata_kunci_list = [ 'jaminan', 'bpjs', 'insurance', 'asuransi'] # Kita coba beberapa kata kunci

for var, label in meta_ibu.column_names_to_labels.items():
    for kata_kunci in kata_kunci_list:
        if kata_kunci in label.lower():
            print(f"Nama Variabel: {var}, Label: {label}")
            break # Hentikan jika sudah ketemu agar tidak duplikat

print("---------------------------------")


--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---
Nama Variabel: v481, Label: na-covered by health insurance
Nama Variabel: v481a, Label: na-health insurance type: mutual/community organization
Nama Variabel: v481b, Label: na-health insurance type: provided by employer
Nama Variabel: v481c, Label: na-health insurance type: social security
Nama Variabel: v481d, Label: na-health insurance type: private/commercial purchased
Nama Variabel: v481e, Label: na-health insurance type: cs
Nama Variabel: v481f, Label: na-health insurance type: cs
Nama Variabel: v481g, Label: na-health insurance type: cs
Nama Variabel: v481h, Label: na-health insurance type: cs
Nama Variabel: v481x, Label: na-health insurance type: other
---------------------------------


### 3. Extracting

#### 3.1 Define Required Variables

In [None]:
print("Defining Required Variables")
print("="*70)

# Mother-level variables
ibu_vars_needed = [
    'caseid',           # Unique case identifier
    'v001',             # Cluster number
    'v002',             # Household number
    'v024',             # Region/Province
    'v025',             # Type of residence (1=Urban, 2=Rural)
    'v106',             # Highest education level
    'v201',             # Total children ever born (v201)
    'v212',             # Age of respondent
    'm14_1',            # Number of ANC visits for last birth
    'm15_1',            # Place of delivery (codes 20+ = health facility)
    'v481',             # Health insurance coverage (1=Yes)
    'v208',             # Births in last five years
    'm70_1',            # Baby postnatal check within 2 months
    'm71_1',            # Time after delivery postnatal check took place
    'v005',             # Weighted
] + [f'm3{c}_1' for c in 'abcdef']

# Child-level variables
anak_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'm18',              # Size of child at birth (1-3=Normal/Large, 4-5=Small)
    'h2',               # BCG vaccination (1=Yes)
    'h4',               # DPT1 vaccination (1=Yes)
    'h6',               # Polio1 vaccination (1=Yes)
    'h8',               # Measles vaccination (1=Yes)
    'v008',             # Interview Date (to calculate age)
    'b3',               # Date of birth of child (to calculate age)
    's429',             # Postnatal check from health professional / TBA
    's429a',            # Timing of first postnatal check
    'v005',             # Weighted
]

# Birth history variables
lahir_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'b11',               # Preceding birth interval
    'bord'              # birth order
]

print(f"Mother variables needed: {len(ibu_vars_needed)}")
print(f"Child variables needed: {len(anak_vars_needed)}")
print(f"Birth history variables needed: {len(lahir_vars_needed)}")

Defining Required Variables
Mother variables needed: 21
Child variables needed: 12
Birth history variables needed: 4


#### 3.2 Check Data Availability & Quality

In [None]:
# ============================================================================
# VALIDATE DATASET AVAILABILITY
# ============================================================================
print("Validating Variable Availability")
print("-"*70)

def check_missing_variables(df, var_list, dataset_name):
    """Check if required variables exist in dataset"""
    missing = [v for v in var_list if v not in df.columns]
    if missing:
        print(f"\nWARNING - {dataset_name}:")
        print(f"   Missing {len(missing)} variables: {missing[:10]}")  # Show first 10
        if len(missing) > 10:
            print(f"   ... and {len(missing)-10} more")
        return False
    else:
        print(f"✓ {dataset_name}: All {len(var_list)} variables found")
        return True

# Validate each dataset
ibu_valid = check_missing_variables(df_ibu, ibu_vars_needed, "Mother Dataset")
anak_valid = check_missing_variables(df_anak, anak_vars_needed, "Child Dataset")
lahir_valid = check_missing_variables(df_lahir, lahir_vars_needed, "Birth Dataset")

if not all([ibu_valid, anak_valid, lahir_valid]):
    print("\nSome variables are missing. Please check your dataset!")

Validating Variable Availability
----------------------------------------------------------------------
✓ Mother Dataset: All 21 variables found
✓ Child Dataset: All 12 variables found
✓ Birth Dataset: All 4 variables found


In [None]:
# ============================================================================
# SELECT AND COPY VARIABLES
# ============================================================================
print("Selecting Required Variables")
print("="*70)

df_ibu_clean = df_ibu[ibu_vars_needed].copy()
df_anak_clean = df_anak[anak_vars_needed].copy()
df_lahir_clean = df_lahir[lahir_vars_needed].copy()

print(f"✓ Mother data: {df_ibu_clean.shape[0]:,} rows, {df_ibu_clean.shape[1]} columns")
print(f"✓ Child data: {df_anak_clean.shape[0]:,} rows, {df_anak_clean.shape[1]} columns")
print(f"✓ Birth data: {df_lahir_clean.shape[0]:,} rows, {df_lahir_clean.shape[1]} columns")

Selecting Required Variables
✓ Mother data: 32,895 rows, 21 columns
✓ Child data: 18,645 rows, 12 columns
✓ Birth data: 84,726 rows, 4 columns


In [None]:
# ============================================================================
# DATA CLEANING
# ============================================================================
print("Data Quality Checks & Cleaning")
print("="*70)

def handle_dhs_special_codes(df, var, missing_codes=[98, 99], method='nan'):
    """
    Handle DHS special codes (Don't Know, Missing, Not Applicable)

    Parameters:
    - df: DataFrame
    - var: Variable name
    - missing_codes: List of codes to treat as missing
    - method: 'nan' or 'zero'
    """
    if var in df.columns:
        original_missing = df[var].isna().sum()
        df[var] = df[var].replace(missing_codes, np.nan)
        new_missing = df[var].isna().sum()
        if new_missing > original_missing:
            print(f"  • {var}: Cleaned {new_missing - original_missing:,} special codes → NaN")
    return df

# Clean Mother data
print("\nCleaning Mother Dataset:")
# Corrected calls for handle_dhs_special_codes
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm14_1', missing_codes=[97, 98, 99])  # ANC visits
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm15_1', missing_codes=[97, 98, 99])  # Delivery Places
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v212', missing_codes=[98, 99])  # Age
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v106', missing_codes=[8, 9]) # Education
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v481', missing_codes=[8, 9]) # Insurance
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'b11_01', missing_codes=[98, 99]) # Birth weight
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm70_1', [8,9]) # Postnatal Check
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm71_1', [998, 999]) # Time after delivery

# Clean Child data
print("\nCleaning Child Dataset:")
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm18', missing_codes=[8, 9, 98, 99])  # Birth size
# Assuming immunization variables h2, h4, h6, h8 also need cleaning for 8, 9
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h2', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h4', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h6', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h8', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 's429a', [8,9,98,99,998,999])


required_cols = ['v008', 'b3']
missing_cols = [c for c in required_cols if c not in df_anak_clean.columns]
if missing_cols:
    raise ValueError(f"Missing required columns for age calculation: {missing_cols}")

# Hitung umur dalam bulan
df_anak_clean['age_months'] = df_anak_clean['v008'] - df_anak_clean['b3']

# Bersihkan nilai-nilai tidak valid
df_anak_clean.loc[df_anak_clean['age_months'] < 0, 'age_months'] = None  # impossible
df_anak_clean.loc[df_anak_clean['age_months'] > 200, 'age_months'] = None  # upper sanity bound

print("=== AGE MONTHS CHECK ===")
print(df_anak_clean['age_months'].describe())
print("Invalid ages:", df_anak_clean['age_months'].isna().sum())

# Clean Birth data
print("\nCleaning Birth Dataset:")
df_lahir_clean = handle_dhs_special_codes(df_lahir_clean, 'b11', missing_codes=[98, 99])  # Birth interval


Data Quality Checks & Cleaning

Cleaning Mother Dataset:
  • m14_1: Cleaned 174 special codes → NaN
  • m15_1: Cleaned 50 special codes → NaN
  • v106: Cleaned 3 special codes → NaN
  • m70_1: Cleaned 97 special codes → NaN
  • m71_1: Cleaned 53 special codes → NaN

Cleaning Child Dataset:
  • m18: Cleaned 1,151 special codes → NaN
  • h2: Cleaned 181 special codes → NaN
  • h4: Cleaned 182 special codes → NaN
  • h6: Cleaned 302 special codes → NaN
  • h8: Cleaned 302 special codes → NaN
  • s429a: Cleaned 41 special codes → NaN
=== AGE MONTHS CHECK ===
count    18645.000000
mean        29.328345
std         17.392878
min          0.000000
25%         14.000000
50%         29.000000
75%         45.000000
max         59.000000
Name: age_months, dtype: float64
Invalid ages: 0

Cleaning Birth Dataset:
  • b11: Cleaned 269 special codes → NaN


In [None]:
# ============================================================================
# FILTER TO APPROPRIATE SAMPLE
# ============================================================================
print("="*70)
print("Sample Selection")
print("="*70)

print(f"Original sample: {len(df_ibu_clean):,} mothers")

# m14_1 is only asked for most recent birth in last 5 years
df_ibu_analysis = df_ibu_clean[df_ibu_clean['v208'] > 0].copy()
print(f"Analysis sample: {len(df_ibu_analysis):,} mothers with recent births")
print(f"Filtered out: {len(df_ibu_clean) - len(df_ibu_analysis):,} mothers without recent births")

df_anak_analysis = df_anak_clean[df_anak_clean['age_months'].between(0, 23)].copy()
print(f"Analysis child sample: {len(df_anak_analysis):,}")
print(f"Filtered out: {len(df_anak_clean) - len(df_anak_analysis):,}")

Sample Selection
Original sample: 32,895 mothers
Analysis sample: 15,334 mothers with recent births
Filtered out: 17,561 mothers without recent births
Analysis child sample: 7,508
Filtered out: 11,137


In [None]:
# Show missing data summary
print("\nMissing Data Summary:")
print("-" * 50)
key_vars = ['m14_1', 'm15_1', 'v212', 'v481', 'm18', 'v220']
for var in key_vars:
    for df, name in [(df_ibu_clean, 'ibu'), (df_anak_clean, 'anak'), (df_lahir_clean, 'lahir')]:
        if var in df.columns:
            missing_pct = (df[var].isna().sum() / len(df)) * 100
            print(f"  {name}.{var}: {missing_pct:.1f}% missing")


Missing Data Summary:
--------------------------------------------------
  ibu.m14_1: 53.9% missing
  ibu.m15_1: 53.5% missing
  ibu.v212: 7.5% missing
  ibu.v481: 100.0% missing
  anak.m18: 6.2% missing


#### 3.3 Feature Engineering

In [None]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
print("Feature Engineering")
print("-"*70)

# -----------------------------------------------------------------------------
# A. MATERNAL HEALTH SERVICE INDICATORS
# -----------------------------------------------------------------------------
print("\nA. Creating Maternal Health Service Indicators:")

# Ensure `df_ibu_analysis` is used as the base for feature engineering
df_ibu_analysis = df_ibu_analysis[df_ibu_analysis['m14_1'].notna()].copy()

# 1. ANC 4+ visits (m14_1 >= 4)
df_ibu_analysis['anc4_pct'] = (df_ibu_analysis['m14_1'] >= 4).astype(int)
print(f"4+ ANC visits: {df_ibu_analysis['anc4_pct'].mean()*100:.1f}%")

# 2. Facility delivery (m15_1: 20-36 = health facilities)
df_ibu_analysis['facility_delivery_pct'] = (
    ((df_ibu_analysis['m15_1'] >= 21) & (df_ibu_analysis['m15_1'] <= 27)) |
    ((df_ibu_analysis['m15_1'] >= 31) & (df_ibu_analysis['m15_1'] <= 37))
).astype(int)
print(f"Facility delivery: {df_ibu_analysis['facility_delivery_pct'].mean()*100:.1f}%")

# 3. Skilled Birth Attendant
# m3a_1 = doctor, m3b_1 = nurse/midwife, m3c_1 = auxiliary midwife
sba_cols = [col for col in df_ibu_analysis.columns if col.startswith('m3') and col.endswith('_1')]
excluded_tba = ['m3f_1'] # Not Skilled
sba_cols_filtered = [c for c in sba_cols if c not in excluded_tba]

df_ibu_analysis['sba_pct'] = df_ibu_analysis[sba_cols_filtered].eq(1).any(axis=1).astype(int)
print(f"Skilled Birth Attendant: {df_ibu_analysis['sba_pct'].mean()*100:.1f}%")

# Subset anak usia 0–23 bulan
df_pnc = df_anak_clean[df_anak_clean['age_months'] <= 23].copy()

# 4. Any PNC by health professional / TBA
df_pnc['pnc_any_flag'] = df_pnc['s429'].isin([1]).astype(int)
print(f"Any PNC (age<=23mo): {df_pnc['pnc_any_flag'].mean()*100:.1f}%")

# 5. PNC within 48h (0,1,2 = immediate / <24h / 24–47h)
df_ibu_analysis['pnc48h_flag'] = df_ibu_analysis['m71_1'].isin([201, 202]).astype(int)
print(f"PNC within 2 days (age<=23mo): {df_ibu_analysis['pnc48h_flag'].mean()*100:.1f}%")

# 6. Parity
df_ibu_analysis.rename(columns={'v201': 'parity'}, inplace=True)

# -----------------------------------------------------------------------------
# B. SOCIO-DEMOGRAPHIC INDICATORS
# -----------------------------------------------------------------------------
print("\nB. Creating Socio-Demographic Indicators:")

df_ibu_analysis['insured_pct'] = (df_ibu_analysis['v481'] == 1).astype(int)
print(f"Health insurance: {df_ibu_analysis['insured_pct'].mean()*100:.1f}%")

df_ibu_analysis['urban_share_pct'] = (df_ibu_analysis['v025'] == 1).astype(int)
print(f"Urban residence: {df_ibu_analysis['urban_share_pct'].mean()*100:.1f}%")

df_ibu_analysis['low_education_pct'] = (df_ibu_analysis['v106'] <= 1).astype(int)
print(f"Low education (\u2264Primary): {df_ibu_analysis['low_education_pct'].mean()*100:.1f}%")

# -----------------------------------------------------------------------------
# C. RISK FACTORS
# -----------------------------------------------------------------------------
print("\nC. Creating Risk Factor Indicators:")

df_ibu_analysis['maternal_age_risky_pct'] = (
    (df_ibu_analysis['v212'] < 20) | (df_ibu_analysis['v212'] >= 35)
).astype(int)
print(f"Risky maternal age: {df_ibu_analysis['maternal_age_risky_pct'].mean()*100:.1f}%")

# Birth interval (from birth recode)
mask_multibirth = df_lahir_clean['bord'] > 1

df_lahir_clean['short_interval_flag'] = np.nan
df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'] = (
    df_lahir_clean.loc[mask_multibirth, 'b11'] < 24
).astype(int)

print(f"Short birth interval (<24mo): {df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'].mean()*100:.1f}%")

# Child variables
df_anak_analysis['small_birth_size_pct'] = (
    df_anak_analysis['m18'].isin([4, 5])  # 4=small, 5=very small
).astype(int)
print(f"Small birth size: {df_anak_analysis['small_birth_size_pct'].mean()*100:.1f}%")

valid_yes = [1, 2, 3]
mask_12_23 = df_anak_clean['age_months'].between(12, 23)

# Default: NaN (anak di luar 12–23 bulan tidak ikut denominator)
# Full immunization (h2=BCG, h4=DPT1, h6=Polio3, h8=Measles)
# Code 1/2/3 = Yes (card/recall/marked), 0 = No, 8 = Don't know
df_anak_clean['full_immun_binary'] = np.nan

df_anak_clean.loc[mask_12_23, 'full_immun_binary'] = (
    df_anak_clean.loc[mask_12_23, 'h2'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h4'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h6'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h8'].isin(valid_yes)
).astype(int)

print(
    "Full immunization (12–23 mo, all 4 vaccines): "
    f"{df_anak_clean.loc[mask_12_23, 'full_immun_binary'].mean()*100:.1f}%"
)

Feature Engineering
----------------------------------------------------------------------

A. Creating Maternal Health Service Indicators:
4+ ANC visits: 78.4%
Facility delivery: 38.3%
Skilled Birth Attendant: 71.4%
Any PNC (age<=23mo): 82.8%
PNC within 2 days (age<=23mo): 20.7%

B. Creating Socio-Demographic Indicators:
Health insurance: 0.0%
Urban residence: 38.7%
Low education (≤Primary): 43.1%

C. Creating Risk Factor Indicators:
Risky maternal age: 36.5%
Short birth interval (<24mo): 24.6%
Small birth size: 15.0%
Full immunization (12–23 mo, all 4 vaccines): 64.1%


#### 3.4 Merging

In [None]:
# Mother-level indicators

df_ibu_2007 = df_ibu_analysis.copy()

# ANC 4+
df_ibu_2007['anc4_flag'] = (df_ibu_2007['m14_1'] >= 4).astype(int)

# Facility delivery (20–89 = health facility)
df_ibu_2007['facility_flag'] = df_ibu_2007['m15_1'].between(20, 89).astype(int)

# SBA (pakai loop semua m3*_1 kecuali TBA, misal m3f_1)
sba_cols = [c for c in df_ibu_2007.columns if c.startswith('m3') and c.endswith('_1')]
excluded_tba = ['m3f_1']  # sesuaikan kalau TBA di kode lain
sba_cols = [c for c in sba_cols if c not in excluded_tba]
df_ibu_2007['sba_flag'] = df_ibu_2007[sba_cols].eq(1).any(axis=1).astype(int)

# Insurance
df_ibu_2007['insured_flag'] = (df_ibu_2007['v481'] == 1).astype(int)

# Urban
df_ibu_2007['urban_flag'] = (df_ibu_2007['v025'] == 1).astype(int)

# Low education (≤ primary)
df_ibu_2007['lowedu_flag'] = df_ibu_2007['v106'].isin([0, 1]).astype(int)  # 0 = no edu, 1 = primary

# Risky maternal age (<20 or >=35)
df_ibu_2007['risky_age_flag'] = ((df_ibu_2007['v212'] < 20) | (df_ibu_2007['v212'] >= 35)).astype(int)

# Parity (bisa dipakai langsung mean)
df_ibu_2007['parity'] = df_ibu_2007['parity']

df_ibu_2007['weight'] = df_ibu_2007['v005'] / 1_000_000


In [None]:
df_birth = df_lahir_clean.copy()

# Hanya kelahiran dengan older sibling yang punya interval
df_birth_valid = df_birth[df_birth['bord'] > 1].copy()

df_birth_valid['short_interval_flag'] = (df_birth_valid['b11'] < 24).astype(int)

# Aggregate ke ibu: jika ada short interval pada salah satu kelahiran → ibu berisiko
df_interval_mother = (
    df_birth_valid
    .groupby('caseid')['short_interval_flag']
    .max()
    .reset_index()
)

# Merge ke ibu
df_ibu_2007 = df_ibu_2007.merge(df_interval_mother, on='caseid', how='left')

# Ibu yang hanya punya 1 anak (tidak punya interval) = NA → bukan risiko, assign 0
df_ibu_2007['short_interval_flag'] = df_ibu_2007['short_interval_flag'].fillna(0).astype(int)

In [None]:
# Child-level indicators

df_child_2007 = df_anak_analysis.copy()

# LBW proxy: small_birth_size (m18: 4/5 = small/very small, 1–3 = average/large)
df_child_2007['small_birth_flag'] = df_child_2007['m18'].isin([4, 5]).astype(int)

# Immunization flags (anggap 1 = received, 2/3 juga treated as received kalau ada)
for col in ['h2', 'h4', 'h6', 'h8']:
    df_child_2007[col + '_flag'] = df_child_2007[col].isin([1, 2, 3]).astype(int)

# Full immunization (semua 4 vaksin diterima)
imm_cols = ['h2_flag', 'h4_flag', 'h6_flag', 'h8_flag']
df_child_2007['full_immun_flag'] = df_child_2007[imm_cols].all(axis=1).astype(int)

In [None]:
# PNC indicators

# Subset hanya ibu yang punya data PNC (m70_1 atau m71_1)
df_pnc = df_ibu_clean[df_ibu_clean['m70_1'].notna() | df_ibu_clean['m71_1'].notna()].copy()

# Any PNC
df_pnc['pnc_any_flag'] = (df_pnc['m70_1'] == 1).astype(int)

# PNC ≤48h (pakai coding numeric)
df_pnc['pnc48h_flag'] = df_pnc['m71_1'].isin([201, 202]).astype(int)



In [None]:
group_cols = ['v024']  # v024 = province

# Aggregate Mother
prov_mother = (
    df_ibu_2007
    .groupby(group_cols)
    .agg(
        anc4_pct=('anc4_flag', 'mean'),
        facility_delivery_pct=('facility_flag', 'mean'),
        sba_pct=('sba_flag', 'mean'),
        insured_pct=('insured_flag', 'mean'),
        urban_share_pct=('urban_flag', 'mean'),
        low_education_pct=('lowedu_flag', 'mean'),
        risky_maternal_age_pct=('risky_age_flag', 'mean'),
        birth_interval_short_pct=('short_interval_flag', 'mean'),
        avg_parity=('parity', 'mean')
    )
    .reset_index()
)

# Aggregate Child
prov_child = (
    df_child_2007
    .groupby(group_cols)
    .agg(
        lbw_pct=('small_birth_flag', 'mean'),
        full_immun_pct=('full_immun_flag', 'mean')
    )
    .reset_index()
)

# Aggregate PNC
prov_pnc = (
    df_pnc
    .groupby(group_cols)
    .agg(
        pnc_any_pct=('pnc_any_flag', 'mean'),
        pnc48h_pct=('pnc48h_flag', 'mean')
    )
    .reset_index()
)

In [None]:
df_2007 = prov_mother.merge(prov_child, on=['v024'], how='left')
df_2007 = df_2007.merge(prov_pnc, on=['v024'], how='left')

df_2007.rename(columns={'v024': 'region'}, inplace=True)


In [None]:
prov_map = {
    11: "Aceh",
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kepulauan Bangka Belitung",
    21: "Kepulauan Riau",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    65: "Kalimantan Utara",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
    76: "Sulawesi Barat",
    81: "Maluku",
    82: "Maluku Utara",
    91: "Papua",
    92: "Papua Barat",
    94: "Papua Barat"
}

df_2007['province'] = df_2007['region'].map(prov_map)

# cek apakah ada kode yang belum ter-map
unmapped = df_2007[df_2007['province'].isna()]['region'].unique()
print("Unmapped province codes:", unmapped)

Unmapped province codes: []


In [None]:
cols = df_2007.columns.tolist()

new_order = ['region', 'province'] + [c for c in cols if c not in ['region', 'province']]

# Reorder dataframe
df_2007 = df_2007[new_order]

In [None]:
# ============================================================================
# FINAL DATA QUALITY SUMMARY
# ============================================================================
print("="*80)
print("FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS")
print("="*80)

# 1. Basic dataset info
print(f"\nTotal observations        : {len(df_2007):,}")
print(f"Unique provinces          : {df_2007['region'].nunique()}")
print(f"Total indicators          : {df_2007.shape[1] - 2} (excluding region/year)")

# 2. Missing check
missing = df_2007.isna().sum()
if missing.sum() == 0:
    print("\nMissing values            : 0")
else:
    print("\nMissing values:")
    print(missing[missing > 0])

# 3. Indicator ranges (auto-detect pct columns)
print("\nIndicator Ranges by Province:")
print("-" * 80)

indicator_cols = [
    col for col in df_2007.columns
    if col not in ['region', 'province'] # Exclude 'province' column from numerical checks
]

for col in indicator_cols:
    col_min = df_2007[col].min()
    col_max = df_2007[col].max()

    # convert to percentage if value seems 0-1
    if col_max <= 1.5:
        print(f"{col:30s}: {col_min*100:6.1f}% – {col_max*100:6.1f}%")
    else:
        print(f"{col:30s}: {col_min:6.2f} – {col_max:6.2f}")

# 4. Quick descriptive stats
print("\nQuick Descriptive Stats:")
print("-" * 80)
print(df_2007[indicator_cols].describe().T[['mean','std','min','max']])

print("\nData preview:")
print(df_2007.head())

print("\nPROCESSING COMPLETE ✔")
print("="*80)


FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS

Total observations        : 33
Unique provinces          : 33
Total indicators          : 13 (excluding region/year)

Missing values            : 0

Indicator Ranges by Province:
--------------------------------------------------------------------------------
anc4_pct                      :   50.6% –   97.7%
facility_delivery_pct         :    9.0% –   92.1%
sba_pct                       :   39.6% –   96.8%
insured_pct                   :    0.0% –    0.0%
urban_share_pct               :   13.2% –  100.0%
low_education_pct             :   18.1% –   62.1%
risky_maternal_age_pct        :   18.9% –   51.4%
birth_interval_short_pct      :    8.6% –   46.0%
avg_parity                    :   1.94 –   3.39
lbw_pct                       :    5.7% –   26.3%
full_immun_pct                :   23.0% –   75.3%
pnc_any_pct                   :   49.4% –   99.7%
pnc48h_pct                    :    6.4% –   61.6%

Quick Descriptive Stats:
--------------

In [None]:
print("Saving Results")

df_2007.to_csv('dhs_regional_2007.csv', index=False)
print("Files saved successfully!")

Saving Results
Files saved successfully!


## D. Data DHS (2002)



### 1. Load Data

In [None]:
print("Preprocessing Data SDKI 2002...")

ibu_dta_path = 'IDIR42FL.DTA' # File ibu
anak_dta_path = 'IDKR42FL.DTA' # File anak
lahir_dta_path = 'IDBR42FL.DTA' # File Kelahiran
rt_dta_path = 'IDHR42FL.DTA' # File House Hold

print("\nMembaca file data Stata (.DTA)...")

try:
    df_ibu, meta_ibu = pyreadstat.read_dta(ibu_dta_path)
    df_anak, meta_anak = pyreadstat.read_dta(anak_dta_path)
    df_lahir, meta_lahir = pyreadstat.read_dta(lahir_dta_path)
    df_rt, meta_rt = pyreadstat.read_dta(rt_dta_path)
    print("File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.")
except Exception as e:
    print(f"Error saat membaca file: {e}")
    exit()

df_ibu.columns = df_ibu.columns.str.lower()
df_anak.columns = df_anak.columns.str.lower()
df_lahir.columns = df_lahir.columns.str.lower()
df_rt.columns = df_rt.columns.str.lower()

print("\n--- Check Dataset ---")
print("Jumlah kolom di df_ibu:", len(df_ibu.columns))
print("Jumlah kolom di df_anak:", len(df_anak.columns))
print("Jumlah kolom di df_lahir:", len(df_lahir.columns))
print("Jumlah kolom di df_rt:", len(df_rt.columns))
print("-------------------------------------------------------")

Preprocessing Data SDKI 2002...

Membaca file data Stata (.DTA)...
File Ibu, Rumah Tangga, Kelahiran, & Anak (.DTA) berhasil dibaca.

--- Check Dataset ---
Jumlah kolom di df_ibu: 3833
Jumlah kolom di df_anak: 974
Jumlah kolom di df_lahir: 974
Jumlah kolom di df_rt: 1758
-------------------------------------------------------


### 2. Check Variable Name

In [None]:
print("\n--- Mencari Variabel Pendidikan ---")
kata_kunci = 'educa'

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci in var.lower() or kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pendidikan ---
Nama Variabel: v106, Label: highest educational level
Nama Variabel: v107, Label: highest year of education
Nama Variabel: v133, Label: education in single years
Nama Variabel: v149, Label: educational attainment
Nama Variabel: awfacte, Label: all woman factor - educational
Nama Variabel: v701, Label: partner's education level
Nama Variabel: v702, Label: highest year of education
Nama Variabel: v715, Label: husbands education-single yrs
Nama Variabel: v729, Label: partner's educational attainm.
Nama Variabel: awfacte1, Label: all women factor for education used in the report
Nama Variabel: s108a, Label: highest educational level
---------------------------------


In [None]:
print("\n--- Mencari Variabel Ukuran Bayi Lahir ---")
# meta_anak berisi "kamus" untuk df_anak
kata_kunci_1 = 'size'  # Kata kunci: ukuran
kata_kunci_2 = 'weight' # Kata kunci: berat

# Mencari di dalam nama kolom dan label deskripsinya
for var, label in meta_anak.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Ukuran Bayi Lahir ---
Nama Variabel: v005, Label: sample weight
Nama Variabel: v419, Label: entries in height/weight table
Nama Variabel: v437, Label: respondent's weight (kilos--na
Nama Variabel: m18, Label: size of child at birth
Nama Variabel: m19, Label: birth weight (kilos - 3 dec.)
Nama Variabel: m19a, Label: weight at birth recall
Nama Variabel: hw2, Label: weight in kilograms (1 dec.-na
---------------------------------


In [None]:
print("\n--- Mencari Variabel Penolong Persalinan ---")
kata_kunci_1 = 'delivery'
kata_kunci_2 = 'birth'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower() or kata_kunci_2 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Penolong Persalinan ---
Nama Variabel: v009, Label: respondent's month of birth
Nama Variabel: v010, Label: respondent's year of birth
Nama Variabel: v011, Label: date of birth (cmc)
Nama Variabel: bidx_01, Label: birth column number
Nama Variabel: bidx_02, Label: birth column number
Nama Variabel: bidx_03, Label: birth column number
Nama Variabel: bidx_04, Label: birth column number
Nama Variabel: bidx_05, Label: birth column number
Nama Variabel: bidx_06, Label: birth column number
Nama Variabel: bidx_07, Label: birth column number
Nama Variabel: bidx_08, Label: birth column number
Nama Variabel: bidx_09, Label: birth column number
Nama Variabel: bidx_10, Label: birth column number
Nama Variabel: bidx_11, Label: birth column number
Nama Variabel: bidx_12, Label: birth column number
Nama Variabel: bidx_13, Label: birth column number
Nama Variabel: bidx_14, Label: birth column number
Nama Variabel: bidx_15, Label: birth column number
Nama Variabel: bidx_16, Label:

In [None]:
print("\n--- Mencari Semua Variabel Bantuan Persalinan ---")
kata_kunci = 'assistance'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita hanya cari di variabel yang berawalan 'm3' untuk menjaga relevansi
    if var.startswith('m3') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Semua Variabel Bantuan Persalinan ---
Nama Variabel: m3a_1, Label: assistance: doctor
Nama Variabel: m3a_2, Label: assistance: doctor
Nama Variabel: m3a_3, Label: assistance: doctor
Nama Variabel: m3a_4, Label: assistance: doctor
Nama Variabel: m3a_5, Label: assistance: doctor
Nama Variabel: m3a_6, Label: assistance: doctor
Nama Variabel: m3b_1, Label: assistance: nurse/midwife
Nama Variabel: m3b_2, Label: assistance: nurse/midwife
Nama Variabel: m3b_3, Label: assistance: nurse/midwife
Nama Variabel: m3b_4, Label: assistance: nurse/midwife
Nama Variabel: m3b_5, Label: assistance: nurse/midwife
Nama Variabel: m3b_6, Label: assistance: nurse/midwife
Nama Variabel: m3c_1, Label: assistance: auxiliary midwi-na
Nama Variabel: m3c_2, Label: assistance: auxiliary midwi-na
Nama Variabel: m3c_3, Label: assistance: auxiliary midwi-na
Nama Variabel: m3c_4, Label: assistance: auxiliary midwi-na
Nama Variabel: m3c_5, Label: assistance: auxiliary midwi-na
Nama Variabel: m3c_6, Label: as

In [None]:
print("\n--- Mencari Variabel Pemeriksaan Nifas Ibu ---")
kata_kunci_1 = 'postnatal'

for var, label in meta_ibu.column_names_to_labels.items():
    if kata_kunci_1 in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Nifas Ibu ---
---------------------------------


In [None]:
print("\n--- Mencari Variabel Pemeriksaan Ibu (checkup) ---")
kata_kunci = 'checkup'

for var, label in meta_ibu.column_names_to_labels.items():
    # Kita cari di variabel 'm' (maternity)
    if var.startswith('m') and kata_kunci in label.lower():
        print(f"Nama Variabel: {var}, Label: {label}")

print("---------------------------------")


--- Mencari Variabel Pemeriksaan Ibu (checkup) ---
Nama Variabel: m51_1, Label: checkup after deliver timing
Nama Variabel: m51_2, Label: checkup after deliver timing
Nama Variabel: m51_3, Label: checkup after deliver timing
Nama Variabel: m51_4, Label: checkup after deliver timing
Nama Variabel: m51_5, Label: checkup after deliver timing
Nama Variabel: m51_6, Label: checkup after deliver timing
Nama Variabel: m53_1, Label: place for checkup
Nama Variabel: m53_2, Label: place for checkup
Nama Variabel: m53_3, Label: place for checkup
Nama Variabel: m53_4, Label: place for checkup
Nama Variabel: m53_5, Label: place for checkup
Nama Variabel: m53_6, Label: place for checkup
---------------------------------


In [None]:
# Cari variabel Asuransi
print("\n--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---")

kata_kunci_list = [ 'jaminan', 'bpjs', 'insurance', 'asuransi'] # Kita coba beberapa kata kunci

for var, label in meta_rt.column_names_to_labels.items():
    for kata_kunci in kata_kunci_list:
        if kata_kunci in label.lower():
            print(f"Nama Variabel: {var}, Label: {label}")
            break # Hentikan jika sudah ketemu agar tidak duplikat

print("---------------------------------")


--- Mencari Variabel Asuransi (Pencarian Lanjutan) ---
---------------------------------


### 3. Extracting

#### 3.1 Define Required Variables

In [None]:
print("Defining Required Variables")
print("="*70)

# Mother-level variables
ibu_vars_needed = [
    'caseid',           # Unique case identifier
    'v001',             # Cluster number
    'v002',             # Household number
    'v024',             # Region/Province
    'v025',             # Type of residence (1=Urban, 2=Rural)
    'v106',             # Highest education level
    'v201',             # Total children ever born (v201)
    'v212',             # Age of respondent
    'm14_1',            # Number of ANC visits for last birth
    'm15_1',            # Place of delivery (codes 20+ = health facility)
    'v208',             # Births in last five years
    'v005',             # Weighted
] + [f'm3{c}_1' for c in 'abcdef']

# Child-level variables
anak_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'm18',              # Size of child at birth (1-3=Normal/Large, 4-5=Small)
    'h2',               # BCG vaccination (1=Yes)
    'h4',               # DPT1 vaccination (1=Yes)
    'h6',               # Polio1 vaccination (1=Yes)
    'h8',               # Measles vaccination (1=Yes)
    'v008',             # Interview Date (to calculate age)
    'b3',               # Date of birth of child (to calculate age)
    'v005'              # Weighted - Added to anak_vars_needed
]

# Birth history variables
lahir_vars_needed = [
    'caseid',           # Links to mother
    'v024',             # Region/Province
    'b11',               # Preceding birth interval
    'bord'              # birth order
]

print(f"Mother variables needed: {len(ibu_vars_needed)}")
print(f"Child variables needed: {len(anak_vars_needed)}")
print(f"Birth history variables needed: {len(lahir_vars_needed)}")

Defining Required Variables
Mother variables needed: 18
Child variables needed: 10
Birth history variables needed: 4


#### 3.2 Check Data Availability & Quality

In [None]:
# ============================================================================
# VALIDATE DATASET AVAILABILITY
# ============================================================================
print("Validating Variable Availability")
print("-"*70)

def check_missing_variables(df, var_list, dataset_name):
    """Check if required variables exist in dataset"""
    missing = [v for v in var_list if v not in df.columns]
    if missing:
        print(f"\nWARNING - {dataset_name}:")
        print(f"   Missing {len(missing)} variables: {missing[:10]}")  # Show first 10
        if len(missing) > 10:
            print(f"   ... and {len(missing)-10} more")
        return False
    else:
        print(f"✓ {dataset_name}: All {len(var_list)} variables found")
        return True

# Validate each dataset
ibu_valid = check_missing_variables(df_ibu, ibu_vars_needed, "Mother Dataset")
anak_valid = check_missing_variables(df_anak, anak_vars_needed, "Child Dataset")
lahir_valid = check_missing_variables(df_lahir, lahir_vars_needed, "Birth Dataset")

if not all([ibu_valid, anak_valid, lahir_valid]):
    print("\nSome variables are missing. Please check your dataset!")

Validating Variable Availability
----------------------------------------------------------------------
✓ Mother Dataset: All 18 variables found
✓ Child Dataset: All 10 variables found
✓ Birth Dataset: All 4 variables found


In [None]:
# ============================================================================
# SELECT AND COPY VARIABLES
# ============================================================================
print("Selecting Required Variables")
print("="*70)

df_ibu_clean = df_ibu[ibu_vars_needed].copy()
df_anak_clean = df_anak[anak_vars_needed].copy()
df_lahir_clean = df_lahir[lahir_vars_needed].copy()

print(f"✓ Mother data: {df_ibu_clean.shape[0]:,} rows, {df_ibu_clean.shape[1]} columns")
print(f"✓ Child data: {df_anak_clean.shape[0]:,} rows, {df_anak_clean.shape[1]} columns")
print(f"✓ Birth data: {df_lahir_clean.shape[0]:,} rows, {df_lahir_clean.shape[1]} columns")

Selecting Required Variables
✓ Mother data: 29,483 rows, 18 columns
✓ Child data: 16,206 rows, 10 columns
✓ Birth data: 79,791 rows, 4 columns


In [None]:
# ============================================================================
# DATA CLEANING
# ============================================================================
print("Data Quality Checks & Cleaning")
print("="*70)

def handle_dhs_special_codes(df, var, missing_codes=[98, 99], method='nan'):
    """
    Handle DHS special codes (Don't Know, Missing, Not Applicable)

    Parameters:
    - df: DataFrame
    - var: Variable name
    - missing_codes: List of codes to treat as missing
    - method: 'nan' or 'zero'
    """
    if var in df.columns:
        original_missing = df[var].isna().sum()
        df[var] = df[var].replace(missing_codes, np.nan)
        new_missing = df[var].isna().sum()
        if new_missing > original_missing:
            print(f"  • {var}: Cleaned {new_missing - original_missing:,} special codes → NaN")
    return df

# Clean Mother data
print("\nCleaning Mother Dataset:")
# Corrected calls for handle_dhs_special_codes
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm14_1', missing_codes=[97, 98, 99])  # ANC visits
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm15_1', missing_codes=[97, 98, 99])  # Delivery Places
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v212', missing_codes=[98, 99])  # Age
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v106', missing_codes=[8, 9]) # Education
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'v481', missing_codes=[8, 9]) # Insurance
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'b11_01', missing_codes=[98, 99]) # Birth weight
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm70_1', [8,9]) # Postnatal Check
df_ibu_clean = handle_dhs_special_codes(df_ibu_clean, 'm71_1', [998, 999]) # Time after delivery

# Clean Child data
print("\nCleaning Child Dataset:")
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'm18', missing_codes=[8, 9, 98, 99])  # Birth size
# Assuming immunization variables h2, h4, h6, h8 also need cleaning for 8, 9
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h2', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h4', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h6', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 'h8', missing_codes=[8, 9])
df_anak_clean = handle_dhs_special_codes(df_anak_clean, 's429a', [8,9,98,99,998,999])


required_cols = ['v008', 'b3']
missing_cols = [c for c in required_cols if c not in df_anak_clean.columns]
if missing_cols:
    raise ValueError(f"Missing required columns for age calculation: {missing_cols}")

# Hitung umur dalam bulan
df_anak_clean['age_months'] = df_anak_clean['v008'] - df_anak_clean['b3']

# Bersihkan nilai-nilai tidak valid
df_anak_clean.loc[df_anak_clean['age_months'] < 0, 'age_months'] = None  # impossible
df_anak_clean.loc[df_anak_clean['age_months'] > 200, 'age_months'] = None  # upper sanity bound

print("=== AGE MONTHS CHECK ===")
print(df_anak_clean['age_months'].describe())
print("Invalid ages:", df_anak_clean['age_months'].isna().sum())

# Clean Birth data
print("\nCleaning Birth Dataset:")
df_lahir_clean = handle_dhs_special_codes(df_lahir_clean, 'b11', missing_codes=[98, 99])  # Birth interval


Data Quality Checks & Cleaning

Cleaning Mother Dataset:
  • m14_1: Cleaned 161 special codes → NaN
  • m15_1: Cleaned 46 special codes → NaN

Cleaning Child Dataset:
  • m18: Cleaned 865 special codes → NaN
  • h2: Cleaned 95 special codes → NaN
  • h4: Cleaned 80 special codes → NaN
  • h6: Cleaned 122 special codes → NaN
  • h8: Cleaned 122 special codes → NaN
=== AGE MONTHS CHECK ===
count    16206.000000
mean        29.017278
std         17.067474
min          0.000000
25%         14.000000
50%         29.000000
75%         43.000000
max         59.000000
Name: age_months, dtype: float64
Invalid ages: 0

Cleaning Birth Dataset:
  • b11: Cleaned 186 special codes → NaN


In [None]:
# ============================================================================
# FILTER TO APPROPRIATE SAMPLE
# ============================================================================
print("="*70)
print("Sample Selection")
print("="*70)

print(f"Original sample: {len(df_ibu_clean):,} mothers")

# m14_1 is only asked for most recent birth in last 5 years
df_ibu_analysis = df_ibu_clean[df_ibu_clean['v208'] > 0].copy()
print(f"Analysis sample: {len(df_ibu_analysis):,} mothers with recent births")
print(f"Filtered out: {len(df_ibu_clean) - len(df_ibu_analysis):,} mothers without recent births")

df_anak_analysis = df_anak_clean[df_anak_clean['age_months'].between(0, 23)].copy()
print(f"Analysis child sample: {len(df_anak_analysis):,}")
print(f"Filtered out: {len(df_anak_clean) - len(df_anak_analysis):,}")

Sample Selection
Original sample: 29,483 mothers
Analysis sample: 13,349 mothers with recent births
Filtered out: 16,134 mothers without recent births
Analysis child sample: 6,451
Filtered out: 9,755


In [None]:
# Show missing data summary
print("\nMissing Data Summary:")
print("-" * 50)
key_vars = ['m14_1', 'm15_1', 'v212', 'v481', 'm18', 'v220']
for var in key_vars:
    for df, name in [(df_ibu_clean, 'ibu'), (df_anak_clean, 'anak'), (df_lahir_clean, 'lahir')]:
        if var in df.columns:
            missing_pct = (df[var].isna().sum() / len(df)) * 100
            print(f"  {name}.{var}: {missing_pct:.1f}% missing")


Missing Data Summary:
--------------------------------------------------
  ibu.m14_1: 55.3% missing
  ibu.m15_1: 54.9% missing
  ibu.v212: 7.3% missing
  anak.m18: 5.3% missing


#### 3.3 Feature Engineering

In [None]:
# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
print("Feature Engineering")
print("-"*70)

# -----------------------------------------------------------------------------
# A. MATERNAL HEALTH SERVICE INDICATORS
# -----------------------------------------------------------------------------
print("\nA. Creating Maternal Health Service Indicators:")

# Ensure `df_ibu_analysis` is used as the base for feature engineering
df_ibu_analysis = df_ibu_analysis[df_ibu_analysis['m14_1'].notna()].copy()

# 1. ANC 4+ visits (m14_1 >= 4)
df_ibu_analysis['anc4_pct'] = (df_ibu_analysis['m14_1'] >= 4).astype(int)
print(f"4+ ANC visits: {df_ibu_analysis['anc4_pct'].mean()*100:.1f}%")

# 2. Facility delivery (m15_1: 20-36 = health facilities)
df_ibu_analysis['facility_delivery_pct'] = (
    ((df_ibu_analysis['m15_1'] >= 21) & (df_ibu_analysis['m15_1'] <= 27)) |
    ((df_ibu_analysis['m15_1'] >= 31) & (df_ibu_analysis['m15_1'] <= 37))
).astype(int)
print(f"Facility delivery: {df_ibu_analysis['facility_delivery_pct'].mean()*100:.1f}%")

# 3. Skilled Birth Attendant
# m3a_1 = doctor, m3b_1 = nurse/midwife, m3c_1 = auxiliary midwife
sba_cols = [col for col in df_ibu_analysis.columns if col.startswith('m3') and col.endswith('_1')]
excluded_tba = ['m3f_1'] # Not Skilled
sba_cols_filtered = [c for c in sba_cols if c not in excluded_tba]

df_ibu_analysis['sba_pct'] = df_ibu_analysis[sba_cols_filtered].eq(1).any(axis=1).astype(int)
print(f"Skilled Birth Attendant: {df_ibu_analysis['sba_pct'].mean()*100:.1f}%")

# 6. Parity
df_ibu_analysis.rename(columns={'v201': 'parity'}, inplace=True)

# -----------------------------------------------------------------------------
# B. SOCIO-DEMOGRAPHIC INDICATORS
# -----------------------------------------------------------------------------
print("\nB. Creating Socio-Demographic Indicators:")

df_ibu_analysis['urban_share_pct'] = (df_ibu_analysis['v025'] == 1).astype(int)
print(f"Urban residence: {df_ibu_analysis['urban_share_pct'].mean()*100:.1f}%")

df_ibu_analysis['low_education_pct'] = (df_ibu_analysis['v106'] <= 1).astype(int)
print(f"Low education (\u2264Primary): {df_ibu_analysis['low_education_pct'].mean()*100:.1f}%")

# -----------------------------------------------------------------------------
# C. RISK FACTORS
# -----------------------------------------------------------------------------
print("\nC. Creating Risk Factor Indicators:")

df_ibu_analysis['maternal_age_risky_pct'] = (
    (df_ibu_analysis['v212'] < 20) | (df_ibu_analysis['v212'] >= 35)
).astype(int)
print(f"Risky maternal age: {df_ibu_analysis['maternal_age_risky_pct'].mean()*100:.1f}%")

# Birth interval (from birth recode)
mask_multibirth = df_lahir_clean['bord'] > 1

df_lahir_clean['short_interval_flag'] = np.nan
df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'] = (
    df_lahir_clean.loc[mask_multibirth, 'b11'] < 24
).astype(int)

print(f"Short birth interval (<24mo): {df_lahir_clean.loc[mask_multibirth, 'short_interval_flag'].mean()*100:.1f}%")

# Child variables
df_anak_analysis['small_birth_size_pct'] = (
    df_anak_analysis['m18'].isin([4, 5])  # 4=small, 5=very small
).astype(int)
print(f"Small birth size: {df_anak_analysis['small_birth_size_pct'].mean()*100:.1f}%")

valid_yes = [1, 2, 3]
mask_12_23 = df_anak_clean['age_months'].between(12, 23)

# Default: NaN (anak di luar 12–23 bulan tidak ikut denominator)
# Full immunization (h2=BCG, h4=DPT1, h6=Polio3, h8=Measles)
# Code 1/2/3 = Yes (card/recall/marked), 0 = No, 8 = Don't know
df_anak_clean['full_immun_binary'] = np.nan

df_anak_clean.loc[mask_12_23, 'full_immun_binary'] = (
    df_anak_clean.loc[mask_12_23, 'h2'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h4'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h6'].isin(valid_yes) &
    df_anak_clean.loc[mask_12_23, 'h8'].isin(valid_yes)
).astype(int)

print(
    "Full immunization (12–23 mo, all 4 vaccines): "
    f"{df_anak_clean.loc[mask_12_23, 'full_immun_binary'].mean()*100:.1f}%"
)

Feature Engineering
----------------------------------------------------------------------

A. Creating Maternal Health Service Indicators:
4+ ANC visits: 79.7%
Facility delivery: 38.3%
Skilled Birth Attendant: 68.5%

B. Creating Socio-Demographic Indicators:
Urban residence: 41.4%
Low education (≤Primary): 49.7%

C. Creating Risk Factor Indicators:
Risky maternal age: 40.3%
Short birth interval (<24mo): 27.3%
Small birth size: 13.9%
Full immunization (12–23 mo, all 4 vaccines): 64.5%


#### 3.4 Merging

In [None]:
# Mother-level indicators

df_ibu_2002 = df_ibu_analysis.copy()

# ANC 4+
df_ibu_2002['anc4_flag'] = (df_ibu_2002['m14_1'] >= 4).astype(int)

# Facility delivery (20–89 = health facility)
df_ibu_2002['facility_flag'] = df_ibu_2002['m15_1'].between(20, 89).astype(int)

# SBA (pakai loop semua m3*_1 kecuali TBA, misal m3f_1)
sba_cols = [c for c in df_ibu_2002.columns if c.startswith('m3') and c.endswith('_1')]
excluded_tba = ['m3f_1']  # sesuaikan kalau TBA di kode lain
sba_cols = [c for c in sba_cols if c not in excluded_tba]
df_ibu_2002['sba_flag'] = df_ibu_2002[sba_cols].eq(1).any(axis=1).astype(int)

# Urban
df_ibu_2002['urban_flag'] = (df_ibu_2002['v025'] == 1).astype(int)

# Low education (≤ primary)
df_ibu_2002['lowedu_flag'] = df_ibu_2002['v106'].isin([0, 1]).astype(int)  # 0 = no edu, 1 = primary

# Risky maternal age (<20 or >=35)
df_ibu_2002['risky_age_flag'] = ((df_ibu_2002['v212'] < 20) | (df_ibu_2002['v212'] >= 35)).astype(int)

# Parity (bisa dipakai langsung mean)
df_ibu_2002['parity'] = df_ibu_2002['parity']

# Scaling Weight
df_ibu_2002['weight'] = df_ibu_2002['v005'] / 1_000_000  # scaling standard DHS


In [None]:
df_birth = df_lahir_clean.copy()

# Hanya kelahiran dengan older sibling yang punya interval
df_birth_valid = df_birth[df_birth['bord'] > 1].copy()

df_birth_valid['short_interval_flag'] = (df_birth_valid['b11'] < 24).astype(int)

# Aggregate ke ibu: jika ada short interval pada salah satu kelahiran → ibu berisiko
df_interval_mother = (
    df_birth_valid
    .groupby('caseid')['short_interval_flag']
    .max()
    .reset_index()
)

# Merge ke ibu
df_ibu_2002 = df_ibu_2002.merge(df_interval_mother, on='caseid', how='left')

# Ibu yang hanya punya 1 anak (tidak punya interval) = NA → bukan risiko, assign 0
df_ibu_2002['short_interval_flag'] = df_ibu_2002['short_interval_flag'].fillna(0).astype(int)

In [None]:
# Child-level indicators

df_child_2002 = df_anak_analysis.copy()

# LBW proxy: small_birth_size (m18: 4/5 = small/very small, 1–3 = average/large)
df_child_2002['small_birth_flag'] = df_child_2002['m18'].isin([4, 5]).astype(int)
df_child_2002['small_birth_flag'] = df_child_2002['small_birth_flag'].fillna(0)

# Immunization flags (anggap 1 = received, 2/3 juga treated as received kalau ada)
for col in ['h2', 'h4', 'h6', 'h8']:
    df_child_2002[col + '_flag'] = df_child_2002[col].isin([1, 2, 3]).astype(int)

# Full immunization (semua 4 vaksin diterima)
imm_cols = ['h2_flag', 'h4_flag', 'h6_flag', 'h8_flag']
df_child_2002['full_immun_flag'] = df_child_2002[imm_cols].all(axis=1).astype(int)
df_child_2002['full_immun_flag'] = df_child_2002['full_immun_flag'].fillna(0)

In [None]:
group_cols = ['v024']  # v024 = province

# Aggregate Mother
prov_mother = (
    df_ibu_2002
    .groupby(group_cols)
    .agg(
        anc4_pct=('anc4_flag', 'mean'),
        facility_delivery_pct=('facility_flag', 'mean'),
        sba_pct=('sba_flag', 'mean'),
        urban_share_pct=('urban_flag', 'mean'),
        low_education_pct=('lowedu_flag', 'mean'),
        risky_maternal_age_pct=('risky_age_flag', 'mean'),
        birth_interval_short_pct=('short_interval_flag', 'mean'),
        avg_parity=('parity', 'mean')
    )
    .reset_index()
)

# Aggregate Child
prov_child = (
    df_child_2002
    .groupby(group_cols)
    .agg(
        lbw_pct=('small_birth_flag', 'mean'),
        full_immun_pct=('full_immun_flag', 'mean')
    )
    .reset_index()
)

In [None]:
df_2002 = prov_mother.merge(prov_child, on=['v024'], how='left')

df_2002.rename(columns={'v024': 'region'}, inplace=True)


In [None]:
prov_map = {
    11: "Aceh",
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kepulauan Bangka Belitung",
    21: "Kepulauan Riau",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    65: "Kalimantan Utara",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
    76: "Sulawesi Barat",
    81: "Maluku",
    82: "Maluku Utara",
    91: "Papua",
    92: "Papua Barat",
    94: "Papua Barat"
}

df_2002['province'] = df_2002['region'].map(prov_map)

# cek apakah ada kode yang belum ter-map
unmapped = df_2002[df_2002['province'].isna()]['region'].unique()
print("Unmapped province codes:", unmapped)

Unmapped province codes: []


In [None]:
cols = df_2002.columns.tolist()

new_order = ['region', 'province'] + [c for c in cols if c not in ['region', 'province']]

# Reorder dataframe
df_2002 = df_2002[new_order]

In [None]:
# ============================================================================
# FINAL DATA QUALITY SUMMARY
# ============================================================================
print("="*80)
print("FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS")
print("="*80)

# 1. Basic dataset info
print(f"\nTotal observations        : {len(df_2002):,}")
print(f"Unique provinces          : {df_2002['region'].nunique()}")
print(f"Total indicators          : {df_2002.shape[1] - 2} (excluding region/year)")

# 2. Missing check
missing = df_2002.isna().sum()
if missing.sum() == 0:
    print("\nMissing values            : 0")
else:
    print("\nMissing values:")
    print(missing[missing > 0])

# 3. Indicator ranges (auto-detect pct columns)
print("\nIndicator Ranges by Province:")
print("-" * 80)

indicator_cols = [
    col for col in df_2002.columns
    if col not in ['region', 'province'] # Exclude 'province' column from numerical checks
]

for col in indicator_cols:
    col_min = df_2002[col].min()
    col_max = df_2002[col].max()

    # convert to percentage if value seems 0-1
    if col_max <= 1.5:
        print(f"{col:30s}: {col_min*100:6.1f}% – {col_max*100:6.1f}%")
    else:
        print(f"{col:30s}: {col_min:6.2f} – {col_max:6.2f}")

# 4. Quick descriptive stats
print("\nQuick Descriptive Stats:")
print("-" * 80)
print(df_2002[indicator_cols].describe().T[['mean','std','min','max']])

print("\nData preview:")
print(df_2002.head())

print("\nPROCESSING COMPLETE ✔")
print("="*80)


FINAL DATA QUALITY SUMMARY – PROVINCIAL INDICATORS

Total observations        : 26
Unique provinces          : 26
Total indicators          : 10 (excluding region/year)

Missing values            : 0

Indicator Ranges by Province:
--------------------------------------------------------------------------------
anc4_pct                      :   63.7% –   96.7%
facility_delivery_pct         :    5.1% –   89.6%
sba_pct                       :   42.8% –   94.6%
urban_share_pct               :   13.1% –  100.0%
low_education_pct             :   28.3% –   68.9%
risky_maternal_age_pct        :   22.9% –   51.4%
birth_interval_short_pct      :   11.7% –   41.7%
avg_parity                    :   1.98 –   3.11
lbw_pct                       :    8.6% –   27.8%
full_immun_pct                :   28.5% –   81.7%

Quick Descriptive Stats:
--------------------------------------------------------------------------------
                              mean       std       min       max
anc4_pct          

In [None]:
print("Saving Results")

df_2002.to_csv('dhs_regional_2002.csv', index=False)
print("Files saved successfully!")

Saving Results
Files saved successfully!


## Merge All Year

In [None]:
# Mapping provinsi baru -> provinsi induk (kode lama, 26 provinsi)
PROV_BACKWARD_MAP = {
    36: 32,  # Banten        -> Jawa Barat
    21: 14,  # Kep. Riau     -> Riau
    75: 71,  # Gorontalo     -> Sulawesi Utara
    76: 73,  # Sulawesi Barat-> Sulawesi Selatan
    92: 91,  # Papua Barat   -> Papua
    94: 91,  # varian kode Papua Barat (kalau muncul)
}

def wavg(x, w):
    """Weighted average yang aman terhadap NaN."""
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    mask = ~np.isnan(x) & ~np.isnan(w)
    if mask.sum() == 0:
        return np.nan
    return np.average(x[mask], weights=w[mask])


In [None]:
def aggregate_province_year(
    df_mother,
    df_child,
    year,
    region_col='v024',
    weight_col='v005',
    apply_backward_mapping=True,
):
    """
    df_mother : dataframe level ibu (sudah FE)
    df_child  : dataframe level anak (sudah FE, small_birth_flag & full_immun_flag)
    year      : int (2002, 2007, 2012, 2017)
    region_col: kolom kode provinsi (v024/region)
    weight_col: kolom sampling weight DHS (v005)
    apply_backward_mapping:
        - 2002  -> False
        - 2007+ -> True
    """

    # --- copy & siapkan weight ---
    dm = df_mother.copy()
    dc = df_child.copy()

    # buang baris yang ga punya region / weight
    dm = dm[dm[region_col].notna() & dm[weight_col].notna()].copy()
    dc = dc[dc[region_col].notna() & dc[weight_col].notna()].copy()

    # pastikan region integer
    dm[region_col] = dm[region_col].astype(int)
    dc[region_col] = dc[region_col].astype(int)

    # scaling weight DHS
    dm['weight'] = dm[weight_col] / 1_000_000.0
    dc['weight'] = dc[weight_col] / 1_000_000.0

    # backward mapping (33 -> 26) kalau diminta
    if apply_backward_mapping:
        dm[region_col] = dm[region_col].replace(PROV_BACKWARD_MAP)
        dc[region_col] = dc[region_col].replace(PROV_BACKWARD_MAP)

    # --- daftar fitur yang mau di-aggregate ---
    mother_features = {
        'anc4_pct': 'anc4_flag',
        'facility_delivery_pct': 'facility_flag',
        'sba_pct': 'sba_flag',
        'urban_share_pct': 'urban_flag',
        'low_education_pct': 'lowedu_flag',
        'risky_maternal_age_pct': 'risky_age_flag',
        'birth_interval_short_pct': 'short_interval_flag',
        'avg_parity': 'parity',
    }

    child_features = {
        'lbw_pct': 'small_birth_flag',
        'full_immun_pct': 'full_immun_flag',
    }

    # filter kolom yang benar-benar ada (kalau beda tahun beda var)
    mother_features = {
        out: col for out, col in mother_features.items()
        if col in dm.columns
    }
    child_features = {
        out: col for out, col in child_features.items()
        if col in dc.columns
    }

    # --- aggregate mother-level (weighted) ---
    def agg_mother(group):
        w = group['weight'].values
        out = {}
        for out_name, col in mother_features.items():
            out[out_name] = wavg(group[col].values, w)
        return pd.Series(out)

    prov_mother = (
        dm
        .groupby(region_col)
        .apply(agg_mother)
        .reset_index()
    )

    # --- aggregate child-level (weighted) ---
    def agg_child(group):
        w = group['weight'].values
        out = {}
        for out_name, col in child_features.items():
            out[out_name] = wavg(group[col].values, w)
        return pd.Series(out)

    prov_child = (
        dc
        .groupby(region_col)
        .apply(agg_child)
        .reset_index()
    )

    # --- merge mother + child ---
    prov = prov_mother.merge(prov_child, on=region_col, how='left')

    # tambah kolom year & rapikan urutan kolom
    prov['year'] = year
    cols = ['year', region_col] + [c for c in prov.columns if c not in ['year', region_col]]
    prov = prov[cols]

    return prov


In [None]:
# 2002: belum ada pemekaran (26 prov), TIDAK backward-mapping
prov_2002 = aggregate_province_year(
    df_mother=df_ibu_2002,
    df_child=df_child_2002,
    year=2002,
    region_col='v024',
    weight_col='v005',
    apply_backward_mapping=False
)

# 2007
prov_2007 = aggregate_province_year(
    df_mother=df_ibu_2007,
    df_child=df_child_2007,
    year=2007,
    region_col='v024',
    weight_col='v005',
    apply_backward_mapping=True
)

# 2012
prov_2012 = aggregate_province_year(
    df_mother=df_ibu_2012,
    df_child=df_child_2012,
    year=2012,
    region_col='v024',
    weight_col='v005',
    apply_backward_mapping=True
)

# 2017
prov_2017 = aggregate_province_year(
    df_mother=df_ibu_2017,
    df_child=df_child_2017,
    year=2017,
    region_col='v024',
    weight_col='v005',
    apply_backward_mapping=True
)

# cek jumlah provinsi per tahun
for df in [prov_2002, prov_2007, prov_2012, prov_2017]:
    print(df['year'].iloc[0], df['v024'].nunique())


2002 26
2007 28
2012 28
2017 29


In [None]:
base = set(prov_2002['v024'])  # atau 'region' kalau nama kolomnya itu

for df in [prov_2007, prov_2012, prov_2017]:
    year = df['year'].iloc[0]
    regs = set(df['v024'])
    extra = sorted(regs - base)
    print(year, "extra codes:", extra)


2007 extra codes: [11, 81, 82, 91]
2012 extra codes: [11, 81, 82, 91]
2017 extra codes: [11, 65, 81, 82, 91]


In [None]:
print("prov 2002 unique:", sorted(prov_2002['v024'].unique()))
print("length:", len(prov_2002['v024'].unique()))


prov 2002 unique: [np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(51), np.int64(52), np.int64(53), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(71), np.int64(72), np.int64(73), np.int64(74), np.int64(75)]
length: 26


In [None]:
region_col = 'v024'

regs_2002 = set(prov_2002[region_col].unique())
regs_2007 = set(prov_2007[region_col].unique())
regs_2012 = set(prov_2012[region_col].unique())
regs_2017 = set(prov_2017[region_col].unique())

stable_regions = regs_2002 & regs_2007 & regs_2012 & regs_2017
print("Stable regions:", sorted(stable_regions), " (n =", len(stable_regions), ")")


Stable regions: [np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(51), np.int64(52), np.int64(53), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(71), np.int64(72), np.int64(73), np.int64(74)]  (n = 24 )


In [None]:
def align_to_stable(df_year, year, stable_regions, region_col='region'):
    df = df_year.copy()
    df['year'] = year
    df = df[df[region_col].isin(stable_regions)].copy()
    df[region_col] = df[region_col].astype(int)
    df = df.sort_values([region_col]).reset_index(drop=True)
    return df

prov_2002_aligned = align_to_stable(prov_2002, 2002, stable_regions, region_col)
prov_2007_aligned = align_to_stable(prov_2007, 2007, stable_regions, region_col)
prov_2012_aligned = align_to_stable(prov_2012, 2012, stable_regions, region_col)
prov_2017_aligned = align_to_stable(prov_2017, 2017, stable_regions, region_col)

for df in [prov_2002_aligned, prov_2007_aligned, prov_2012_aligned, prov_2017_aligned]:
    print(df['year'].iloc[0], df[region_col].nunique())


2002 24
2007 24
2012 24
2017 24


In [None]:
meta_cols = ['year', region_col]
indicator_cols = [c for c in prov_2002_aligned.columns if c not in meta_cols]

panel = pd.concat(
    [prov_2002_aligned, prov_2007_aligned, prov_2012_aligned, prov_2017_aligned],
    ignore_index=True
)

print(panel.groupby('year')[region_col].nunique())

prov_name_map = {
    12: "Sumatera Utara",
    13: "Sumatera Barat",
    14: "Riau",
    15: "Jambi",
    16: "Sumatera Selatan",
    17: "Bengkulu",
    18: "Lampung",
    19: "Kep. Bangka Belitung",
    31: "DKI Jakarta",
    32: "Jawa Barat",
    33: "Jawa Tengah",
    34: "DI Yogyakarta",
    35: "Jawa Timur",
    36: "Banten",
    51: "Bali",
    52: "Nusa Tenggara Barat",
    53: "Nusa Tenggara Timur",
    61: "Kalimantan Barat",
    62: "Kalimantan Tengah",
    63: "Kalimantan Selatan",
    64: "Kalimantan Timur",
    71: "Sulawesi Utara",
    72: "Sulawesi Tengah",
    73: "Sulawesi Selatan",
    74: "Sulawesi Tenggara",
    75: "Gorontalo",
}

panel['province_name'] = panel[region_col].map(prov_name_map)
panel = panel[['year', region_col, 'province_name'] + indicator_cols]

panel.head()



year
2002    24
2007    24
2012    24
2017    24
Name: v024, dtype: int64


Unnamed: 0,year,v024,province_name,anc4_pct,facility_delivery_pct,sba_pct,urban_share_pct,low_education_pct,risky_maternal_age_pct,birth_interval_short_pct,avg_parity,lbw_pct,full_immun_pct
0,2002,12,Sumatera Utara,0.709615,0.344487,0.827922,0.42996,0.386486,0.27894,0.39119,3.019732,0.115147,0.348179
1,2002,13,Sumatera Barat,0.865139,0.602981,0.820163,0.394327,0.337632,0.299571,0.300608,2.816498,0.149896,0.539392
2,2002,14,Riau,0.837971,0.381888,0.758697,0.534813,0.394638,0.36667,0.247384,2.57885,0.130668,0.49635
3,2002,15,Jambi,0.799175,0.37968,0.718145,0.310392,0.502543,0.451023,0.171904,2.399863,0.081931,0.395001
4,2002,16,Sumatera Selatan,0.79344,0.371098,0.773976,0.313627,0.545571,0.44911,0.224071,2.602079,0.096247,0.467269


In [None]:
panel.to_csv('dhs_regional_panel.csv', index=False)
print("Files saved successfully!")

Files saved successfully!


# Extract IMR Data

In [None]:
imr_2012 = pd.read_csv('imr_2012.csv',skiprows=2, header=None)
imr_2012.columns = ["provinsi", "imr"]
imr_2012["tahun"] = 2012
imr_2012 = imr_2012.dropna(subset=["provinsi"])

# Reset index
imr_2012 = imr_2012.reset_index(drop=True)

# Convert IMR to numeric
imr_2012["imr"] = pd.to_numeric(imr_2012["imr"], errors="coerce")

imr_2012["provinsi"] = imr_2012["provinsi"].str.title()


In [None]:
imr_2012_long = imr_2012.rename(columns={
    "province": "provinsi",
    "imr": "imr_per_1k",
    "year": "tahun"
})[["provinsi", "tahun", "imr_per_1k"]]


In [None]:
FILE_WIDE = "akb_1971_2020.csv"
OUT = "data_imr_bps.csv"
MIN_YEAR = 2000

# Read File
df = pd.read_csv(FILE_WIDE, dtype=str)

# Normalisasi (wide table with multi-row header) ---
header_row = df.iloc[2].tolist()
df.columns = df.columns.str.lower() # Corrected line

# buat kolom: first = 'provinsi', sisanya adalah tahun (kita ekstrak 4-digit year)
years = []
for v in header_row[1:]:
    if pd.isna(v):
        years.append(None)
    else:
        m = re.search(r'(19|20)\d{2}', str(v))
        years.append(m.group(0) if m else str(v))
cols = ['provinsi'] + years

# data aktual mulai dari row index 3
data_rows = df.iloc[3:].copy()
data_rows.columns = cols

# extract from 2000, 2010, & 2020
wanted = ['provinsi', '2000', '2010', '2020']
available = [c for c in wanted if c in data_rows.columns]
df_sub = data_rows[available].reset_index(drop=True)

# melt into long format (provinsi, tahun, imr_per_1k)
df_wide_long = df_sub.melt(id_vars='provinsi', var_name='tahun', value_name='imr_per_1k')

# cleaning nilai & convert as numeric (float)
df_wide_long['imr_per_1k'] = (
    df_wide_long['imr_per_1k'].astype(str)
    .str.replace(',', '.')
    .str.extract(r'(\d+\.?\d*)')[0]
)

df_wide_long = df_wide_long.dropna(subset=['imr_per_1k'])
df_wide_long['imr_per_1k'] = df_wide_long['imr_per_1k'].astype(float)

# tahun as string
df_wide_long['tahun'] = df_wide_long['tahun'].astype(str)

# Ambil daftar provinsi dari df (dipakai untuk filter long-form) ---
provinces = df_sub['provinsi'].dropna().astype(str).str.strip().tolist()

df_all = pd.concat([df_wide_long], ignore_index=True, sort=False)

# final tidy: standardisasi nama provinsi ringan & tipe data
def normalize_name(s):
    s = str(s).strip()
    if s.lower() in ['dki', 'dki jakarta', 'dkijakarta']:
        return "DKI Jakarta"
    if 'yogyakarta' in s.lower():
        return "DI Yogyakarta"
    return ' '.join(w.capitalize() for w in s.split())

df_all['provinsi'] = df_all['provinsi'].apply(normalize_name)
df_all['tahun'] = df_all['tahun'].astype(int)
df_all['imr_per_1k'] = df_all['imr_per_1k'].round(2)

# simpan
df_all.to_csv(OUT, index=False, encoding='utf-8')
print("Saved:", OUT)
print(df_all.head(15).to_string(index=False))

Saved: data_imr_bps.csv
                 provinsi  tahun  imr_per_1k
                     Aceh   2000        40.0
           Sumatera Utara   2000        44.0
           Sumatera Barat   2000        53.0
                     Riau   2000        48.0
                    Jambi   2000        53.0
         Sumatera Selatan   2000        53.0
                 Bengkulu   2000        53.0
                  Lampung   2000        48.0
Kepulauan Bangka Belitung   2000        53.0
              DKI Jakarta   2000        25.0
               Jawa Barat   2000        57.0
              Jawa Tengah   2000        44.0
            DI Yogyakarta   2000        25.0
               Jawa Timur   2000        48.0
                   Banten   2000        66.0


In [None]:
imr_data = pd.read_csv('data_imr_bps.csv')

panel_imr = pd.concat([imr_data, imr_2012_long], ignore_index=True)

panel_imr["tahun"] = panel_imr["tahun"].astype(int)

panel_imr = panel_imr.sort_values(["provinsi", "tahun"]).reset_index(drop=True)
panel_imr.head()


Unnamed: 0,provinsi,tahun,imr_per_1k
0,Aceh,2000,40.0
1,Aceh,2010,28.0
2,Aceh,2012,47.0
3,Aceh,2020,19.41
4,Bali,2000,36.0


In [None]:

panel_imr.to_csv('panel_imr.csv', index=False)
print("Files saved successfully!")

Files saved successfully!
