# Village Health Digital Twin - Comprehensive Analysis

This notebook provides a comprehensive analysis of village-level health indicators including demographics, health issues (stunting, hypertension), health facilities, and other public health metrics. We'll explore the dataset structure, visualize relationships, define risk thresholds, and create a dashboard-style overview.

## Dataset Overview
- **HOUSEHOLDS.csv**: Household demographics and basic infrastructure
- **ADULTS_HTN.csv**: Adult hypertension data and treatment outcomes
- **CHILDREN_STUNTING.csv**: Child nutrition and stunting data
- **PROGRAM_LOG.csv**: Health program implementation logs
- **COSTS_CATALOG.csv**: Program cost information
- **PARAMETERS.csv**: Analysis parameters and thresholds

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which also hides library deprecation notices — consider narrowing
# the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib's 'seaborn-v0_8' style sheet for figures and
# seaborn's "husl" colour palette as the default colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Data Loading and Initial Exploration

Let's start by loading all CSV files and performing initial data exploration.

In [None]:
# Load all datasets.
# The data directory is defined once so the notebook can be pointed at another
# copy of the CSV files by editing a single line instead of six literals.
DATA_DIR = '/media/hdd2/mgodonf/digital-twin'

households = pd.read_csv(f'{DATA_DIR}/HOUSEHOLDS.csv')
adults_htn = pd.read_csv(f'{DATA_DIR}/ADULTS_HTN.csv')
children_stunting = pd.read_csv(f'{DATA_DIR}/CHILDREN_STUNTING.csv')
program_log = pd.read_csv(f'{DATA_DIR}/PROGRAM_LOG.csv')
costs_catalog = pd.read_csv(f'{DATA_DIR}/COSTS_CATALOG.csv')
parameters = pd.read_csv(f'{DATA_DIR}/PARAMETERS.csv')

# Quick sanity check: (rows, columns) of every table that was read.
print("Dataset Shapes:")
print(f"Households: {households.shape}")
print(f"Adults HTN: {adults_htn.shape}")
print(f"Children Stunting: {children_stunting.shape}")
print(f"Program Log: {program_log.shape}")
print(f"Costs Catalog: {costs_catalog.shape}")
print(f"Parameters: {parameters.shape}")

In [None]:
# Register the frames explored in this section under their display labels.
# Only these four tables are included; insertion order is the print order.
datasets = dict(
    zip(
        ['Households', 'Adults HTN', 'Children Stunting', 'Program Log'],
        [households, adults_htn, children_stunting, program_log],
    )
)

# For each registered frame, print its label, shape, column names and the
# first three rows as one newline-separated report.
for name, df in datasets.items():
    print(
        f"\n=== {name} Dataset ===",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        "\nFirst 3 rows:",
        df.head(3),
        sep="\n",
    )

## 2. Data Structure Analysis

Let's analyze the structure of each dataset in detail.

In [None]:
# Analyze data types and missing values
def analyze_dataframe(df, name):
    """Print a structural report for one DataFrame.

    The report has three parts: the dtype of every column, the null count of
    each column that has at least one null (or a fixed message when no column
    has any), and the output of ``df.describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to report on.
    name : str
        Label interpolated into the report header.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"\n=== {name} Data Structure Analysis ===")
    print("\nData Types:", df.dtypes, sep="\n")

    print("\nMissing Values:")
    null_counts = df.isna().sum()
    columns_with_gaps = null_counts[null_counts > 0]
    # Counts are non-negative, so "no column with a positive count" is the
    # same condition as "the counts sum to zero".
    print("No missing values found" if columns_with_gaps.empty else columns_with_gaps)

    print("\nBasic Statistics:")
    print(df.describe())

# Produce the structural report for every frame registered in `datasets`,
# in registration order.
for name, df in datasets.items():
    analyze_dataframe(df=df, name=name)

In [None]:
# Analyze categorical variables
print("=== Categorical Variables Analysis ===")

# (heading, frame, column) triples; each is reported as the heading followed
# by the value counts of that column, in the order listed here.
categorical_views = [
    ("\nVillages (Dusun) distribution:", households, 'dusun'),
    ("\nAdult Gender distribution:", adults_htn, 'sex'),
    ("\nChildren Gender distribution:", children_stunting, 'sex'),
    ("\nProgram types:", program_log, 'program'),
]

for heading, frame, column in categorical_views:
    print(heading)
    print(frame[column].value_counts())

## 3. Health Metrics Calculation

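Let's calculate key health metrics per dusun, including prevalence rates (diagnosed hypertension; stunting, defined as HAZ < -2; severe stunting, defined as HAZ < -3) and program-uptake (coverage) indicators.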

In [None]:
# Calculate key health metrics by household and village
def calculate_health_metrics(households_df=None, adults_df=None, children_df=None):
    """Summarise household, hypertension and stunting indicators per dusun.

    Parameters
    ----------
    households_df, adults_df, children_df : pandas.DataFrame, optional
        Frames to summarise. When omitted they default to the notebook-level
        ``households``, ``adults_htn`` and ``children_stunting`` frames, so
        the zero-argument call keeps working.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``dusun`` value of the household frame, in order
        of first appearance. Percentages are on a 0-100 scale.

    Notes
    -----
    * A rate or average for a group with no records is reported as NaN, not
      0: a 0 % prevalence or an average HAZ of 0 would read as a healthy
      village when in fact nothing was measured.
    * Stunting prevalence and average HAZ are both computed over children
      with a recorded HAZ, so they share the same denominator.
      ``total_children`` still counts every child row.
    """
    if households_df is None:
        households_df = households
    if adults_df is None:
        adults_df = adults_htn
    if children_df is None:
        children_df = children_stunting

    def _percentage(values):
        # Mean of a 0/1 (or boolean) series as a percentage; NaN when empty.
        return values.mean() * 100 if len(values) > 0 else np.nan

    # Attach the dusun of each person through their household. A left join
    # keeps people whose household is unknown; they get a NaN dusun and so
    # fall into no village group below.
    dusun_lookup = households_df[['household_id', 'dusun']]
    adults_merged = adults_df.merge(dusun_lookup, on='household_id', how='left')
    children_merged = children_df.merge(dusun_lookup, on='household_id', how='left')

    village_metrics = []
    for dusun in households_df['dusun'].unique():
        hh_data = households_df[households_df['dusun'] == dusun]
        adult_data = adults_merged[adults_merged['dusun'] == dusun]
        child_data = children_merged[children_merged['dusun'] == dusun]

        # Only measured children can be classified as stunted or not.
        measured_haz = child_data['HAZ'].dropna()

        village_metrics.append({
            'dusun': dusun,
            'total_households': len(hh_data),
            'total_adults': len(adult_data),
            'total_children': len(child_data),
            'avg_income': hh_data['pendapatan_bulanan'].mean(),
            'clean_water_access': hh_data['air_bersih'].mean() * 100,
            'sanitation_access': hh_data['jamban_sehat'].mean() * 100,
            'avg_distance_puskesmas': hh_data['jarak_ke_puskesmas_km'].mean(),
            'htn_prevalence': _percentage(adult_data['diagnosis_htn']),
            'htn_program_uptake': _percentage(adult_data['uptake_htn_program']),
            'stunting_prevalence': _percentage(measured_haz < -2),
            'stunting_program_uptake': _percentage(child_data['uptake_stunting_program']),
            'avg_HAZ': measured_haz.mean() if len(measured_haz) > 0 else np.nan,
            'severe_stunting_prevalence': _percentage(measured_haz < -3),
        })

    return pd.DataFrame(village_metrics)

# Build the per-dusun summary table and print it rounded to two decimals.
village_health_metrics = calculate_health_metrics()
print("Village Health Metrics:", village_health_metrics.round(2), sep="\n")

In [None]:
# Calculate individual-level risk scores
def calculate_individual_metrics(adults_df=None, children_df=None):
    """Add an additive ``risk_score`` column to the adult and child frames.

    Adult score: one point each for systolic >= 140, diastolic >= 90 and
    BMI >= 30, plus the values of the ``diabetes_koin`` and ``perokok``
    columns.

    Child score: two points for HAZ < -2, one point for
    ``anemia_hb_gdl`` < 11, plus the value of ``diare_3bln_terakhir`` and
    one minus each of ``ASI_eksklusif`` and ``mp_asi_memadai``.
    (The arithmetic assumes those flag columns hold 0/1 values — TODO confirm
    against the data dictionary.)

    Parameters
    ----------
    adults_df, children_df : pandas.DataFrame, optional
        Frames to score. When omitted they default to the notebook-level
        ``adults_htn`` and ``children_stunting`` frames, so the zero-argument
        call keeps working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(adults_df, children_df)`` — the same objects that were scored.
        The ``risk_score`` column is written onto them in place.
    """
    if adults_df is None:
        adults_df = adults_htn
    if children_df is None:
        children_df = children_stunting

    # Adult risk factors
    adults_df['risk_score'] = (
        (adults_df['sistol'] >= 140).astype(int) +
        (adults_df['diastol'] >= 90).astype(int) +
        adults_df['diabetes_koin'] +
        adults_df['perokok'] +
        (adults_df['BMI'] >= 30).astype(int)
    )

    # Children risk factors (stunting itself carries double weight)
    children_df['risk_score'] = (
        (children_df['HAZ'] < -2).astype(int) * 2 +
        (children_df['anemia_hb_gdl'] < 11).astype(int) +
        children_df['diare_3bln_terakhir'] +
        (1 - children_df['ASI_eksklusif']) +
        (1 - children_df['mp_asi_memadai'])
    )

    return adults_df, children_df

adults_htn, children_stunting = calculate_individual_metrics()

# For each scored frame, print how many individuals have each score value,
# ordered by score.
for heading, scored in (
    ("Adult Risk Score Distribution:", adults_htn),
    ("\nChildren Risk Score Distribution:", children_stunting),
):
    print(heading)
    print(scored['risk_score'].value_counts().sort_index())

## 4. Distribution Analysis

Let's create distribution plots for key health indicators.

In [None]:
# Create distribution plots for key health indicators: a 2x3 grid with
# individual-level distributions on the top row and per-village summaries on
# the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Distribution of Key Health Indicators', fontsize=16, fontweight='bold')
(ax_income, ax_bp, ax_haz), (ax_htn, ax_stunting, ax_access) = axes

# Income distribution (converted from Rp to millions of Rp)
income_millions = households['pendapatan_bulanan'] / 1000000
ax_income.hist(income_millions, bins=8, alpha=0.7, color='skyblue', edgecolor='black')
ax_income.set(title='Monthly Income Distribution',
              xlabel='Income (Million Rp)',
              ylabel='Frequency')

# Blood pressure distribution, coloured by diagnosis flag, with dashed guide
# lines at systolic 140 and diastolic 90
threshold_style = dict(color='red', linestyle='--', alpha=0.7)
ax_bp.scatter(adults_htn['sistol'], adults_htn['diastol'], alpha=0.7,
              c=adults_htn['diagnosis_htn'], cmap='RdYlBu')
ax_bp.axhline(y=90, **threshold_style)
ax_bp.axvline(x=140, **threshold_style)
ax_bp.set(title='Blood Pressure Distribution',
          xlabel='Systolic BP (mmHg)',
          ylabel='Diastolic BP (mmHg)')

# HAZ distribution with the two stunting cut-offs marked
ax_haz.hist(children_stunting['HAZ'], bins=10, alpha=0.7, color='lightgreen', edgecolor='black')
for cutoff, line_color, line_label in (
    (-2, 'red', 'Stunting threshold'),
    (-3, 'darkred', 'Severe stunting'),
):
    ax_haz.axvline(x=cutoff, color=line_color, linestyle='--', alpha=0.7, label=line_label)
ax_haz.set(title='Height-for-Age Z-score Distribution',
           xlabel='HAZ Score',
           ylabel='Frequency')
ax_haz.legend()

# Village-level metrics: one bar chart per prevalence column
village_names = village_health_metrics['dusun']
for panel, metric, bar_color, panel_title in (
    (ax_htn, 'htn_prevalence', 'coral', 'Hypertension Prevalence by Village'),
    (ax_stunting, 'stunting_prevalence', 'lightcoral', 'Stunting Prevalence by Village'),
):
    panel.bar(village_names, village_health_metrics[metric],
              alpha=0.7, color=bar_color, edgecolor='black')
    panel.set(title=panel_title, xlabel='Village', ylabel='Prevalence (%)')
    panel.tick_params(axis='x', rotation=45)

# Access to facilities: paired bars per village, offset half a bar width to
# either side of the tick position
x = np.arange(len(village_health_metrics))
width = 0.35
ax_access.bar(x - width/2, village_health_metrics['clean_water_access'], width,
              label='Clean Water', alpha=0.7, color='lightblue')
ax_access.bar(x + width/2, village_health_metrics['sanitation_access'], width,
              label='Sanitation', alpha=0.7, color='lightgreen')
ax_access.set(title='Infrastructure Access by Village',
              xlabel='Village',
              ylabel='Access (%)')
ax_access.set_xticks(x)
ax_access.set_xticklabels(village_names, rotation=45)
ax_access.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plots for comparing distributions across villages
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Income by village
households.boxplot(column='pendapatan_bulanan', by='dusun', ax=axes[0,0])
axes[0,0].set_title('Income Distribution by Village')
axes[0,0].set_xlabel('Village')
axes[0,0].set_ylabel('Monthly Income (Rp)')

# Blood pressure by village (inner join: adults whose household is not in the
# household table are left out of the plot)
adults_merged = adults_htn.merge(households[['household_id', 'dusun']], on='household_id')
adults_merged.boxplot(column='sistol', by='dusun', ax=axes[0,1])
axes[0,1].set_title('Systolic BP Distribution by Village')
axes[0,1].set_xlabel('Village')
axes[0,1].set_ylabel('Systolic BP (mmHg)')

# HAZ by village (same inner join for children)
children_merged = children_stunting.merge(households[['household_id', 'dusun']], on='household_id')
children_merged.boxplot(column='HAZ', by='dusun', ax=axes[1,0])
axes[1,0].set_title('HAZ Distribution by Village')
axes[1,0].set_xlabel('Village')
axes[1,0].set_ylabel('HAZ Score')

# Distance to health facility
households.boxplot(column='jarak_ke_puskesmas_km', by='dusun', ax=axes[1,1])
axes[1,1].set_title('Distance to Puskesmas by Village')
axes[1,1].set_xlabel('Village')
axes[1,1].set_ylabel('Distance (km)')

# DataFrame.boxplot(by=...) replaces the figure-level title with
# "Boxplot grouped by dusun" on every call, so the figure title has to be set
# after all four panels are drawn or it is silently overwritten.
fig.suptitle('Health Indicators Distribution by Village', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

Let's generate correlation matrices to identify relationships between health indicators.

In [None]:
# Prepare data for correlation analysis
# Household-level correlations
household_corr_data = households.copy()

# Aggregate person-level health data per household in one grouped pass each,
# instead of re-filtering the adult and child frames once per household.
adult_summary = (
    adults_htn.groupby('household_id')
    .agg(
        adults_count=('diagnosis_htn', 'size'),
        htn_cases=('diagnosis_htn', 'sum'),
        avg_bp_systolic=('sistol', 'mean'),
        htn_program_uptake=('uptake_htn_program', 'sum'),
    )
    .reset_index()
)
child_summary = (
    children_stunting.assign(is_stunted=children_stunting['HAZ'] < -2)
    .groupby('household_id')
    .agg(
        children_count=('HAZ', 'size'),
        stunted_children=('is_stunted', 'sum'),
        avg_HAZ=('HAZ', 'mean'),
        stunting_program_uptake=('uptake_stunting_program', 'sum'),
    )
    .reset_index()
)

# One summary row per household id. De-duplicating the ids here keeps the
# merge back onto the household table from multiplying rows if an id is
# repeated in `households`.
hh_health_df = (
    households[['household_id']]
    .drop_duplicates()
    .merge(adult_summary, on='household_id', how='left')
    .merge(child_summary, on='household_id', how='left')
)

# Households with no adults / no children have no group above: their counts
# and totals are 0, while their averages stay NaN (undefined).
total_cols = ['adults_count', 'children_count', 'htn_cases', 'stunted_children',
              'htn_program_uptake', 'stunting_program_uptake']
hh_health_df[total_cols] = hh_health_df[total_cols].fillna(0)
hh_health_df[['adults_count', 'children_count']] = (
    hh_health_df[['adults_count', 'children_count']].astype(int)
)
hh_health_df['program_participation'] = (
    hh_health_df['htn_program_uptake'] + hh_health_df['stunting_program_uptake']
)

# Keep the summary columns in a fixed order so the heatmap layout is stable.
hh_health_df = hh_health_df[[
    'household_id', 'adults_count', 'children_count', 'htn_cases',
    'avg_bp_systolic', 'stunted_children', 'avg_HAZ', 'program_participation',
]]
household_analysis = household_corr_data.merge(hh_health_df, on='household_id')

# Calculate correlation matrix over every numeric column
numeric_cols = household_analysis.select_dtypes(include=[np.number]).columns
correlation_matrix = household_analysis[numeric_cols].corr()

# Create correlation heatmap; the upper triangle (including the diagonal) is
# masked because the matrix is symmetric.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdBu_r', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8})
plt.title('Correlation Matrix - Health and Socioeconomic Indicators', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()