In [2]:
from pathlib import Path
import zipfile
import pandas as pd
import numpy as np

REPO   = Path("..")
RAW_ZIP = REPO / "data_raw" / "DHS_zip"
RAW_DTA = REPO / "data_raw" / "DHS_dta"
CLEAN   = REPO / "data_cleaned"
CLEAN_RAW = CLEAN / "dhs_raw_csv"
CLEAN_TIDY = CLEAN / "dhs_clean_csv"

for p in [RAW_ZIP, RAW_DTA, CLEAN_RAW, CLEAN_TIDY]:
    p.mkdir(parents=True, exist_ok=True)

print("ZIPs here:", RAW_ZIP.resolve())
print("DTA out:", RAW_DTA.resolve())
print("Clean raw CSV:", CLEAN_RAW.resolve())
print("Clean tidy CSV:", CLEAN_TIDY.resolve())


ZIPs here: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_raw\DHS_zip
DTA out: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_raw\DHS_dta
Clean raw CSV: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned\dhs_raw_csv
Clean tidy CSV: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned\dhs_clean_csv


In [4]:
count = 0
for zpath in RAW_ZIP.glob("*.zip"):
    with zipfile.ZipFile(zpath) as zf:
        for name in zf.namelist():
            if name.lower().endswith(".dta"):
                zf.extract(name, RAW_DTA)
                src = RAW_DTA / name
                flat = RAW_DTA / Path(name).name
                if src.exists() and src != flat:
                    src.rename(flat)
                count += 1
    print("Extracted from:", zpath.name)
print("Total .DTA extracted:", count)

dta_files = sorted(RAW_DTA.glob("*.DTA"))
len(dta_files), dta_files[:5]


Extracted from: ETKR41DT.zip
Extracted from: ETKR51DT.zip
Extracted from: ETKR61DT.zip
Extracted from: ETKR71DT.zip
Extracted from: ETKR81DT.zip
Extracted from: KEKR01DT.zip
Extracted from: KEKR31DT.zip
Extracted from: KEKR3ADT.zip
Extracted from: KEKR42DT.zip
Extracted from: KEKR52DT.zip
Extracted from: KEKR72DT.zip
Extracted from: KEKR8CDT.zip
Extracted from: TZKR21DT.zip
Extracted from: TZKR3ADT.zip
Extracted from: TZKR41DT.zip
Extracted from: TZKR4IDT.zip
Extracted from: TZKR63DT.zip
Extracted from: TZKR7BDT.zip
Extracted from: TZKR82DT.zip
Extracted from: UGKR01DT.zip
Extracted from: UGKR33DT.zip
Extracted from: UGKR41DT.zip
Extracted from: UGKR52DT.zip
Extracted from: UGKR61DT.zip
Extracted from: UGKR7BDT.zip
Total .DTA extracted: 25


(25,
 [WindowsPath('../data_raw/DHS_dta/ETKR41FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR51FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR61FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR71FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR81FL.DTA')])

In [8]:
def resolve_cols(df):
    cols = set(df.columns)
    def pick(*candidates):
        for c in candidates:
            if c in cols:
                return c
        return None

    mapping = {
        "height_for_age_z": pick("hw70","hc70"),
        "weight_for_height_z": pick("hw71","hc71"),
        "weight_for_age_z": pick("hw72","hc72"),
        "sex": pick("b4"),
        "age_months": pick("b19"),
        "mother_edu": pick("v106"),
        "wealth_quintile": pick("v190"),
        "residence": pick("v025"),
        "region": pick("v024"),
        "year": pick("v007"),
        "weight_raw": pick("v005"),
    }
    return {k:v for k,v in mapping.items() if v is not None}

def clean_one_dhs(dta_path: Path):
    df = pd.read_stata(dta_path, convert_categoricals=False)
    df.columns = df.columns.str.lower()

    cols = resolve_cols(df)
    if not {"height_for_age_z","weight_for_height_z","weight_for_age_z"}.intersection(cols.keys()):
        raise ValueError(f"No nutrition z-score columns found in {dta_path.name}")

    sub = df[list(cols.values())].copy()
    sub.columns = list(cols.keys())

    # plausible ranges
    for z in ["height_for_age_z","weight_for_height_z","weight_for_age_z"]:
        if z in sub:
            sub = sub[(sub[z].between(-6,6)) | sub[z].isna()]

    # readable labels
    if "mother_edu" in sub:
        sub["mother_edu"] = sub["mother_edu"].map({0:"none",1:"primary",2:"secondary",3:"higher"}).fillna(sub["mother_edu"])
    if "wealth_quintile" in sub:
        sub["wealth_quintile"] = sub["wealth_quintile"].map({1:"poorest",2:"poorer",3:"middle",4:"richer",5:"richest"}).fillna(sub["wealth_quintile"])
    if "residence" in sub:
        sub["residence"] = sub["residence"].map({1:"urban",2:"rural"}).fillna(sub["residence"])

    # sample weight
    if "weight_raw" in sub:
        sub["weight"] = sub["weight_raw"] / 1_000_000.0

    # add country from filename prefix
    code = dta_path.name[:2].upper()
    code2name = {"ET":"Ethiopia","KE":"Kenya","TZ":"Tanzania","UG":"Uganda"}
    sub["country"] = code2name.get(code, code)

    # ensure integer year if present
    if "year" in sub:
        sub["year"] = pd.to_numeric(sub["year"], errors="coerce").astype("Int64")

    # final tidy column order
    order = ["country","year","weight","age_months","sex",
             "height_for_age_z","weight_for_height_z","weight_for_age_z",
             "mother_edu","wealth_quintile","residence","region","weight_raw"]
    sub = sub[[c for c in order if c in sub.columns]]

    return df, sub


In [10]:
WRITE_RAW = True  # set to False if you only want cleaned
raw_written = 0
clean_written = 0
problems = []

for f in dta_files:
    try:
        raw_df, clean_df = clean_one_dhs(f)

        # raw
        if WRITE_RAW:
            raw_path = CLEAN_RAW / f"{f.stem.lower()}_raw.csv"
            raw_df.to_csv(raw_path, index=False)
            raw_written += 1

        # tidy
        tidy_path = CLEAN_TIDY / f"{f.stem.lower()}_clean.csv"
        clean_df.to_csv(tidy_path, index=False)
        clean_written += 1

        print("OK:", f.name, "rows raw:", len(raw_df), "rows clean:", len(clean_df))
    except Exception as e:
        problems.append((f.name, str(e)))
        print("Skip:", f.name, str(e)[:120])

print("Raw CSV files:", raw_written, "Clean CSV files:", clean_written)
problems[:5]


Skip: ETKR41FL.DTA No nutrition z-score columns found in ETKR41FL.DTA
Skip: ETKR51FL.DTA No nutrition z-score columns found in ETKR51FL.DTA
OK: ETKR61FL.DTA rows raw: 11654 rows clean: 1181
OK: ETKR71FL.DTA rows raw: 10641 rows clean: 1521
OK: ETKR81FL.DTA rows raw: 5753 rows clean: 550
Skip: KEKR01FL.DTA No nutrition z-score columns found in KEKR01FL.DTA
Skip: KEKR31FL.DTA No nutrition z-score columns found in KEKR31FL.DTA
Skip: KEKR3AFL.DTA No nutrition z-score columns found in KEKR3AFL.DTA
Skip: KEKR42FL.DTA No nutrition z-score columns found in KEKR42FL.DTA
OK: KEKR52FL.DTA rows raw: 6079 rows clean: 591
OK: KEKR72FL.DTA rows raw: 20964 rows clean: 2077
OK: KEKR8CFL.DTA rows raw: 19530 rows clean: 2133
Skip: TZKR21FL.DTA No nutrition z-score columns found in TZKR21FL.DTA
Skip: TZKR3AFL.DTA No nutrition z-score columns found in TZKR3AFL.DTA
Skip: TZKR41FL.DTA No nutrition z-score columns found in TZKR41FL.DTA
Skip: TZKR4IFL.DTA No nutrition z-score columns found in TZKR4IFL.DTA
OK: 

[('ETKR41FL.DTA', 'No nutrition z-score columns found in ETKR41FL.DTA'),
 ('ETKR51FL.DTA', 'No nutrition z-score columns found in ETKR51FL.DTA'),
 ('KEKR01FL.DTA', 'No nutrition z-score columns found in KEKR01FL.DTA'),
 ('KEKR31FL.DTA', 'No nutrition z-score columns found in KEKR31FL.DTA'),
 ('KEKR3AFL.DTA', 'No nutrition z-score columns found in KEKR3AFL.DTA')]

In [12]:
clean_parts = []
for cpath in sorted(CLEAN_TIDY.glob("*_clean.csv")):
    try:
        clean_parts.append(pd.read_csv(cpath))
    except Exception as e:
        print("Cannot read cleaned file:", cpath.name, e)

panel = pd.concat(clean_parts, ignore_index=True) if clean_parts else pd.DataFrame()
print("Combined rows:", len(panel), "Columns:", list(panel.columns))

# optional filter to your study window
panel_2010_2024 = panel.copy()
if "year" in panel_2010_2024:
    panel_2010_2024 = panel_2010_2024[panel_2010_2024["year"].between(2010, 2024, inclusive="both")]

panel_path_all = CLEAN / "dhs_children_all_rounds_clean.csv"
panel_path_win = CLEAN / "dhs_children_2010_2024_clean.csv"

panel.to_csv(panel_path_all, index=False)
panel_2010_2024.to_csv(panel_path_win, index=False)

panel_path_all, panel_path_win, panel_2010_2024.groupby(["country","year"]).size().head(10)


Combined rows: 38582 Columns: ['country', 'year', 'weight', 'sex', 'height_for_age_z', 'weight_for_height_z', 'weight_for_age_z', 'mother_edu', 'wealth_quintile', 'residence', 'region', 'weight_raw', 'age_months']


(WindowsPath('../data_cleaned/dhs_children_all_rounds_clean.csv'),
 WindowsPath('../data_cleaned/dhs_children_2010_2024_clean.csv'),
 country   year
 Ethiopia  2011      550
 Kenya     2014     2077
           2022     2133
 Tanzania  2010      759
           2015      888
           2016      281
           2022     5925
 Uganda    2011     5665
           2016    11067
 dtype: int64)

In [14]:
from pathlib import Path
import pandas as pd
import numpy as np

REPO  = Path("..")
CLEAN = REPO / "data_cleaned"

panel_path_all = CLEAN / "dhs_children_all_rounds_clean.csv"
panel_path_win = CLEAN / "dhs_children_2010_2024_clean.csv"

# prefer your study window if it exists
use_path = panel_path_win if panel_path_win.exists() else panel_path_all
df = pd.read_csv(use_path)

df.head(3), use_path, df.shape


(    country  year    weight  sex  height_for_age_z  weight_for_height_z  \
 0  Ethiopia  2011  1.543343    1               NaN                  NaN   
 1  Ethiopia  2011  1.543343    2               NaN                  NaN   
 2  Ethiopia  2011  1.543343    1               NaN                  NaN   
 
    weight_for_age_z mother_edu wealth_quintile residence  region  weight_raw  \
 0               NaN     higher         richest     urban       1     1543343   
 1               NaN  secondary         richest     urban       1     1543343   
 2               NaN       none         richest     urban       1     1543343   
 
    age_months  
 0         9.0  
 1        40.0  
 2        10.0  ,
 WindowsPath('../data_cleaned/dhs_children_2010_2024_clean.csv'),
 (29345, 13))