In [52]:
# basic setup: i want to rename variable codes inside the renamed .dta files

import pandas as pd
from pathlib import Path

# path where your renamed .dta files live
unzipped = Path(r"C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_raw\unzipped")
print("path set:", unzipped)


path set: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_raw\unzipped


In [53]:
rename_map = {
    "v000": "survey_code",
    "v001": "cluster",
    "v002": "household",
    "v003": "mother_line",
    "v004": "sample_strata",
    "v005": "weight_raw",
    "v006": "month_interview",
    "v007": "year_interview",
    "v008": "century_month_code",
    "b4": "child_sex",
    "b5": "child_alive",
    "hw1": "child_age_months",
    "hw2": "child_weight_kg",
    "hw3": "child_height_cm",
    "hw70": "z_height_for_age",
    "hw71": "z_weight_for_height",
    "hw72": "z_weight_for_age",
    "v437": "mother_weight_raw",
    "v438": "mother_height_raw",
    "v445": "mother_bmi_x100"
}


In [54]:
# read the stata file but keep raw codes
test_file = unzipped / "ETKR71DT" / "Ethiopia_KR_2016.dta"

df = pd.read_stata(test_file, convert_categoricals=False)
print("rows, cols:", df.shape)
df.head(3)



rows, cols: (10641, 1251)


Unnamed: 0,caseid,midx,v000,v001,v002,v003,v004,v005,v006,v007,...,sd508nc,sm508nc,sd508ra,sm508ra,sd508rb,sm508rb,sd508ma,sm508ma,sd508va,sm508va
0,1 17 2,1,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,
1,1 17 2,2,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,
2,1 17 2,3,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,


In [55]:
# same rename map I already defined
df_ren = df.rename(columns=rename_map).copy()
print(df_ren.columns[:20].tolist())
df_ren.head(3)


['caseid', 'midx', 'survey_code', 'cluster', 'household', 'mother_line', 'sample_strata', 'weight_raw', 'month_interview', 'year_interview', 'century_month_code', 'v008a', 'v009', 'v010', 'v011', 'v012', 'v013', 'v014', 'v015', 'v016']


Unnamed: 0,caseid,midx,survey_code,cluster,household,mother_line,sample_strata,weight_raw,month_interview,year_interview,...,sd508nc,sm508nc,sd508ra,sm508ra,sd508rb,sm508rb,sd508ma,sm508ma,sd508va,sm508va
0,1 17 2,1,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,
1,1 17 2,2,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,
2,1 17 2,3,ET7,1,17,2,1,5087433,9,2008,...,,,,,,,,,,


In [56]:
# make weights usable if present
if "weight_raw" in df_ren.columns:
    df_ren["weight"] = df_ren["weight_raw"] / 1_000_000

keep_cols = [
    "case_id","survey_code","cluster","household","mother_line",
    "year_interview","month_interview",
    "child_sex","child_alive",
    "child_age_months","child_weight_kg","child_height_cm",
    "z_height_for_age","z_weight_for_height","z_weight_for_age",
    "weight"
]

cleaned_dir = Path(r"C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned")
cleaned_dir.mkdir(parents=True, exist_ok=True)

preview = df_ren[[c for c in keep_cols if c in df_ren.columns]].copy()
out_path = cleaned_dir / "Ethiopia_KR_2016_cleaned.parquet"
preview.to_parquet(out_path, index=False)
print("saved:", out_path)
preview.head(3)


saved: C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned\Ethiopia_KR_2016_cleaned.parquet


Unnamed: 0,survey_code,cluster,household,mother_line,year_interview,month_interview,child_sex,child_alive,child_age_months,child_weight_kg,child_height_cm,z_height_for_age,z_weight_for_height,z_weight_for_age,weight
0,ET7,1,17,2,2008,9,1,1,31.0,104.0,842.0,-250.0,-224.0,-122.0,5.087433
1,ET7,1,17,2,2008,9,1,1,44.0,147.0,1022.0,23.0,-55.0,-102.0,5.087433
2,ET7,1,17,2,2008,9,2,1,55.0,195.0,1180.0,243.0,78.0,-124.0,5.087433


In [57]:
# loop through every renamed .dta and do the same thing
made = 0
errors = []

for folder in unzipped.iterdir():
    if not folder.is_dir():
        continue
    for f in folder.glob("*.dta"):
        try:
            tmp = pd.read_stata(f, convert_categoricals=False).rename(columns=rename_map)
            if "weight_raw" in tmp.columns:
                tmp["weight"] = tmp["weight_raw"] / 1_000_000

            small = tmp[[c for c in keep_cols if c in tmp.columns]].copy()
            out = cleaned_dir / f"{f.stem}_cleaned.parquet"
            small.to_parquet(out, index=False)
            made += 1
            print("cleaned:", f.name, "→", out.name, "rows:", len(small))
        except Exception as e:
            print("skipped:", f.name, "reason:", e)
            errors.append((f.name, str(e)))

print("done. files made:", made)
if errors:
    print("had issues with:", errors[:3])


cleaned: Ethiopia_KR_2005.dta → Ethiopia_KR_2005_cleaned.parquet rows: 10873
cleaned: Ethiopia_KR_51.dta → Ethiopia_KR_51_cleaned.parquet rows: 9861
cleaned: Ethiopia_KR_2011.dta → Ethiopia_KR_2011_cleaned.parquet rows: 11654
cleaned: Ethiopia_KR_2016.dta → Ethiopia_KR_2016_cleaned.parquet rows: 10641
cleaned: Ethiopia_KR_2021.dta → Ethiopia_KR_2021_cleaned.parquet rows: 5753
cleaned: Kenya_KR_1988.dta → Kenya_KR_1988_cleaned.parquet rows: 6980
cleaned: Kenya_KR_1998.dta → Kenya_KR_1998_cleaned.parquet rows: 6115
cleaned: Kenya_KR_1999.dta → Kenya_KR_1999_cleaned.parquet rows: 3531
cleaned: Kenya_KR_2008.dta → Kenya_KR_2008_cleaned.parquet rows: 5949
cleaned: Kenya_KR_2010.dta → Kenya_KR_2010_cleaned.parquet rows: 6079
cleaned: Kenya_KR_72.dta → Kenya_KR_72_cleaned.parquet rows: 20964
cleaned: Kenya_KR_2022.dta → Kenya_KR_2022_cleaned.parquet rows: 19530
cleaned: Tanzania_KR_1996.dta → Tanzania_KR_1996_cleaned.parquet rows: 8138
cleaned: Tanzania_KR_1999.dta → Tanzania_KR_1999_cleaned.

In [58]:
# if a column name repeats, this will add _1, _2, etc. to later duplicates
def make_unique(cols):
    seen = {}
    out = []
    for c in cols:
        if c in seen:
            seen[c] += 1
            out.append(f"{c}_{seen[c]}")
        else:
            seen[c] = 0
            out.append(c)
    return out


In [59]:
from pathlib import Path
import pandas as pd

unzipped = Path(r"C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_raw\unzipped")
cleaned_dir = Path(r"C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned")
cleaned_dir.mkdir(parents=True, exist_ok=True)

keep_cols = [
    "case_id","survey_code","cluster","household","mother_line",
    "year_interview","month_interview",
    "child_sex","child_alive",
    "child_age_months","child_weight_kg","child_height_cm",
    "z_height_for_age","z_weight_for_height","z_weight_for_age",
    "weight_raw","weight"  # weight gets created below
]

made = 0
errors = []

for folder in unzipped.iterdir():
    if not folder.is_dir():
        continue
    for f in folder.glob("*.dta"):
        try:
            # 1) read with raw codes
            tmp = pd.read_stata(f, convert_categoricals=False)

            # 2) rename variables
            tmp = tmp.rename(columns=rename_map)

            # 3) fix duplicate column names if any
            if tmp.columns.duplicated().any():
                tmp.columns = make_unique(list(tmp.columns))

            # 4) weights
            if "weight_raw" in tmp.columns and "weight" not in tmp.columns:
                tmp["weight"] = tmp["weight_raw"] / 1_000_000

            # 5) keep a light subset
            cols = [c for c in keep_cols if c in tmp.columns]
            small = tmp[cols].copy()

            # 6) save
            out = cleaned_dir / f"{f.stem}_cleaned.parquet"
            small.to_parquet(out, index=False)
            made += 1
            print("cleaned:", f.name, "→", out.name, "rows:", len(small))
        except Exception as e:
            print("skipped:", f.name, "reason:", e)
            errors.append((f.name, str(e)))

print("done. files made:", made)
if errors:
    print("had issues with:", errors[:5])


cleaned: Ethiopia_KR_2005.dta → Ethiopia_KR_2005_cleaned.parquet rows: 10873
cleaned: Ethiopia_KR_51.dta → Ethiopia_KR_51_cleaned.parquet rows: 9861
cleaned: Ethiopia_KR_2011.dta → Ethiopia_KR_2011_cleaned.parquet rows: 11654
cleaned: Ethiopia_KR_2016.dta → Ethiopia_KR_2016_cleaned.parquet rows: 10641
cleaned: Ethiopia_KR_2021.dta → Ethiopia_KR_2021_cleaned.parquet rows: 5753
cleaned: Kenya_KR_1988.dta → Kenya_KR_1988_cleaned.parquet rows: 6980
cleaned: Kenya_KR_1998.dta → Kenya_KR_1998_cleaned.parquet rows: 6115
cleaned: Kenya_KR_1999.dta → Kenya_KR_1999_cleaned.parquet rows: 3531
cleaned: Kenya_KR_2008.dta → Kenya_KR_2008_cleaned.parquet rows: 5949
cleaned: Kenya_KR_2010.dta → Kenya_KR_2010_cleaned.parquet rows: 6079
cleaned: Kenya_KR_72.dta → Kenya_KR_72_cleaned.parquet rows: 20964
cleaned: Kenya_KR_2022.dta → Kenya_KR_2022_cleaned.parquet rows: 19530
cleaned: Tanzania_KR_1996.dta → Tanzania_KR_1996_cleaned.parquet rows: 8138
cleaned: Tanzania_KR_1999.dta → Tanzania_KR_1999_cleaned.

In [60]:
import pandas as pd
from pathlib import Path

cleaned_dir = Path(r"C:\Users\venus\OneDrive\Documents\GitHub\Climate-Malnutrition-EastAfrica\data_cleaned")
made = sorted(cleaned_dir.glob("*_cleaned.parquet"))
len(made), [m.name for m in made[:8]]


(25,
 ['Ethiopia_KR_2005_cleaned.parquet',
  'Ethiopia_KR_2011_cleaned.parquet',
  'Ethiopia_KR_2016_cleaned.parquet',
  'Ethiopia_KR_2021_cleaned.parquet',
  'Ethiopia_KR_51_cleaned.parquet',
  'Kenya_KR_1988_cleaned.parquet',
  'Kenya_KR_1998_cleaned.parquet',
  'Kenya_KR_1999_cleaned.parquet'])