In [1]:
from pathlib import Path
import pandas as pd

# Project layout, expressed relative to the notebooks/ folder this notebook
# is assumed to be run from.
REPO = Path("..")
RAW_DTA = REPO.joinpath("data_raw", "DHS_dta")
CLEAN = REPO.joinpath("data_cleaned")

# Report, for each expected folder, whether it exists and where it points.
for label, folder in (("DTA folder exists:", RAW_DTA), ("Clean folder exists:", CLEAN)):
    print(label, folder.exists(), "| path:", folder)


DTA folder exists: True | path: ..\data_raw\DHS_dta
Clean folder exists: True | path: ..\data_cleaned


In [7]:
from pathlib import Path
import pandas as pd

# locate the DHS files
REPO = Path("..")
RAW_DTA = REPO / "data_raw" / "DHS_dta"

# List all Stata files in the raw folder, sorted by path.
# The extension is compared case-insensitively: glob("*.DTA") is case-sensitive
# on POSIX filesystems, so the same data folder could yield a different file
# list on Linux/macOS than on Windows if any file is named "*.dta".
files = sorted(p for p in RAW_DTA.glob("*") if p.suffix.lower() == ".dta")
print("Total DHS files found:", len(files))
files[:5]


Total DHS files found: 25


[WindowsPath('../data_raw/DHS_dta/ETKR41FL.DTA'),
 WindowsPath('../data_raw/DHS_dta/ETKR51FL.DTA'),
 WindowsPath('../data_raw/DHS_dta/ETKR61FL.DTA'),
 WindowsPath('../data_raw/DHS_dta/ETKR71FL.DTA'),
 WindowsPath('../data_raw/DHS_dta/ETKR81FL.DTA')]

In [9]:
# Rebuild the sorted list of Stata files and show the count with the first five.
# The extension is compared case-insensitively because glob("*.DTA") is
# case-sensitive on POSIX filesystems and would skip files named "*.dta" there.
files = sorted(p for p in RAW_DTA.glob("*") if p.suffix.lower() == ".dta")
len(files), files[:5]


(25,
 [WindowsPath('../data_raw/DHS_dta/ETKR41FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR51FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR61FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR71FL.DTA'),
  WindowsPath('../data_raw/DHS_dta/ETKR81FL.DTA')])

In [13]:
# pick one file to test
# Stop with an explicit message when the listing is empty, instead of letting
# files[0] fail with an uninformative IndexError.
if not files:
    raise FileNotFoundError(
        "No DHS .DTA files were found; check the raw data folder before continuing."
    )
test_file = files[0]   # first entry of the sorted file list
test_file


WindowsPath('../data_raw/DHS_dta/ETKR41FL.DTA')

In [15]:
# Read the chosen Stata file without converting coded values to categoricals,
# and lower-case every column name in the same step.
df = pd.read_stata(test_file, convert_categoricals=False).rename(columns=str.lower)

# One-line size summary for the file that was just read.
n_rows, n_cols = df.shape
print(test_file.name, "| rows:", n_rows, "| cols:", n_cols)

# Quick peek at the first 40 column names.
list(df.columns[:40])


ETKR41FL.DTA | rows: 10873 | cols: 881


['caseid',
 'v000',
 'v001',
 'v002',
 'v003',
 'v004',
 'v005',
 'v006',
 'v007',
 'v008',
 'v009',
 'v010',
 'v011',
 'v012',
 'v013',
 'v014',
 'v015',
 'v016',
 'v017',
 'v018',
 'v019',
 'v019a',
 'v020',
 'v021',
 'v022',
 'v023',
 'v024',
 'v025',
 'v026',
 'v027',
 'v028',
 'v029',
 'v030',
 'v031',
 'v032',
 'v033',
 'v034',
 'v040',
 'v042',
 'v043']

In [17]:
# Total number of columns, together with the last ten column names.
df.shape[1], df.columns[-10:]


(881,
 Index(['s406ba', 's406bb', 's406bc', 's406bd', 's406be', 's406bx', 's432c',
        's440a', 's442a', 's475c'],
       dtype='object'))

In [19]:
# Collect whichever of the hw70/71/72 or hc70/71/72 columns are present in df,
# keeping them in the order they appear in the frame.
wanted = {"hw70", "hw71", "hw72", "hc70", "hc71", "hc72"}
zscore_hits = df.columns[df.columns.isin(wanted)].tolist()
zscore_hits


[]

In [21]:
# Map each listed variable name to True/False depending on whether df has a
# column with that name; keys keep the order of the list below.
basic_vars = ["b19", "b4", "v106", "v190", "v025", "v024", "v007", "v005"]
present = set(df.columns)
hits_basic = dict(zip(basic_vars, (var in present for var in basic_vars)))
hits_basic


{'b19': False,
 'b4': True,
 'v106': True,
 'v190': False,
 'v025': True,
 'v024': True,
 'v007': True,
 'v005': True}