-
Notifications
You must be signed in to change notification settings - Fork 13
/
dhs.py
151 lines (139 loc) · 5.14 KB
/
dhs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# AUTOGENERATED! DO NOT EDIT! File to edit: ../notebooks/04_dhs.ipynb.
# %% auto 0
__all__ = ['load_column_config', 'load_dhs_file', 'apply_threshold', 'assign_wealth_index']
# %% ../notebooks/04_dhs.ipynb 5
from typing import List
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.decomposition import PCA
# %% ../notebooks/04_dhs.ipynb 6
def _column_config(
    wealth_label: str = "wealth index factor score combined (5 decimals)",
    water_label: str = "source of drinking water",
) -> dict:
    """Build a fresh mapping from DHS variable labels to short column names.

    Only the wealth-index label and the drinking-water label vary between
    the supported surveys, so those are the two parameters. Every call
    returns a new dict whose keys are always in the same order.
    """
    return {
        "cluster number": "DHSCLUST",
        wealth_label: "Wealth Index",
        "country code and phase": "country code and phase",
        "number of rooms used for sleeping": "rooms",
        "has electricity": "electric",
        "has mobile telephone": "mobile telephone",
        "has radio": "radio",
        "has television": "television",
        "has car/truck": "car/truck",
        "has refrigerator": "refrigerator",
        "has motorcycle/scooter": "motorcycle",
        "main floor material": "floor",
        "type of toilet facility": "toilet",
        water_label: "drinking water",
    }


# Philippines: default labels.
PH_COLUMN_CONFIG = _column_config()

# Cambodia: the wealth score label has no "combined" and the water-source
# label carries an "na - " prefix.
KH_COLUMN_CONFIG = _column_config(
    wealth_label="wealth index factor score (5 decimals)",
    water_label="na - source of drinking water",
)

# Myanmar: default labels.
MM_COLUMN_CONFIG = _column_config()

# East Timor: default labels.
TL_COLUMN_CONFIG = _column_config()

# Lookup table keyed by lower-case 2-letter country code.
COLUMN_CONFIG = dict(
    ph=PH_COLUMN_CONFIG,
    kh=KH_COLUMN_CONFIG,
    mm=MM_COLUMN_CONFIG,
    tl=TL_COLUMN_CONFIG,
)
# %% ../notebooks/04_dhs.ipynb 7
def load_column_config(
    country: str,  # 2 letter character representing the country
) -> dict:
    """Get the predefined column mapping for a supported country.

    The following countries are supported:
    - `ph` Philippines
    - `tl` East Timor
    - `mm` Myanmar
    - `kh` Cambodia

    The code is matched case-insensitively and surrounding whitespace is
    ignored, so `"PH"` and `" ph "` resolve exactly like `"ph"`.

    Returns the module-level mapping itself (not a copy).
    Raises `ValueError` when the country is not one of the supported codes.
    """
    # Only genuine strings are normalised; any other value is looked up
    # unchanged and ends in the ValueError below if it is not a known key.
    key = country.strip().lower() if isinstance(country, str) else country
    try:
        return COLUMN_CONFIG[key]
    except KeyError:
        raise ValueError(
            f"Not a valid country. Valid countries are {list(COLUMN_CONFIG.keys())}"
        ) from None
# %% ../notebooks/04_dhs.ipynb 8
def load_dhs_file(
    household_data: str,  # str or pathlike object to the household data
) -> DataFrame:
    """Load a Stata household file and rename its columns to their variable labels.

    The file is read with `convert_categoricals=False`, so coded values are
    returned as stored rather than replaced by their value labels.

    Returns a DataFrame whose column names are the Stata variable labels.
    """
    # Every use of the reader happens inside the `with` block so the file
    # handle is released even if reading the labels or the data fails.
    with pd.read_stata(
        household_data, convert_categoricals=False, iterator=True
    ) as reader:
        labels = reader.variable_labels()
        frame = reader.read()
    # NOTE(review): a variable with an empty label would presumably be renamed
    # to "" here - confirm whether the input files can contain such variables.
    return frame.rename(columns=labels)
# %% ../notebooks/04_dhs.ipynb 9
def apply_threshold(
    df: DataFrame,  # Dataframe
    columns: List[str],  # List of columns to apply the threshold
    config: dict,  # Config containing the min and max of each columns
) -> DataFrame:
    """Clip the listed columns to configured bounds and return a new DataFrame.

    `config` maps a column name to the positional arguments passed to
    `Series.clip` (lower bound first, then upper bound). A column without its
    own entry uses the `"_default"` entry when one exists; a column with
    neither is left unchanged. The input `df` is not modified.
    """
    clipped = df.copy()
    for col in columns:
        # A column-specific entry wins over the shared "_default" entry.
        key = col if col in config else "_default"
        if key not in config:
            continue
        clipped[col] = clipped[col].clip(*config[key])
    return clipped
# %% ../notebooks/04_dhs.ipynb 10
def assign_wealth_index(
    asset_df: DataFrame,  # Dataframe containing only the features to apply wealth index
    use_pca=True,  # if calculating wealth index should be done via PCA or via Singular Value Decomposition
):
    """Score every row of `asset_df` along the first principal direction of its features.

    With `use_pca=True` (the default) a one-component `PCA` is fitted on the
    raw values and the raw rows are projected onto its component.

    With `use_pca=False` each feature column is mean-centred, the leading left
    singular vector of the (features x rows) matrix is used as the direction,
    and the projection is multiplied by the leading singular value. The sign
    of a singular vector is arbitrary, so the sign of this score may flip
    between inputs.

    Returns one score per row of `asset_df`. The input is not modified.
    """
    if use_pca:
        pca = PCA(1)
        pca.fit(asset_df.values)
        return np.matmul(asset_df, pca.components_.T).squeeze()

    # Centre each feature (column), not each row: a principal direction is
    # defined on feature-centred data.
    centered = asset_df - asset_df.mean(axis=0)
    u, s, _ = np.linalg.svd(centered.values.T, full_matrices=False)
    # Singular vectors are the COLUMNS of u, so the leading one is u[:, 0];
    # u[0] would be a row, with the wrong length when rows < features.
    pc1 = u[:, 0]
    return s[0] * np.matmul(centered, pc1)