In [1]:
%matplotlib inline
import warnings
warnings.simplefilter('ignore')

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from tqdm import tqdm

# Data directories, relative to the notebook's working directory.
# DATA_DIR receives the transformed outputs written further down;
# MWI_DATA_DIR holds the raw Malawi CSV/JSON inputs that are read in.
DATA_DIR = os.path.join('..', 'data')
MWI_DATA_DIR = os.path.join('..', '..', 'data', 'raw_mwi')

def load_data(country_code, data_part='train'):
    '''
    Read the aligned household and individual CSV files for one data split.

    country_code: prefix of the file names (e.g. 'mwi').
    data_part: split suffix of the file names (e.g. 'train' or 'test').

    Returns a (household, individual) tuple of DataFrames. The household
    frame is indexed by 'id'; the individual frame by ('id', 'iid').
    Files are always looked up in MWI_DATA_DIR.
    '''
    hhold_name = '{}_aligned_hhold_{}.csv'.format(country_code, data_part)
    indiv_name = '{}_aligned_indiv_{}.csv'.format(country_code, data_part)

    hhold_path = os.path.join(MWI_DATA_DIR, hhold_name)
    indiv_path = os.path.join(MWI_DATA_DIR, indiv_name)

    hhold_frame = pd.read_csv(hhold_path, index_col='id', low_memory=False)
    indiv_frame = pd.read_csv(indiv_path, index_col=['id', 'iid'], low_memory=False)

    return hhold_frame, indiv_frame

# Review household data questions

In [2]:
# Load the column -> survey question mapping as a Series keyed by column name.
# 'series' is the documented value for `typ`; the previous 'ser' only worked
# because read_json falls back to the Series parser for any non-'frame' value.
mwi_hhold_questions = pd.read_json(
    os.path.join(MWI_DATA_DIR, 'mwi_hhold_questions.json'), typ='series'
)

In [3]:
# List every household column together with its survey question text.
for column, question in mwi_hhold_questions.items():
    print(f"{column}: {question}")

com_bank: Is there a commercial bank in this community (NBM, Savings Bank, Stanbic, etc.)?
com_bus: Do public buses, mini-buses, or regular matola stop in this community?
com_classrooms: At nearest gov prim school,are all classrooms built of brick w/iron sheet roofs?
com_clinic: Is there a health clinic (Chipatala) is this community?
com_dailymrkt: Is there a daily market in this community?
com_distclinic: Distance to nearest health clinic
com_distprimary: Distance to nearest government primary school
com_medicines: Is there a place here to purchase common medicines(pain killers,malaria tablets)
com_postoffice: Is there a post office in this community
com_publicphone: Is there a place to make a telephone call here?(Public phone,phone bureau,etc.)
com_roadtype: What is the type of main access road surface in this community?
com_schoolelec: Is the nearest government primary school electrified?
com_urbancenter: Is the community in a major urban centre?
com_vehicles: Do vehicles pass on th

In [4]:
# Load the Malawi training split (data_part defaults to 'train').
mwi_hhold, mwi_indiv = load_data('mwi')

In [5]:
# Question text for the first five household columns.
mwi_hhold_questions[mwi_hhold.columns[:5]]

com_bank          Is there a commercial bank in this community (...
com_bus           Do public buses, mini-buses, or regular matola...
com_classrooms    At nearest gov prim school,are all classrooms ...
com_clinic        Is there a health clinic (Chipatala) is this c...
com_dailymrkt            Is there a daily market in this community?
dtype: object

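Process to generate indiv_cat_train:
    1. Take only the categorical (object-typed) features
    2. One-hot-encode the features
    3. Summarize the encoded features per household (grouped by `id`) using:
        - mean
        - median
        - all
        - any
    4. Add `indiv_count`, a per-household count of individuals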
        
Process to generate hhold_train:
    1. Take numeric and categorical data
    2. For numeric, fill missing values with the training-set mean, then transform data using:
        - MinMax scaler: output columns are prefixed with mx_
        - Standard scaler: output columns are prefixed with sc_
    3. For categorical, encode data:
        - Use label encoding
        - Use the label encoded data to perform one-hot-encoding
        - Retain the label encoding

In [6]:
def indiv_vectorize_object_columns(train_data, test_data, agg_type=['mean', 'median', 'any', 'all']):
    '''
    Aggregate individual-level categorical columns into household-level features.

    Every object-typed column of train_data (after dropping 'country') is
    one-hot-encoded, and the indicator columns are summarised per household
    (rows grouped by 'id') with each requested aggregation. Only indicator
    columns present in both train and test are kept, so both outputs share
    the same feature columns. Features are named '<column>_<agg>_<category>'.

    train_data, test_data: DataFrames with a 'country' column and an index
        whose first level is named 'id'.
    agg_type: aggregation name or iterable of names, drawn from
        ['mean', 'median', 'any', 'all']. The default list is never mutated.

    Returns (train_features, test_features), both indexed by 'id', each with
    an extra 'indiv_count' column holding the number of rows per household.

    Raises ValueError if agg_type contains an unsupported name.
    '''
    aggregators = {
        'mean': lambda grouped: grouped.mean(),
        'median': lambda grouped: grouped.median(),
        'any': lambda grouped: grouped.any().astype(int),
        'all': lambda grouped: grouped.all().astype(int),
    }

    if isinstance(agg_type, str):
        agg_type = [agg_type]

    # Fail early: an unknown name used to reuse the previous aggregation's
    # result under a wrong column name (or raise NameError if it came first).
    unknown = [at for at in agg_type if at not in aggregators]
    if unknown:
        raise ValueError(
            'Unsupported agg_type {}; expected a subset of {}'.format(unknown, sorted(aggregators))
        )

    train_data = train_data.drop('country', axis=1)
    test_data = test_data.drop('country', axis=1)

    train_obj_data = train_data.select_dtypes(include=['object'])
    test_obj_data = test_data[train_obj_data.columns]

    # Collect the pieces and concatenate once at the end instead of growing
    # the output frame inside the loop.
    train_parts = []
    test_parts = []

    for col in train_obj_data.columns:
        # One indicator column per category, grouped by household id.
        train_group = pd.get_dummies(train_obj_data[col]).reset_index(0).groupby('id')
        test_group = pd.get_dummies(test_obj_data[col]).reset_index(0).groupby('id')

        for at in agg_type:
            aggregate = aggregators[at]
            train_vec_feat = aggregate(train_group)
            test_vec_feat = aggregate(test_group)

            # Keep only categories observed in both splits.
            common_cols = train_vec_feat.columns.intersection(test_vec_feat.columns)

            train_vec_feat = train_vec_feat[common_cols]
            test_vec_feat = test_vec_feat[common_cols]

            train_vec_feat.columns = [f'{col}_{at}_{cname}' for cname in train_vec_feat.columns]
            test_vec_feat.columns = [f'{col}_{at}_{cname}' for cname in test_vec_feat.columns]

            train_parts.append(train_vec_feat)
            test_parts.append(test_vec_feat)

    train_processed_data = pd.concat(train_parts, axis=1) if train_parts else pd.DataFrame()
    test_processed_data = pd.concat(test_parts, axis=1) if test_parts else pd.DataFrame()

    # Household size is the number of rows per id. The previous
    # count().max(axis=1) counted non-null values only, so it undercounted
    # whenever every column had a missing value for some household member.
    train_processed_data['indiv_count'] = train_data.reset_index(0).groupby('id').size()
    test_processed_data['indiv_count'] = test_data.reset_index(0).groupby('id').size()

    return train_processed_data, test_processed_data


In [7]:
def transform_categorical(train, test):
    '''
    Impute, scale and encode household features for a train/test pair.

    Numeric columns (int64/float64 in train) have missing values filled
    with the training mean in both frames and are kept in place; a
    StandardScaler copy ('sc_<col>') and a MinMaxScaler copy ('mx_<col>'),
    both fitted on train, are appended.

    Object columns of train are filled with 'N/A', cast to str and
    label-encoded in place with an encoder fitted on the union of train and
    test values. One-hot columns derived from those labels are appended,
    restricted to the columns present in both frames so that train and test
    end up with the same one-hot feature set.

    Other dtypes (e.g. a bool target) are passed through untouched.

    train, test: DataFrames; test must contain every numeric and object
        column of train.

    Returns (train, test) as new DataFrames; the inputs are not modified.
    '''
    train = train.copy()
    test = test.copy()

    # Target is of bool type so it will not be transformed.
    numeric = train.select_dtypes(include=['int64', 'float64'])
    numeric_cols = list(numeric.columns)

    train_extra = []
    test_extra = []

    if numeric_cols:
        # Impute with training means so no test statistics leak in.
        numeric_fill = numeric.mean()
        numeric = numeric.fillna(numeric_fill)

        train[numeric_cols] = numeric
        test[numeric_cols] = test[numeric_cols].fillna(numeric_fill)

        for prefix, scaler in (('sc', StandardScaler()), ('mx', MinMaxScaler())):
            scaled_names = ['{}_{}'.format(prefix, i) for i in numeric_cols]
            train_extra.append(pd.DataFrame(
                scaler.fit_transform(numeric),
                columns=scaled_names,
                index=train.index
            ))
            test_extra.append(pd.DataFrame(
                scaler.transform(test[numeric_cols]),
                columns=scaled_names,
                index=test.index
            ))

    cat_cols = []

    # Walk the columns in frame order (not via a set) so the order of the
    # generated one-hot columns is the same on every run.
    for col in tqdm(train.columns):
        if train[col].dtype == 'object':
            train[col] = train[col].fillna('N/A')
            test[col] = test[col].fillna('N/A')

            train[col] = train[col].apply(str)
            test[col] = test[col].apply(str)

            # Fit on train + test values so transform never meets an unseen label.
            le = LabelEncoder()
            train_vals = list(train[col].unique())
            test_vals = list(test[col].unique())
            le.fit(train_vals + test_vals)
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

            cat_cols.append(col)

    if cat_cols:
        train_ohe = pd.get_dummies(train[cat_cols].astype(str))
        test_ohe = pd.get_dummies(test[cat_cols].astype(str))

        # Keep only indicator columns present in both splits; previously the
        # intersection was computed but never applied, leaving train and test
        # with different one-hot columns.
        ohe_common = train_ohe.columns.intersection(test_ohe.columns)

        train_extra.append(train_ohe[ohe_common])
        test_extra.append(test_ohe[ohe_common])

    train = pd.concat([train] + train_extra, axis=1)
    test = pd.concat([test] + test_extra, axis=1)

    return train, test

# Transform dataset

In [8]:
# Build the individual- and household-level feature tables for each country
# and write them to DATA_DIR.
for country_code in ['mwi']:
    print('Processing country {} data...'.format(country_code))
    hhold_train, indiv_train = load_data(country_code, data_part='train')
    hhold_test, indiv_test = load_data(country_code, data_part='test')

    indiv_cat_train, indiv_cat_test = indiv_vectorize_object_columns(indiv_train, indiv_test)

    # `key` is passed by keyword: positional use of it in to_hdf is deprecated
    # in recent pandas releases and keyword-only afterwards.
    indiv_cat_train.to_hdf(
        os.path.join(DATA_DIR, 'indiv_cat_train.hdf'),
        key='{}_indiv_cat_train'.format(country_code)
    )
    indiv_cat_test.to_hdf(
        os.path.join(DATA_DIR, 'indiv_cat_test.hdf'),
        key='{}_indiv_cat_test'.format(country_code)
    )

    hh_train, hh_test = transform_categorical(hhold_train, hhold_test)

    hh_train.to_csv(os.path.join(DATA_DIR, '{}-hhold-transformed-train.csv'.format(country_code)))
    hh_test.to_csv(os.path.join(DATA_DIR, '{}-hhold-transformed-test.csv'.format(country_code)))

Processing country mwi data...


100%|██████████| 345/345 [00:03<00:00, 97.48it/s]


In [9]:
# Preview the aggregated individual-level features left over from the loop above.
indiv_cat_train.head()

Unnamed: 0_level_0,ind_birthattend_mean_Doctor or clin. officer,ind_birthattend_mean_Friend or relative,ind_birthattend_mean_Midwife,ind_birthattend_mean_Nurse,ind_birthattend_mean_Self,ind_birthattend_mean_TBA,ind_birthattend_median_Doctor or clin. officer,ind_birthattend_median_Friend or relative,ind_birthattend_median_Midwife,ind_birthattend_median_Nurse,...,ind_work5_all_State-Owned Enterprise (Parastatal),ind_work6_mean_No,ind_work6_mean_Yes,ind_work6_median_No,ind_work6_median_Yes,ind_work6_any_No,ind_work6_any_Yes,ind_work6_all_No,ind_work6_all_Yes,indiv_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101010160009,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.666667,0.0,1.0,0.0,1,0,0,0,6
101010160068,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.666667,0.111111,1.0,0.0,1,1,0,0,9
101010160069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,1.0,0.0,1.0,0,1,0,1,1
101010160070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.363636,0.636364,0.0,1.0,1,1,0,0,11
101010160074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.5,0.3,0.5,0.0,1,1,0,0,10
