In [1]:
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Import the project's aggregation module directly from its source path.
# The `imp` module was deprecated in Python 3.4 and removed in 3.12, so the
# loader is built with importlib instead.
import importlib.util
import sys

_agg_spec = importlib.util.spec_from_file_location('aggregation', '../../aggregation/aggregation.py')
aggregation = importlib.util.module_from_spec(_agg_spec)
# Register the module under its name before executing it (imp.load_source did
# the same); pickle resolves objects defined in the module through sys.modules.
sys.modules['aggregation'] = aggregation
_agg_spec.loader.exec_module(aggregation)

In [2]:
# Read the pickled individual-level and household-level frames from the parent directory.
# NOTE: pickle.load can execute arbitrary code, so only point this at trusted files.
with open('../indiv.pickle', 'rb') as indiv_fh:
    indiv = pickle.load(indiv_fh)
with open('../hhold.pickle', 'rb') as hhold_fh:
    hhold = pickle.load(hhold_fh)

In [3]:
# Individual-level features: index by (id, iid) for easier aggregation and
# remove the 'poor' column from them
X = indiv.set_index(['id', 'iid']).drop('poor', axis='columns')
# Target: a single-column frame holding 'poor', indexed by household id
Y = hhold.set_index('id')[['poor']]

In [4]:
# Choose the best aggregation functions based on spearman correlation (for numeric)
# and chi-squared test (for categorical) data. Fit and transform.
# groupby='id' names the index level that X (indexed by id, iid) shares with Y (indexed by id).
corr_agg = aggregation.Aggregate(X, Y=Y, groupby='id')
result_agg = corr_agg.fit_transform()
# Keep the aggregator's col_to_func attribute; it is pickled at the end of the notebook.
# NOTE(review): presumably populated by fit_transform(), so it is read after that call —
# confirm against aggregation.py.
transform_dict = corr_agg.col_to_func

In [5]:
# Set 'id' as index of household.
# Guarded so that re-running this cell is a no-op: after the first run 'id' is
# the index rather than a column, and an unguarded set_index would raise KeyError.
if hhold.index.name != 'id':
    hhold = hhold.set_index(['id'])

In [6]:
# Merge the household and individual data: inner join on the two frames' indexes
agg_df = hhold.merge(result_agg, how='inner', left_index=True, right_index=True)

In [7]:
# Preview the first five rows of the merged frame
agg_df.head(n=5)

Unnamed: 0_level_0,h_cat_001,h_cat_002,h_cat_003,h_cat_004,h_cat_005,h_cat_006,h_cat_007,h_cat_008,h_cat_009,h_cat_010,...,i_cat_037_FUUXv,i_cat_037_GtHel,i_cat_037_juMSt,i_cat_038_ALcKg,i_cat_038_JTCKs,i_cat_038_UaIsy,i_cat_038_dSJoN,i_cat_038_vhhVz,i_num_001,i_num_002
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,XAmOF,MwLvg,...,False,True,1,False,1,0,0,False,4.0,211
82739,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,lwCkE,MwLvg,...,False,True,1,False,0,0,1,False,4.0,116
9646,JhtDR,GUusz,BIZns,ZYabk,uxuSS,PHMVg,NDTCU,sehIp,qNABl,MwLvg,...,False,True,1,False,1,0,0,False,4.0,136
10975,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,sPNOc,MwLvg,...,False,True,1,False,1,0,0,False,4.0,126
16463,JhtDR,alLXR,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,NdlDR,MwLvg,...,True,True,0,False,1,0,0,False,4.0,136


In [8]:
# (rows, columns) of the merged frame
agg_df.shape

(8203, 617)

## Scale Numeric Data

In [9]:
# Collect every column label whose name contains the '_num' marker
num_cols = [
    label
    for label in agg_df.columns
    if '_num' in label
]

In [10]:
# Fit a MinMaxScaler (default settings) on the numeric columns and write the
# rescaled values back into agg_df
min_max = MinMaxScaler()
scaled_num = min_max.fit_transform(agg_df[num_cols])
agg_df[num_cols] = scaled_num

In [11]:
# Preview the first five rows again, now with the numeric columns rescaled
agg_df.head(n=5)

Unnamed: 0_level_0,h_cat_001,h_cat_002,h_cat_003,h_cat_004,h_cat_005,h_cat_006,h_cat_007,h_cat_008,h_cat_009,h_cat_010,...,i_cat_037_FUUXv,i_cat_037_GtHel,i_cat_037_juMSt,i_cat_038_ALcKg,i_cat_038_JTCKs,i_cat_038_UaIsy,i_cat_038_dSJoN,i_cat_038_vhhVz,i_num_001,i_num_002
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46107,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,XAmOF,MwLvg,...,False,True,1,False,1,0,0,False,0.0,0.298969
82739,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,lwCkE,MwLvg,...,False,True,1,False,0,0,1,False,0.0,0.103093
9646,JhtDR,GUusz,BIZns,ZYabk,uxuSS,PHMVg,NDTCU,sehIp,qNABl,MwLvg,...,False,True,1,False,1,0,0,False,0.0,0.14433
10975,JhtDR,GUusz,TuovO,ZYabk,feupP,PHMVg,NDTCU,sehIp,sPNOc,MwLvg,...,False,True,1,False,1,0,0,False,0.0,0.123711
16463,JhtDR,alLXR,TuovO,ZYabk,feupP,PHMVg,NDTCU,cLAGr,NdlDR,MwLvg,...,True,True,0,False,1,0,0,False,0.0,0.14433


In [12]:
# Persist the aggregated frame and the transformation dictionary as pickle files,
# each written with the highest available pickle protocol
for out_path, payload in (
    ('agg_df.pickle', agg_df),
    ('indiv_transform_dict.pickle', transform_dict),
):
    with open(out_path, 'wb') as out_fh:
        pickle.dump(payload, out_fh, protocol=pickle.HIGHEST_PROTOCOL)