In [1]:
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Import the project-local aggregation module directly from its file path.
# `imp.load_source` is replaced by the importlib recipe because the `imp`
# module is deprecated and was removed in Python 3.12.
import importlib.util
import sys

_agg_spec = importlib.util.spec_from_file_location(
    'aggregation', '../../aggregation/aggregation.py')
aggregation = importlib.util.module_from_spec(_agg_spec)
# Register under the same name imp.load_source used, so that anything resolving
# the module by name (e.g. pickle, for objects defined in it) still finds it.
sys.modules['aggregation'] = aggregation
_agg_spec.loader.exec_module(aggregation)

In [2]:
# Read the pickled individual-level and household-level inputs.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../indiv.pickle', 'rb') as indiv_fh:
    indiv = pickle.load(indiv_fh)
with open('../hhold.pickle', 'rb') as hhold_fh:
    hhold = pickle.load(hhold_fh)

In [3]:
# Features: individual data indexed by ('id', 'iid') for easier aggregation,
# with the 'poor' column removed
X = indiv.set_index(['id', 'iid']).drop(columns=['poor'])
# Target: single-column frame holding only 'poor', indexed by 'id'
Y = hhold.set_index(['id'])['poor'].to_frame()

In [None]:
# Choose the best aggregation functions based on spearman correlation (for numeric)
# and chi-squared test (for categorical) data. Fit and transform.
# NOTE(review): the selection logic lives in aggregation.py and is not visible
# here; the description above is carried over from the original comment -- confirm.
corr_agg = aggregation.Aggregate(X, Y=Y, groupby='id')
# Aggregated result; merged with hhold on the index further down
result_agg = corr_agg.fit_transform()
# Read after fit_transform(); presumably populated by the fit -- TODO confirm.
# This object is what gets saved to indiv_transform_dict.pickle at the end.
transform_dict = corr_agg.col_to_func

In [None]:
# Set 'id' as index of household.
# Guarded so the cell can be re-run safely: once 'id' has become the index it
# is no longer a column, and an unconditional set_index(['id']) raises KeyError.
if hhold.index.name != 'id':
    hhold = hhold.set_index(['id'])

In [None]:
# Merge the household and individual data by matching their indexes
# (inner join: only index values present in both frames are kept)
agg_df = hhold.merge(result_agg, how='inner', left_index=True, right_index=True)

In [None]:
# Preview the first rows of the merged frame
agg_df.head()

In [None]:
# Dimensions of the merged frame as a (rows, columns) tuple
agg_df.shape

## Scale Numeric Data

In [None]:
# Collect the label of every column whose name contains '_num'
num_cols = [name for name in agg_df.columns if '_num' in name]

In [None]:
# Rescale the '_num' columns with a MinMaxScaler fitted on this same data,
# writing the scaled values back into agg_df
min_max = MinMaxScaler()
numeric_part = agg_df[num_cols]
agg_df[num_cols] = min_max.fit_transform(numeric_part)

In [None]:
# Preview again to inspect the '_num' columns after min-max scaling
agg_df.head()

In [None]:
# Persist the aggregated frame and the transformation dictionary as pickle
# files, using the highest pickle protocol available
with open('agg_df.pickle', 'wb') as agg_out:
    pickle.dump(agg_df, agg_out, protocol=pickle.HIGHEST_PROTOCOL)
with open('indiv_transform_dict.pickle', 'wb') as dict_out:
    pickle.dump(transform_dict, dict_out, protocol=pickle.HIGHEST_PROTOCOL)