In [2]:
import pandas as pd
import numpy as np
import pickle

In [10]:
class Aggregate():
    import pandas as pd
    import numpy as np
    from scipy.stats import spearmanr, mode
    from sklearn.feature_selection import chi2
    from sklearn.ensemble import RandomForestClassifier
    
    def __init__ (self, X, **kwargs):
        self.X = X
        self.cat_cols = np.array([x for x in self.X.columns if '_cat_' in x])
        self.num_cols = np.array([x for x in self.X.columns if '_num_' in x])
        self.Y = kwargs.pop('Y', None)
        self.groupby = kwargs.pop('groupby', 'id')
        
    # Possible aggregation functions
    def any_of_indiv (df, groupby='id'):
        return df.groupby('id').any()

    def percent_of_indiv (df, groupby='id'):
        return df.groupby('id').sum() / df.groupby('id').count()

    def mode_of_indiv (df, groupby='id'):
        return df.groupby('id').agg(lambda x: Aggregate.mode(x)[0][0])
        #return df.groupby('id').agg(lambda x: scipy.stats.mode(x)[0][0])

    def mean_of_indiv (df, groupby='id'):
        return df.groupby('id').mean()

    def median_of_indiv (df, groupby='id'):
        return df.groupby('id').median()

    def max_of_indiv (df, groupby='id'):
        return df.groupby('id').max()
    
    def min_of_indiv (df, groupby='id'):
        return df.groupby('id').min()
    
    # Possible evaluation functions
    def chi_squared (x, y):
        return pd.Series(Aggregate.chi2(x, y)[0])
    
    def spearman (x, y):
        result = x.apply(Aggregate.spearmanr, axis=0, args=(y,))
        #result = x.apply(scipy.stats.spearmanr, axis=0, args=(y,))
        result = result.apply(lambda x: x[0])
        return result.reset_index(drop=True)
    
    def random_forest (x, y, **kwargs):
        n_estimators = kwargs.pop('rf_n_estimators', 10000)
        forest = Aggregate.RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
        forest.fit(x, y)
        return forest.feature_importances_
    
    # Helper functions for comparisons
    def return_max (row, cols_to_test):
        '''Pandas apply function to return max function in row.'''
        temp = np.array([row[x] for x in cols_to_test])
        # If all values are NaN, doesn't matter which function. Return first.
        if np.all(np.isnan(temp)):
            return cols_to_test[0].split('_')[1]
        # nanargmax ignores NaNs
        return cols_to_test[np.nanargmax(temp)].split('_')[1]
    
    def compare (self, cols, funcs, eval_func, **kwargs):
        # If there is only one function, return it (no testing necessary)
        if len(funcs) == 1:
            return np.array([funcs[0] for x in cols])
        # A dataframe to be used for comparison purposes
        compare_df = pd.DataFrame(data={'Name': cols})
        for func_name in funcs:
            func = getattr(Aggregate, func_name + '_of_indiv')
            # Apply aggregation
            temp_df = func(self.X[cols], groupby=self.groupby)
            # Merge with Y to ensure indices match
            temp_df = pd.merge(temp_df, self.Y, left_index=True, right_index=True)
            # Gather transformed data
            X_transform = temp_df[cols]
            Y_transform = temp_df['poor']
            # Names
            rho = 'rho_' + func_name
            # Apply correlation
            compare_df[rho] = eval_func(X_transform, Y_transform, **kwargs)
        # List of columns to test
        cols_to_test = [x for x in compare_df.columns if 'rho' in x]
        # Return the best aggregate function for each categorical feature
        return np.array(compare_df.apply(Aggregate.return_max, axis=1, args=(cols_to_test,)))
    
    # Fit function
    def fit (self, **kwargs):        
        '''The fit function attempts all supplied aggregate functions for each column
        of type numeric or categorical. The transformed columns are then tested for
        suitability using the evaluation function (e.g. correlation with target variable).
        Finally, the top aggregate function for each column is stored as a dictionary
        lookup in self.col_to_func. 
        
        *Note, the 'mode' aggregation function is very slow, and has to make assumptions
            when there is not one clear mode in series.
                    
        Possible evaluation functions:
            'chi_squared', 'spearman', 'random_forest'
        
        Possible aggregation functions:
            'mean', 'median', 'mode', 'max', 'min', 'any', 'percent'
        '''
        # Gather and store options as class variables
        self.num_eval = getattr(Aggregate, kwargs.pop('num_eval', 'spearman'))
        self.cat_eval = getattr(Aggregate, kwargs.pop('cat_eval', 'chi_squared'))
        self.num_agg_funcs = kwargs.pop('num_agg_funcs', ['mean', 'median', 'max', 'min'])
        self.cat_agg_funcs = kwargs.pop('cat_agg_funcs', ['any', 'percent', 'mode'])
        self.rf_n_estimators = kwargs.pop('rf_n_estimators', 10000)
        # Gather column names
        self.cat_cols = np.array([x for x in self.X.columns if '_cat_' in x])
        self.num_cols = np.array([x for x in self.X.columns if '_num_' in x])
        # Determine best agg function
        self.cat_funcs = self.compare(self.cat_cols, self.cat_agg_funcs, self.cat_eval, **kwargs)
        self.num_funcs = self.compare(self.num_cols, self.num_agg_funcs, self.num_eval, **kwargs)
        # Combine transformations into dictionary mapping
        t1 = np.append(self.cat_cols, self.num_cols)
        t2 = np.append(self.cat_funcs, self.num_funcs)
        self.col_to_func = {key: value for key, value in zip(t1, t2)}
    
    # Apply transform
    def transform (self, **kwargs):
        '''The transform function applies aggregate functions to each column and returns
        a transformed dataframe. If no arguments are passed, this will transform the df
        originally passed when initializing the class. User can also pass a new df and
        a col_to_func dictionary mapping, useful when user wants to transform a different
        df using the same transformations as determined by the fit function.'''
        # Allow for ability to pass new X and previously defined functions
        X = kwargs.pop('X', self.X)
        col_to_func = kwargs.pop('col_to_func', self.col_to_func)
        # Transform and return
        return pd.concat([getattr(Aggregate, func + '_of_indiv')(X[col]) for col, func in col_to_func.items()], axis=1)        
    
    def fit_transform (self, **kwargs):
        getattr(Aggregate, 'fit')(self, **kwargs)
        return getattr(Aggregate, 'transform')(self, **kwargs)

## Load the Data

In [3]:
# Load data
with open('../indiv.pickle', 'rb') as file:
    indiv = pickle.load(file)
with open('../hhold.pickle', 'rb') as file:
    hhold = pickle.load(file)

## Set id and iid as multi-index
This isn't strictly necessary, but it is a lot cleaner and easier to deal with if id is the index of the dataframe. That way we don't have to keep track of it so much. For indiv, this becomes a hierarchical index

In [4]:
X = indiv.set_index(['id', 'iid']).drop(labels=['poor'], axis=1)
Y = pd.DataFrame(hhold.set_index(['id'])['poor'])

In [7]:
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,i_num_001,i_num_002,i_cat_001_BNCcM,i_cat_001_HUpWg,i_cat_001_JMXQx,i_cat_001_PAVsH,i_cat_001_SJPkb,i_cat_001_SlRmt,i_cat_001_TRFeI,i_cat_001_XJgvq,...,i_cat_036_rkLqZ,i_cat_036_xUYIC,i_cat_037_FUUXv,i_cat_037_GtHel,i_cat_037_juMSt,i_cat_038_ALcKg,i_cat_038_JTCKs,i_cat_038_UaIsy,i_cat_038_dSJoN,i_cat_038_vhhVz
id,iid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
80389,1,4.0,181,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
80389,2,4.0,141,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
80389,3,4.0,41,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
80389,4,4.0,16,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
39883,1,4.0,381,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0


In [8]:
Y.head()

Unnamed: 0_level_0,poor
id,Unnamed: 1_level_1
46107,False
82739,False
9646,True
10975,True
16463,True


## Initialize the Aggregate class
An X value must be passed, but Y and groupby are optional. 

In [15]:
sample = Aggregate(X, Y=Y, groupby='id')

## Fit and Transform
The class is designed to mimic sklearn as much as possible. First, we need to call the "fit" method, which will try to determine the best aggregate transformation for each column.

The most basic call of .fit() will assume that we should try all possible aggregation functions, and will assume that we should evaluation numerical features using spearman correlation and categorical features using chi squared.

In [16]:
sample.fit()

## Storing the aggregation functions
The most important output is probably the .col_to_func variable, which is a dictionary with:
* key: column name
* value: aggregation function name

In [17]:
agg_functions = sample.col_to_func

In [18]:
agg_functions

{'i_cat_001_BNCcM': 'mode',
 'i_cat_001_HUpWg': 'mode',
 'i_cat_001_JMXQx': 'any',
 'i_cat_001_PAVsH': 'any',
 'i_cat_001_SJPkb': 'any',
 'i_cat_001_SlRmt': 'mode',
 'i_cat_001_TRFeI': 'any',
 'i_cat_001_XJgvq': 'any',
 'i_cat_001_XJsPz': 'mode',
 'i_cat_001_YsSBt': 'mode',
 'i_cat_001_ayXFR': 'any',
 'i_cat_001_dHZCo': 'mode',
 'i_cat_001_duBym': 'mode',
 'i_cat_001_fmdsF': 'any',
 'i_cat_001_jnwBm': 'any',
 'i_cat_001_kuFXw': 'mode',
 'i_cat_001_lBMrM': 'any',
 'i_cat_001_nIete': 'any',
 'i_cat_001_oGavK': 'any',
 'i_cat_001_slvTJ': 'mode',
 'i_cat_001_tMiQp': 'mode',
 'i_cat_001_vhanw': 'mode',
 'i_cat_001_wWIzo': 'any',
 'i_cat_001_xnnDH': 'any',
 'i_cat_001_xsVYp': 'percent',
 'i_cat_002_kzSFB': 'any',
 'i_cat_002_mOlYV': 'mode',
 'i_cat_002_yAyAe': 'any',
 'i_cat_003_FRcdT': 'any',
 'i_cat_003_UFoKR': 'mode',
 'i_cat_003_axSTs': 'percent',
 'i_cat_004_CXizI': 'any',
 'i_cat_004_DQhEE': 'mode',
 'i_cat_004_DaETh': 'any',
 'i_cat_004_GotAd': 'any',
 'i_cat_004_HIvIU': 'mode',
 'i_c

## Transforming the columns
Once you're ready, you can transform the columns. For the training data, this is as simple as calling the .transform() method. The class will remember everything needed (assuming that you passed a Y value when initializing the class).

In [19]:
result = sample.transform()

In [20]:
result.head()

Unnamed: 0_level_0,i_cat_001_BNCcM,i_cat_001_HUpWg,i_cat_001_JMXQx,i_cat_001_PAVsH,i_cat_001_SJPkb,i_cat_001_SlRmt,i_cat_001_TRFeI,i_cat_001_XJgvq,i_cat_001_XJsPz,i_cat_001_YsSBt,...,i_cat_037_FUUXv,i_cat_037_GtHel,i_cat_037_juMSt,i_cat_038_ALcKg,i_cat_038_JTCKs,i_cat_038_UaIsy,i_cat_038_dSJoN,i_cat_038_vhhVz,i_num_001,i_num_002
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,0,0,False,False,False,0,False,False,1,0,...,0,False,True,False,0,0,0,0,4.0,341
18,0,0,False,False,False,0,False,False,1,0,...,0,False,True,False,1,0,0,0,4.0,116
36,0,0,False,False,False,0,False,False,1,0,...,0,False,True,False,1,0,0,0,4.0,351
39,0,0,False,False,False,0,True,False,1,0,...,0,True,True,False,1,0,0,0,4.0,241
58,0,0,False,False,False,0,False,False,1,0,...,1,False,True,False,1,0,0,0,4.0,376


## Aggregate Function Options
It's possible to only review specific aggregation functions. For example, maybe you don't think there's any reason too look at 'min'. Simply pass a list of string-names for the functions you want to try. Here are the defaults:
* num_agg_funcs=['median', 'max', 'min', 'mean']
* cat_agg_funcs=['percent', 'any', 'mode']

Note that 'mode' is extremely slow! I believe this is because it can't just perform a basic mathematical operation on the column. Instead, it has to group by value, then count each value, then return the max. It's just a more expensive operation. Also, when there is a tie, mode is going to choose an arbitrary winner.

In [21]:
# Example of choosing to review only specific aggregate functions
sample2 = Aggregate(X, Y=Y, groupby='id')
sample2.fit(num_agg_funcs=['median', 'mean'], cat_agg_funcs=['percent', 'any'])
sample2.col_to_func

{'i_cat_001_BNCcM': 'percent',
 'i_cat_001_HUpWg': 'percent',
 'i_cat_001_JMXQx': 'any',
 'i_cat_001_PAVsH': 'any',
 'i_cat_001_SJPkb': 'any',
 'i_cat_001_SlRmt': 'any',
 'i_cat_001_TRFeI': 'any',
 'i_cat_001_XJgvq': 'any',
 'i_cat_001_XJsPz': 'percent',
 'i_cat_001_YsSBt': 'any',
 'i_cat_001_ayXFR': 'any',
 'i_cat_001_dHZCo': 'any',
 'i_cat_001_duBym': 'percent',
 'i_cat_001_fmdsF': 'any',
 'i_cat_001_jnwBm': 'any',
 'i_cat_001_kuFXw': 'any',
 'i_cat_001_lBMrM': 'any',
 'i_cat_001_nIete': 'any',
 'i_cat_001_oGavK': 'any',
 'i_cat_001_slvTJ': 'percent',
 'i_cat_001_tMiQp': 'any',
 'i_cat_001_vhanw': 'any',
 'i_cat_001_wWIzo': 'any',
 'i_cat_001_xnnDH': 'any',
 'i_cat_001_xsVYp': 'percent',
 'i_cat_002_kzSFB': 'any',
 'i_cat_002_mOlYV': 'percent',
 'i_cat_002_yAyAe': 'any',
 'i_cat_003_FRcdT': 'any',
 'i_cat_003_UFoKR': 'any',
 'i_cat_003_axSTs': 'percent',
 'i_cat_004_CXizI': 'any',
 'i_cat_004_DQhEE': 'any',
 'i_cat_004_DaETh': 'any',
 'i_cat_004_GotAd': 'any',
 'i_cat_004_HIvIU': 'pe

In [22]:
# Fit will work even if you only pass one option for aggregate function
# But obviously, it will just return that function...
sample3 = Aggregate(X, Y=Y, groupby='id')
sample3.fit(num_agg_funcs=['median', 'mean'], cat_agg_funcs=['percent'])
sample3.col_to_func

{'i_cat_001_BNCcM': 'percent',
 'i_cat_001_HUpWg': 'percent',
 'i_cat_001_JMXQx': 'percent',
 'i_cat_001_PAVsH': 'percent',
 'i_cat_001_SJPkb': 'percent',
 'i_cat_001_SlRmt': 'percent',
 'i_cat_001_TRFeI': 'percent',
 'i_cat_001_XJgvq': 'percent',
 'i_cat_001_XJsPz': 'percent',
 'i_cat_001_YsSBt': 'percent',
 'i_cat_001_ayXFR': 'percent',
 'i_cat_001_dHZCo': 'percent',
 'i_cat_001_duBym': 'percent',
 'i_cat_001_fmdsF': 'percent',
 'i_cat_001_jnwBm': 'percent',
 'i_cat_001_kuFXw': 'percent',
 'i_cat_001_lBMrM': 'percent',
 'i_cat_001_nIete': 'percent',
 'i_cat_001_oGavK': 'percent',
 'i_cat_001_slvTJ': 'percent',
 'i_cat_001_tMiQp': 'percent',
 'i_cat_001_vhanw': 'percent',
 'i_cat_001_wWIzo': 'percent',
 'i_cat_001_xnnDH': 'percent',
 'i_cat_001_xsVYp': 'percent',
 'i_cat_002_kzSFB': 'percent',
 'i_cat_002_mOlYV': 'percent',
 'i_cat_002_yAyAe': 'percent',
 'i_cat_003_FRcdT': 'percent',
 'i_cat_003_UFoKR': 'percent',
 'i_cat_003_axSTs': 'percent',
 'i_cat_004_CXizI': 'percent',
 'i_cat_

## Evaluation Options
The default is to use spearman correlation to evaluate the numeric variables and chi squared for categorical. You can request different evaluations for numerical and categorical. You can request to use any of the following:
* spearman
* chi_squared
* random_forest

If you use random_forest, you can also pass an option to choose how many estimators to run (in sklearn, this is the n_estimators option). 10,000 is the default.
* rf_n_estimators = 10000

In [23]:
# Example of how to pass different evaluation options.
# Note how to pass n_estimators
sample4 = Aggregate(X, Y=Y, groupby='id')
sample4.fit(num_eval='chi_squared', cat_eval='random_forest', rf_n_estimators=1000)
sample4.col_to_func

{'i_cat_001_BNCcM': 'any',
 'i_cat_001_HUpWg': 'any',
 'i_cat_001_JMXQx': 'any',
 'i_cat_001_PAVsH': 'any',
 'i_cat_001_SJPkb': 'any',
 'i_cat_001_SlRmt': 'any',
 'i_cat_001_TRFeI': 'any',
 'i_cat_001_XJgvq': 'any',
 'i_cat_001_XJsPz': 'mode',
 'i_cat_001_YsSBt': 'any',
 'i_cat_001_ayXFR': 'any',
 'i_cat_001_dHZCo': 'any',
 'i_cat_001_duBym': 'any',
 'i_cat_001_fmdsF': 'any',
 'i_cat_001_jnwBm': 'any',
 'i_cat_001_kuFXw': 'any',
 'i_cat_001_lBMrM': 'any',
 'i_cat_001_nIete': 'any',
 'i_cat_001_oGavK': 'any',
 'i_cat_001_slvTJ': 'any',
 'i_cat_001_tMiQp': 'any',
 'i_cat_001_vhanw': 'any',
 'i_cat_001_wWIzo': 'any',
 'i_cat_001_xnnDH': 'any',
 'i_cat_001_xsVYp': 'any',
 'i_cat_002_kzSFB': 'mode',
 'i_cat_002_mOlYV': 'mode',
 'i_cat_002_yAyAe': 'percent',
 'i_cat_003_FRcdT': 'percent',
 'i_cat_003_UFoKR': 'any',
 'i_cat_003_axSTs': 'mode',
 'i_cat_004_CXizI': 'mode',
 'i_cat_004_DQhEE': 'any',
 'i_cat_004_DaETh': 'any',
 'i_cat_004_GotAd': 'any',
 'i_cat_004_HIvIU': 'any',
 'i_cat_004_JyI

## Combine with hhold
Should be nice and easy to transform and combine with hhold

In [24]:
# Fit and transform
sample5 = Aggregate(X, Y=Y, groupby='id')
sample5.fit(num_agg_funcs=['median', 'mean'], cat_agg_funcs=['percent', 'any'])
indiv_transformed = sample5.transform()

In [26]:
# Merge data
agg_df = pd.merge(indiv_transformed, hhold, left_index=True, right_index=True)

In [27]:
agg_df.head()

Unnamed: 0,i_cat_001_BNCcM,i_cat_001_HUpWg,i_cat_001_JMXQx,i_cat_001_PAVsH,i_cat_001_SJPkb,i_cat_001_SlRmt,i_cat_001_TRFeI,i_cat_001_XJgvq,i_cat_001_XJsPz,i_cat_001_YsSBt,...,h_cat_330,h_cat_331,h_cat_332,h_cat_333,h_cat_334,h_cat_335,h_cat_336,h_cat_337,h_cat_338,h_cat_339
14,0.0,0.0,False,False,False,False,False,False,1.0,False,...,AZwXA,JqHnW,MaXfS,USRak,idRwx,LPtkN,vkbkA,qQxrL,AITFl,aQeIm
18,0.0,0.0,False,False,False,False,False,False,1.0,False,...,wxJis,JqHnW,MaXfS,HxnJy,idRwx,LPtkN,vkbkA,qQxrL,AITFl,cecIq
36,0.0,0.0,False,False,False,True,False,False,0.833333,False,...,wxJis,JqHnW,MaXfS,USRak,idRwx,UyAms,vkbkA,qQxrL,AITFl,aQeIm
39,0.0,0.0,False,False,False,False,True,False,0.555556,False,...,wxJis,JqHnW,MaXfS,HxnJy,idRwx,UyAms,vkbkA,qQxrL,AITFl,cecIq
58,0.0,0.0,False,False,False,False,False,False,1.0,False,...,wxJis,JqHnW,MaXfS,etZsD,idRwx,LPtkN,vkbkA,qQxrL,AITFl,aQeIm


## fit_transform method
There is also a fit_transform() method, which is really just a wrapper function that calls fit(), then calls transform(), but it's there for ease of use.

In [11]:
# Example of choosing to review only specific aggregate functions
sample6 = Aggregate(X, Y=Y, groupby='id')
result6 = sample6.fit_transform(num_agg_funcs=['median', 'mean'], cat_agg_funcs=['percent', 'any'])

In [12]:
result6.head()

Unnamed: 0_level_0,i_cat_001_BNCcM,i_cat_001_HUpWg,i_cat_001_JMXQx,i_cat_001_PAVsH,i_cat_001_SJPkb,i_cat_001_SlRmt,i_cat_001_TRFeI,i_cat_001_XJgvq,i_cat_001_XJsPz,i_cat_001_YsSBt,...,i_cat_037_FUUXv,i_cat_037_GtHel,i_cat_037_juMSt,i_cat_038_ALcKg,i_cat_038_JTCKs,i_cat_038_UaIsy,i_cat_038_dSJoN,i_cat_038_vhhVz,i_num_001,i_num_002
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,False,False,0.0,False,False,False,False,False,1.0,False,...,False,False,1.0,False,0.5,True,0.0,False,4.0,301.0
18,False,False,0.0,False,False,False,False,False,1.0,False,...,False,False,1.0,False,1.0,False,0.0,False,4.0,101.0
36,False,False,0.0,False,False,True,False,False,0.833333,False,...,False,False,1.0,False,1.0,False,0.0,False,4.0,157.666667
39,False,False,0.0,False,False,False,True,False,0.555556,False,...,False,True,0.777778,False,1.0,False,0.0,False,4.0,84.333333
58,False,False,0.0,False,False,False,False,False,1.0,False,...,True,False,0.25,False,1.0,False,0.0,False,4.0,192.25


In [14]:
sample6.col_to_func

{'i_cat_001_BNCcM': 'any',
 'i_cat_001_HUpWg': 'any',
 'i_cat_001_JMXQx': 'percent',
 'i_cat_001_PAVsH': 'any',
 'i_cat_001_SJPkb': 'any',
 'i_cat_001_SlRmt': 'any',
 'i_cat_001_TRFeI': 'any',
 'i_cat_001_XJgvq': 'any',
 'i_cat_001_XJsPz': 'percent',
 'i_cat_001_YsSBt': 'any',
 'i_cat_001_ayXFR': 'any',
 'i_cat_001_dHZCo': 'any',
 'i_cat_001_duBym': 'any',
 'i_cat_001_fmdsF': 'any',
 'i_cat_001_jnwBm': 'any',
 'i_cat_001_kuFXw': 'percent',
 'i_cat_001_lBMrM': 'any',
 'i_cat_001_nIete': 'any',
 'i_cat_001_oGavK': 'any',
 'i_cat_001_slvTJ': 'any',
 'i_cat_001_tMiQp': 'percent',
 'i_cat_001_vhanw': 'any',
 'i_cat_001_wWIzo': 'any',
 'i_cat_001_xnnDH': 'any',
 'i_cat_001_xsVYp': 'any',
 'i_cat_002_kzSFB': 'percent',
 'i_cat_002_mOlYV': 'percent',
 'i_cat_002_yAyAe': 'any',
 'i_cat_003_FRcdT': 'any',
 'i_cat_003_UFoKR': 'percent',
 'i_cat_003_axSTs': 'any',
 'i_cat_004_CXizI': 'percent',
 'i_cat_004_DQhEE': 'percent',
 'i_cat_004_DaETh': 'any',
 'i_cat_004_GotAd': 'percent',
 'i_cat_004_HIv