In [None]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

# Load the input dataset into a pandas dataframe for profiling.
features_dataset = dataiku.Dataset("features_by_segment_and_uid")
df = features_dataset.get_dataframe()

### NOTEBOOK PURPOSE AND INTRODUCTION

Given a dataframe derived from a clustering method (e.g. K-means), the script below automatically profiles the cluster groups according to the attached features.

The script will automatically determine which features are categorical, one-hot encode them and then calculate the percentage of each cluster that falls within each category.

It will also determine which features are continuous and calculate averages across the cluster group.

The input dataset must contain one record per unique ID, with a single column designating the cluster group that record belongs to. As many additional features / columns as desired can be added.

In [None]:
def _is_text_dtype(dtype):
    """Return True when ``dtype`` is the object dtype or a pandas string dtype."""
    return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)


def profile_function(dataframe, indexfield, groupbyfield):
    """
    Profile cluster groups by aggregating every feature per cluster.

    Text columns are one-hot encoded and reported as the percentage of each
    cluster that falls in each category. Numeric columns whose values are all
    0/1 are treated as boolean flags and reported as percentages as well. All
    remaining numeric columns are reported as per-cluster averages.

    Parameters
    ----------
        dataframe : pd.DataFrame
            the dataframe to use; it is copied, so the caller's object is
            never modified
        indexfield: string
            the name of the unique identifier column
        groupbyfield: string
            the name of the column holding the cluster each record is
            allocated to

    Returns
    --------
    pd.DataFrame
            dataframe with a record for each unique cluster group. Columns are,
            in order: ``groupbyfield``, ``num_<indexfield>`` (distinct IDs in
            the cluster), ``<indexfield>_pct`` (share of all counted IDs), then
            one ``<feature>_avg`` or ``<feature>_pct`` column per feature.

    Raises
    ------
    KeyError
            if ``indexfield`` or ``groupbyfield`` is not a column of
            ``dataframe``

    Notes
    -----
    A text column containing any of the strings '1', '0', '1.0' or '0.0' is
    not one-hot encoded. It is not numeric either, so it does not appear in
    the output.
    """
    missing = [name for name in (indexfield, groupbyfield)
               if name not in dataframe.columns]
    if missing:
        raise KeyError(f"Column(s) not found in dataframe: {missing}")

    # Work on a copy so the dtype conversions below never leak to the caller.
    dataframe = dataframe.copy()

    # The ID and cluster columns are bookkeeping, not features: they must be
    # neither one-hot encoded nor coerced to booleans (cluster codes 0/1 would
    # otherwise turn into False/True).
    reserved = (indexfield, groupbyfield)
    flag_strings = ['1', '0', '1.0', '0.0']

    # Get list of categorical features
    categorical_list = [
        col for col, dtype in dataframe.dtypes.items()
        if col not in reserved
        and _is_text_dtype(dtype)
        and not dataframe[col].isin(flag_strings).any()
    ]

    print ("Categorical Features for Encoding: ", categorical_list, '\n')

    # One-hot encode each categorical feature as "<feature> - <category>"
    encoded_frames = []
    for item in categorical_list:
        encoded = pd.get_dummies(dataframe[item], prefix=item,
                                 prefix_sep=' - ', dtype=bool)
        # Distinct raw values can share a string form (e.g. 1 and '1');
        # merge such duplicate indicator columns into one.
        if encoded.columns.has_duplicates:
            encoded = encoded.T.groupby(level=0).max().T.astype(bool)
        encoded_frames.append(encoded)

    if encoded_frames:
        # Replace the original text columns with their indicator columns,
        # appended after the remaining columns.
        dataframe = pd.concat(
            [dataframe.drop(columns=categorical_list)] + encoded_frames,
            axis=1,
        )

    # Change numeric columns that only hold 0/1 values to booleans
    bool_fix_list = [
        col for col, dtype in dataframe.dtypes.items()
        if col not in reserved
        and pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
        and dataframe[col].isin([0, 1]).all()
    ]
    if bool_fix_list:
        dataframe = dataframe.astype({col: bool for col in bool_fix_list})

    # Create a dictionary of variables by type to aggregate up by ClusterID
    agg_dictionary = {}
    avg_list = []
    bool_list = []
    for col, dtype in dataframe.dtypes.items():
        if col in reserved:
            continue
        # Booleans also count as numeric, so they are tested first.
        if pd.api.types.is_bool_dtype(dtype):
            agg_dictionary[col] = 'sum'
            bool_list.append(col)
        elif pd.api.types.is_numeric_dtype(dtype):
            agg_dictionary[col] = 'mean'
            avg_list.append(col)

    agg_dictionary[indexfield] = 'nunique'

    print ('Boolean Features: ', bool_list, '\n')
    print ('Continuous Features: ', avg_list, '\n')

    # Groupby ClusterID
    print ('Grouping by: ', groupbyfield)
    dataframe = dataframe.groupby(groupbyfield).agg(agg_dictionary).reset_index()

    # Turn absolute counts into % of the cluster, rounded to 4 decimal places
    group_sizes = dataframe[indexfield]
    for col in bool_list:
        dataframe[col] = (dataframe[col] / group_sizes * 100).astype(float).round(4)

    # Round averages to 4 decimal places
    for col in avg_list:
        dataframe[col] = dataframe[col].round(4)

    # Rename columns according to whether they are avg or pct columns
    count_col = f'num_{indexfield}'
    share_col = f'{indexfield}_pct'
    rename_list = {
        **{col: f'{col}_avg' for col in avg_list},
        **{col: f'{col}_pct' for col in bool_list},
        indexfield: count_col,
    }
    dataframe = dataframe.rename(columns=rename_list)

    # Create population size column
    dataframe[share_col] = (dataframe[count_col]
                            / dataframe[count_col].sum() * 100).round(4)

    # Move indexfield columns to front of dataframe, right after the cluster
    leading = [groupbyfield, count_col, share_col]
    trailing = [col for col in dataframe.columns if col not in leading]
    return dataframe[leading + trailing]

In [None]:
# Build the per-cluster profile table from the loaded dataframe.
df_out = profile_function(
    dataframe=df,
    indexfield='unique_identifier',
    groupbyfield='Cluster_Code',
)