# In [7]:
import pandas as pd

def cluster_factors_corr(df, y_name):
    """
    Cluster the factors (column names) of a dataset by the sign of each
    column's correlation with an outcome column.

    Args:
        df: DataFrame whose columns are the factors; must contain `y_name`.
        y_name: name of the outcome column to correlate against.

    Returns:
        A dictionary mapping each column present in `df.corr()` to a polarity:
        "pi" for a positive correlation with the outcome, "delta" for a
        negative one, and "un" when the correlation is exactly zero or
        undefined (NaN, e.g. for a constant column).
    """
    # Correlation of every column with the outcome, indexed by column name.
    correlations = df.corr()[y_name].astype(float)

    polarities = {}
    # Iterate by label rather than zipping with df.columns: if corr() leaves a
    # column out, positional pairing would attach values to the wrong names.
    for factor, value in correlations.items():
        if value > 0:
            polarities[factor] = "pi"
        elif value < 0:
            polarities[factor] = "delta"
        else:
            # Exactly zero, or NaN (both comparisons above are False for NaN):
            # no direction can be assigned.
            polarities[factor] = "un"
    return polarities

def cluster_factors_voting(df, y_name):
    """
    Cluster the factors (column names) of a dataset with a voting mechanism:
    a factor is assigned the polarity of the outcome it appears with more often.

    For every column, rows where the factor is present (True) are counted
    separately for a True outcome and for a False outcome.

    Args:
        df: DataFrame of boolean factor columns; must contain `y_name`.
        y_name: name of the boolean outcome column.

    Returns:
        A dictionary mapping every column name to a polarity: "pi" when the
        factor appears more often with a True outcome, "delta" when it appears
        more often with a False outcome, and "un" on a tie.
    """
    # Element-wise comparisons against True/False are intentional here: they
    # build boolean masks and treat anything that is neither (e.g. NaN) as no vote.
    outcome_true = df[y_name] == True
    outcome_false = df[y_name] == False

    polarities = {}
    for column in df.columns:
        present = df[column] == True
        # Both tallies are taken over rows where the factor is present; only
        # the outcome differs between them.
        true_votes = (present & outcome_true).sum()
        false_votes = (present & outcome_false).sum()

        if true_votes > false_votes:
            polarities[column] = "pi"
        elif true_votes == false_votes:
            polarities[column] = "un"
        else:
            polarities[column] = "delta"
    return polarities