In [146]:
import pandas as pd
from itertools import combinations, combinations_with_replacement
import numpy as np
import csv

In [147]:
def urns_and_balls(U, B, urns_max=False, urns_min=False, urns_names=False, zeros=False):
    """
    Enumerate every way of distributing B identical balls into U ordered urns.

    Parameters
    ----------
    U : int
        Number of urns (at least 1).
    B : int
        Number of balls (non-negative).
    urns_max : sequence of int, optional
        Per-urn upper bound on the number of balls. When left as a bool
        (the default), every urn may hold up to B balls.
    urns_min : sequence of int, optional
        Per-urn lower bound on the number of balls. When left as a bool
        (the default), every urn may hold as few as 0 balls.
    urns_names : iterable, optional
        Labels for the urns. When given, each distribution is returned as a
        dict mapping label -> count instead of a plain list of counts.
    zeros : bool
        - if zeros == True  : empty urns allowed
        - if zeros == False : empty urns not allowed

    Returns
    -------
    list
        One entry per admissible distribution, in the order produced by
        itertools; each entry is a list of U counts (or a dict, see
        ``urns_names``). Empty when no distribution satisfies the constraints.

    Raises
    ------
    ValueError
        If U < 1 or B < 0.
    """
    if U < 1 or B < 0:
        raise ValueError(f'Need at least one urn and a non-negative number of balls (got U={U}, B={B}).')

    urns_max = np.ones(U, dtype=int) * B if isinstance(urns_max, bool) else np.array(urns_max)
    urns_min = np.zeros(U, dtype=int) if isinstance(urns_min, bool) else np.array(urns_min)

    # A distribution is described by U-1 cut points along the row of B balls;
    # the urn contents are the gaps between consecutive cut points.
    if zeros:
        # Cut points may coincide or sit at either end, which yields empty urns.
        cut_sets = combinations_with_replacement(range(B + 1), U - 1)
    else:
        if B < U:
            print(f'WARNING: Number of balls {B} has to be greater than or equal to the number of urns {U}.')
            return []
        # Distinct, strictly interior cut points guarantee at least one ball per urn.
        cut_sets = combinations(range(1, B), U - 1)

    as_list = isinstance(urns_names, bool)
    g = []
    for cuts in cut_sets:
        # Adding the two outer boundaries also covers U == 1, where cuts is empty.
        bounds = (0, *cuts, B)
        x = [upper - lower for lower, upper in zip(bounds, bounds[1:])]
        within_max = all(count <= cap for count, cap in zip(x, urns_max))
        within_min = all(count >= floor for count, floor in zip(x, urns_min))
        if within_max and within_min:
            g.append(x if as_list else dict(zip(urns_names, x)))
    return g

In [148]:
# Load data

data = pd.read_csv("rome_tension.csv")

# Define tree

# sort_values returns a new frame rather than sorting in place, so the result
# must be assigned; otherwise the order of the unique() lists below simply
# follows the row order of the CSV file.
data = data.sort_values(by=['rome1','rome3','rome5'])
cat = data.drop(columns=['BE_id','T']).drop_duplicates()
# rome1 -> array of its rome3 codes, and rome3 -> array of its rome5 codes
tree1 = dict(cat.groupby(by='rome1')['rome3'].unique())
tree3 = dict(cat.groupby(by='rome3')['rome5'].unique())
# Nested mapping: rome1 -> rome3 -> list of rome5 codes
tree = {key1:{key3:list(tree3[key3]) for key3 in tree1[key1]} for key1 in tree1.keys()}

# Delete these groups: they are too big!!
del tree['H']
del tree['K']
del tree['N']

# Parameters for draws
# n_nodes[rome1][rome3] : number of rome5 codes under that rome3
# n_groups[rome1]       : number of rome3 codes under that rome1
n_nodes = {key1:{key3:len(tree[key1][key3]) for key3 in tree[key1].keys()} for key1 in tree.keys()}
n_groups = {key1:len(tree[key1]) for key1 in tree.keys()}

Be careful! Every draw whose entries are all strictly positive must come from a `zeros=False` draw with G as the number of groups to redistribute. This problem arises as soon as one tries to assign G groups to n nodes with G > n (which could only happen here if we tried to redistribute groups at the top level).

So if G > n, ask for all zeros-allowed draws with G-1 groups, but discard those draws whose entries are all strictly positive (for instance, drop a draw if the product of its entries is > 0).

In [149]:
# For each rome1 branch, list every zero-allowing distribution of
# (number of rome3 children - 1) balls over those children, keyed by rome3
# code and capped per child by its number of rome5 codes.
draws3 = {
    key1: urns_and_balls(
        n_groups[key1],
        n_groups[key1] - 1,
        zeros=True,
        urns_names=tree[key1].keys(),
        urns_max=list(n_nodes[key1].values()),
    )
    for key1 in tree.keys()
}

In [150]:
final_groups = {}
# Both caches are rebuilt on every run of this cell, so they never outlive `data`.
variance_cache = {}   # tuple of rome5 codes -> variance of T over the matching rows
split_cache = {}      # (rome1, rome3, nb_groups) -> best list of rome5 groups


def _group_variance(group):
    """Variance of T over the rows of `data` whose rome5 is in `group` (memoised)."""
    cache_key = tuple(group)
    if cache_key not in variance_cache:
        variance_cache[cache_key] = data.loc[data['rome5'].isin(group)]['T'].var()
    return variance_cache[cache_key]


def _best_split(key1, key3, nb_groups):
    """Best split of the rome5 codes under `key3` into `nb_groups` groups (memoised).

    Candidates consist of nb_groups-1 singleton groups plus one group merging all
    remaining codes; the candidate with the smallest summed variance wins.
    The result depends only on the arguments, so it is computed once.
    """
    cache_key = (key1, key3, nb_groups)
    if cache_key in split_cache:
        return split_cache[cache_key]
    if nb_groups == n_nodes[key1][key3]:
        # As many groups as codes: every rome5 stands alone.
        best_groups5 = [[rome5] for rome5 in tree[key1][key3]]
    else:
        draws5 = urns_and_balls(n_nodes[key1][key3],
                                nb_groups-1,
                                urns_max=np.ones(n_nodes[key1][key3],dtype=int),
                                zeros=True,
                                urns_names=tree[key1][key3])
        best_groups5 = []
        sigma5 = float('Inf')
        for draw5 in draws5:
            # Codes drawn 0 times are merged into one group; codes drawn once stay alone.
            merged = [rome5 for rome5, nb in draw5.items() if nb == 0]
            groups5 = [merged] if len(merged) > 0 else []
            for rome5, nb in draw5.items():
                if nb == 1: groups5.append([rome5])
            groups5_var = sum([_group_variance(group) for group in groups5])
            # NOTE(review): Series.var() is NaN for a group with fewer than two rows
            # (pandas default ddof=1) and a NaN total never satisfies `<`, so such
            # candidates are skipped and best_groups5 may stay empty -- confirm intended.
            if groups5_var < sigma5:
                sigma5 = groups5_var
                best_groups5 = groups5
    split_cache[cache_key] = best_groups5
    return best_groups5


for key1 in tree.keys():
    print(key1)
    sigma = float('Inf')
    best_groups = {}
    for draw3 in draws3[key1]:
        zeros = []
        ones = []
        others = {}
        groups = {}
        i = 0
        for key3, nb_groups in draw3.items():
            if nb_groups == 0: zeros.append(key3)
            if nb_groups == 1: ones.append(key3)
            if nb_groups > 1 : others[key3] = nb_groups
        # All rome3 codes that received no group are pooled into a single group g0.
        if len(zeros) > 0:
            groups[f'g{i}'] = [rome5 for key3 in zeros for rome5 in tree[key1][key3]]
        # A rome3 code with exactly one group keeps all of its rome5 codes together.
        for key3 in ones:
            i += 1
            groups[f'g{i}'] = tree[key1][key3]
        # A rome3 code with several groups is split at the rome5 level.
        for key3, nb_groups in others.items():
            for group in _best_split(key1, key3, nb_groups):
                i += 1
                groups[f'g{i}'] = group
        groups_var = sum([_group_variance(group) for group in groups.values()])
        # Strict '<' keeps the first draw encountered among equally good ones.
        if groups_var < sigma:
            sigma = groups_var
            best_groups = groups
    final_groups[key1] = best_groups

A
B
C
D
E
F
G
I
J
L
M


In [151]:
# Number the selected groups consecutively across all rome1 branches and map
# every rome5 code to the number of the group it belongs to.
final_output = {}
i = 0
for key1, groups1 in final_groups.items():
    for groups5 in groups1.values():
        i += 1
        for rome5 in groups5:
            final_output[rome5] = i

# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (e.g. Windows) end up with a blank line after every row.
with open('optimal_grouping.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # One row per rome5 code: (code, group number)
    writer.writerows(final_output.items())