In [1]:
import pandas as pd
from itertools import combinations, combinations_with_replacement
import numpy as np
import csv
import operator

In [2]:
# Load Data 

data = pd.read_csv("rome_tension.csv")

# Define tree

data.sort_values(by=['rome1','rome3','rome5'])
cat = data.drop(columns=['BE_id','T']).drop_duplicates()
tree1 = dict(cat.groupby(by='rome1')['rome3'].unique())
tree3 = dict(cat.groupby(by='rome3')['rome5'].unique())
tree = {key1:{key3:list(tree3[key3]) for key3 in tree1[key1]} for key1 in tree1.keys()}

# Delete these groups: they are too big!! 
#del tree['H']
#del tree['K']
#del tree['N']

In [28]:
def objective(x,y): 
    return (x-y)**2
    #return x + y

# groups class
class Group:
    def __init__(self,nodes):
        self.nodes = nodes 
        self.nb_nodes = len(nodes)
        self.nodes_type = len(self.nodes[0])
        if self.nodes_type > 1: self.branch = self.nodes[0][0]
        else: self.branch = ''
        if self.nodes_type == 5: 
            self.splitable = self.nb_nodes > 1
            self.breakable = False
            self.contains = self.nodes
            self.terminal = self.contains
            self.size = len(self.contains)
        if self.nodes_type == 3:
            self.contains = []
            for node in self.nodes:
                self.contains += tree[self.branch][node]
            self.terminal = self.contains
            self.size = len(self.contains)
            self.splitable = self.nb_nodes > 1
            self.breakable = self.nb_nodes == 1 and self.size > 1
        if self.nodes_type == 1:
            self.contains = []
            for node in self.nodes:
                self.contains += list(tree[node].keys())
            self.size = len(self.contains)
            self.terminal = []
            for key in self.contains: 
                self.terminal += tree[key[0]][key]
            self.splitable = self.nb_nodes > 1
            self.breakable = self.nb_nodes == 1 and self.size > 1
        self.var = data.loc[data['rome5'].isin(self.terminal)]['T'].var()
                
    def split(self):
        
        if self.nodes_type in [1,3]:
            
            if self.splitable:
                n = self.nb_nodes
                if n % 2 == 0: k = n // 2
                else: k = (n-1) // 2
                if k == 1: gen_splits = ([x[0]] for x in combinations(self.nodes,k))
                else: gen_splits = (list(x) for x in combinations(self.nodes,k))
                sig = float('Inf')
                for split in gen_splits:
                    g1 = Group(split)
                    g2 = Group([key for key in self.nodes if key not in split])
                    ob  = objective(g2.var,g1.var)
                    if ob < sig:
                        sig = ob
                        best_g1, best_g2 = g1, g2
                return [best_g1, best_g2]
            
            if self.breakable:    
                n = self.size
                if n % 2 == 0: k = n // 2
                else: k = (n-1) // 2
                if k == 1: gen_splits = ([x[0]] for x in combinations(self.contains,k))
                else: gen_splits = (list(x) for x in combinations(self.contains,k))
                sig = float('Inf')
                for split in gen_splits:
                    g1 = Group(split)
                    g2 = Group([key for key in self.contains if key not in split])
                    ob  = objective(g2.var,g1.var)
                    if ob < sig:
                        sig = ob
                        best_g1, best_g2 = g1, g2
                return [best_g1, best_g2]
        
        if self.nodes_type == 5:
            
            if self.splitable:
                n = self.nb_nodes
                if n % 2 == 0: k = n // 2
                else: k = (n-1) // 2
                if k == 1: gen_splits = ([x[0]] for x in combinations(self.nodes,k))
                else: gen_splits = (list(x) for x in combinations(self.nodes,k))
                sig = float('Inf')
                for split in gen_splits:
                    g1 = Group(split)
                    g2 = Group([key for key in self.nodes if key not in split])
                    ob  = objective(g2.var,g1.var)
                    if ob < sig:
                        sig = ob
                        best_g1, best_g2 = g1, g2
                return [best_g1, best_g2]

        if self.splitable == False and self.breakable == False:
            print(f'{self.nodes} is unbreakable.')
            return self, []
            
def order_groups(groups):
    ordered_groups = sorted([[group,group.var] for group in groups], key=operator.itemgetter(1), reverse = True)
    return [group[0] for group in ordered_groups]
    

### Selecting biggest variance in each split:

In [23]:
final_groups1 = {}
j = 0
for key1, keys3 in tree.items():
    groups = order_groups([Group(list(keys3.keys()))])
    G = len(keys3)
    g = len(groups)
    while g < G:
        i , split = 0, False
        while i < len(groups) and split == False:
            if groups[i].splitable or groups[i].breakable:
                print(f'Spliting {groups[i].nodes}')
                groups = order_groups(groups[:i] + groups[i].split() + groups[i+1:])
                split = True
            else:
                i += 1
        if split == False:
            print(f'Impossible to split or break group anymore: only {g} out of {G} groups created')
            g = G
        else:
            g += 1
    for group in groups:
        j += 1
        final_groups1[f'g{j}'] = group.terminal

Spliting ['A11', 'A12', 'A13', 'A14', 'A15']
Spliting ['A13', 'A14', 'A15']
Spliting ['A13']
Spliting ['A1301', 'A1302']
Spliting ['B11', 'B12', 'B13', 'B14', 'B15', 'B16', 'B17', 'B18']
Spliting ['B13', 'B15', 'B16', 'B18']
Spliting ['B13', 'B16']
Spliting ['B16']
Spliting ['B1602', 'B1604']
Spliting ['B1601', 'B1603']
Spliting ['B13']
Spliting ['C11', 'C12', 'C13', 'C14', 'C15']
Spliting ['C11', 'C12', 'C13']
Spliting ['C11', 'C12']
Spliting ['C11']
Spliting ['D11', 'D12', 'D13', 'D14', 'D15']
Spliting ['D11', 'D12', 'D15']
Spliting ['D12', 'D15']
Spliting ['D15']
Spliting ['E11', 'E12', 'E13', 'E14']
Spliting ['E12', 'E13']
Spliting ['E13']
Spliting ['F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17']
Spliting ['F11', 'F12', 'F14', 'F16']
Spliting ['F11', 'F12']
Spliting ['F12']
Spliting ['F1203', 'F1204']
Spliting ['F11']
Spliting ['G11', 'G12', 'G13', 'G14', 'G15', 'G16', 'G17', 'G18']
Spliting ['G12', 'G13', 'G14', 'G17']
Spliting ['G13', 'G14']
Spliting ['G13']
Spliting ['G1301', '

#### Saving output 

In [24]:
final_output = {}
i = 0
for key, group in final_groups1.items():
    for rome in group:
        final_output[rome] = key[1:]
with open('optimal_grouping_fresh1.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in final_output.items():
       writer.writerow([key, value])

### Trying all possible splits:

In [29]:
final_groups2 = {}
j = 0
for key1, keys3 in tree.items():
    groups = order_groups([Group(list(keys3.keys()))])
    G = len(keys3)
    g = len(groups)
    while g < G:
        i, sig, split = 0, float('Inf'), False
        while i < len(groups):
            if groups[i].splitable or groups[i].breakable:
                new_groups = order_groups(groups[:i] + groups[i].split() + groups[i+1:])
                var = sum([group.var for group in new_groups])
                if var < sig:
                    split = True
                    sig = var
                    best_groups = new_groups
                    i_best = i 
                i += 1
            else:
                i += 1
        if split == False:
            print(f'Impossible to split or break group anymore: only {g} out of {G} groups created')
            g = G
        else:
            g += 1
            print(f'Best split: {groups[i_best].nodes}')
            groups = best_groups 
    for group in groups:
        j += 1
        final_groups2[f'g{j}'] = group.terminal

Best split: ['A11', 'A12', 'A13', 'A14', 'A15']
Best split: ['A11', 'A13']
Best split: ['A12', 'A14', 'A15']
Best split: ['A14']
Best split: ['B11', 'B12', 'B13', 'B14', 'B15', 'B16', 'B17', 'B18']
Best split: ['B11', 'B16', 'B17', 'B18']
Best split: ['B17', 'B18']
Best split: ['B11', 'B16']
Best split: ['B18']
Best split: ['B1801', 'B1805', 'B1806']
Best split: ['B1801', 'B1806']
Best split: ['C11', 'C12', 'C13', 'C14', 'C15']
Best split: ['C11', 'C15']
Best split: ['C15']
Best split: ['C1503', 'C1504']
Best split: ['D11', 'D12', 'D13', 'D14', 'D15']
Best split: ['D12', 'D14']
Best split: ['D14']
Best split: ['D1404', 'D1406', 'D1407', 'D1408']
Best split: ['E11', 'E12', 'E13', 'E14']
Best split: ['E11', 'E13']
Best split: ['E11']
Best split: ['F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17']
Best split: ['F11', 'F13', 'F17']
Best split: ['F11', 'F13']
Best split: ['F13']
Best split: ['F17']
Best split: ['F1701', 'F1704', 'F1706']
Best split: ['G11', 'G12', 'G13', 'G14', 'G15', 'G16', 

In [30]:
final_output = {}
i = 0
for key, group in final_groups2.items():
    for rome in group:
        final_output[rome] = key[1:]
with open('groups_min_dist_no_realloc.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in final_output.items():
       writer.writerow([key, value])

### Introducing top level allocation: 

In [82]:
final_groups3 = {}
j = 0
groups = [Group(list(tree.keys()))]
G = sum([len(tree[key].values()) for key in tree.keys()])
g = len(groups)
while g < G:
    i, sig, split = 0, float('Inf'), False
    while i < len(groups):
        if groups[i].splitable or groups[i].breakable:
            new_groups = groups[:i] + groups[i].split() + groups[i+1:]
            var = sum([group.var for group in new_groups])
            if var < sig:
                split = True
                sig = var
                best_groups = new_groups
                i_best = i 
            i += 1
        else:
            i += 1
    if split == False:
        print(f'Impossible to split or break group anymore: only {g} out of {G} groups created')
        g = G
    else:
        g += 1
        print(f'Created {g}/{G} nodes. Last best split: {groups[i_best].nodes}.')
        groups = best_groups 
for group in groups:
    j += 1
    final_groups3[f'g{j}'] = group.terminal

Best split: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']
Best split: ['A', 'D', 'E', 'F', 'G', 'I', 'M']
Best split: ['D', 'F', 'G']
Best split: ['D']
Best split: ['D13', 'D14']
Best split: ['D14']
Best split: ['D1401', 'D1402', 'D1403', 'D1407']
Best split: ['D1401', 'D1407']
Best split: ['D1402', 'D1403']
Best split: ['D1404', 'D1405', 'D1406', 'D1408']
Best split: ['D1404', 'D1406']
Best split: ['D11', 'D12', 'D15']
Best split: ['D11']
Best split: ['D1101', 'D1102', 'D1104']
Best split: ['D1101', 'D1104']
Best split: ['D1405', 'D1408']
Best split: ['D1103', 'D1105', 'D1106', 'D1107']
Best split: ['D1105', 'D1106']
Best split: ['D1103', 'D1107']
Best split: ['D12', 'D15']
Best split: ['D12']
Best split: ['D1202', 'D1208', 'D1209', 'D1211', 'D1212', 'D1213', 'D1214']
Best split: ['D1202', 'D1212', 'D1214']
Best split: ['D1202', 'D1212']
Best split: ['D1208', 'D1209', 'D1211', 'D1213']
Best split: ['D1208', 'D1211']
Best split: ['D1209', 'D1213']
Best split: [

In [83]:
final_output = {}
i = 0
for key, group in final_groups3.items():
    for rome in group:
        final_output[rome] = key[1:]
with open('optimal_grouping_fresh3.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in final_output.items():
       writer.writerow([key, value])

### Merge to ROME arbo 

In [17]:
def adjust(cat):
    if len(cat) == 1:
        cat = '0' + cat
        return cat

arbo = pd.read_csv("arbor_rome.csv")
arbo.columns = 'ind', 'div', 'cat', 'label', 'exp', 'int'
arbo = arbo.filter(items=['ind','div','cat','label'])
arbo = arbo.dropna(subset=['cat'],axis=0)
arbo['cat'] = arbo['cat'].apply(int).apply(str).apply(adjust)
arbo['div'] = arbo['div'].apply(int).apply(str)
arbo['rome'] = arbo['ind'] + arbo['div'] + arbo['cat']
arbo = arbo.drop_duplicates(subset='rome').dropna(subset=['rome'],axis=0).reset_index(drop=True)
arbo['rome3'] = arbo['ind'] + arbo['div']
arbo = arbo.filter(items=['ind','rome3','rome','label'])

In [31]:
name = 'groups_min_dist_no_realloc.csv'
results = pd.read_csv(name,header=None)
results.columns = 'rome', 'group'
results = pd.merge(results, arbo, on='rome', how='left')
results = results[['ind','rome3','rome','label','group']]
results = results.sort_values(by='group').reset_index(drop=True)
results.to_csv(name[:-4] + '_headings.csv',index=False)

In [32]:
results

Unnamed: 0,ind,rome3,rome,label,group
0,A,A13,A1301,Conseil et assistance technique en agriculture,1
1,A,A13,A1302,Contrôle et diagnostic technique en agriculture,1
2,A,A13,A1303,Ingénierie en agriculture et environnement nat...,1
3,A,A14,A1403,Aide d'élevage agricole et aquacole,2
4,A,A14,A1409,Élevage de lapins et volailles,2
5,,,A1410,,2
6,,,A1411,,2
7,,,A1412,,2
8,,,A1414,,2
9,,,A1415,,2
