In [33]:
import numpy as np
import sonnet as snt
import tensorflow as tf
from sklearn import tree
import copy
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import math
from decimal import *
import pandas as pd

In [34]:
# Directory holding the preprocessed CelebA arrays. The default is the original
# machine-specific location; set the CELEBA_NPY_DIR environment variable to
# point somewhere else without editing the notebook.
data_dir = os.environ.get('CELEBA_NPY_DIR', 'H:\\CodeRange\\CelebA\\npy\\')

# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so
# read every array inside a `with` block; indexing materialises each array in
# memory and the file handle is released when the block exits.
with np.load(os.path.join(data_dir, 'celeba_attr.npz')) as info_pak:
    train_idxs = info_pak['train_idxs']
    val_idxs = info_pak['val_idxs']
    test_idxs = info_pak['test_idxs']
    attributes_names = info_pak['attribute_names']
    attributes = info_pak['attributes']

# Attribute rows restricted to the train / test partitions.
train_label = attributes[train_idxs]
test_label = attributes[test_idxs]

# skew_rank[k] is the initial attribute index of the attribute at skew rank k.
# NOTE(review): how the ranking itself was derived is not shown in this notebook.
skew_rank = [24, 39, 2, 21, 31, 36, 19, 20, 18, 33, 25, 27, 1, 6, 8, 7, 32, 3, 11, 34, 5, 9, 12, 37, 23, 0, 28, 38, 29, 15, 16, 13, 30, 10, 35, 14, 26, 17, 22, 4]

## Analyze Specs of CelebA

In [None]:
def describe_partition(idxs):
    """Summarise the per-attribute class balance of one partition as CSV text.

    Reads the notebook-level ``attributes`` matrix (one row per instance, one
    column per attribute) and ``attributes_names``.

    Args:
        idxs: Row indices into ``attributes`` selecting the partition.

    Returns:
        str: A line with the instance count, a header line, then one line per
        attribute with its index, name, number/fraction of positives and
        number/fraction of negatives. Every line ends with a newline.
    """
    partition = attributes[idxs]

    num_instances = partition.shape[0]
    num_classes = partition.shape[1]

    # One vectorised column sum replaces four Python-level sum() passes per
    # attribute. NumPy also accumulates small integer dtypes in a wide integer,
    # so the counts cannot wrap around; .tolist() yields plain Python scalars.
    positives_per_class = partition.sum(axis=0).tolist()

    lines = [
        'Total # of instances, {}\n'.format(num_instances),
        'Attribute Index, Attribute Name, # of Positives,  % of Positives, # of Negatives, % of Negatives\n',
    ]
    for i in range(num_classes):
        positives = positives_per_class[i]
        positive_ratio = positives / num_instances
        lines.append('{}, {}, {}, {}, {}, {}\n'.format(
            i, attributes_names[i], positives, positive_ratio,
            num_instances - positives, 1 - positive_ratio))
    return ''.join(lines)

# Describe each partition in turn (title line, per-attribute table, blank line)
# and write the concatenated report to celeba_specs.csv.
partition_sections = (
    ('Training Partition', train_idxs),
    ('Test Partition', test_idxs),
    ('Validation Partition (not used)', val_idxs),
)
output = ''.join(
    title + '\n' + describe_partition(partition_idxs) + '\n'
    for title, partition_idxs in partition_sections
)
with open('celeba_specs.csv', 'w') as f:
    f.write(output)
   

## Generate Figures

In [None]:
def add_info_to_node_label(in_filename, out_filename, node_id, info):
    """Append ``info`` as an extra line to one node's label in a Graphviz .dot file.

    The node is located by a line starting with ``'<node_id> [label='``; the
    text is inserted just before the label's closing ``'", fillcolor='``,
    preceded by a literal backslash-n (Graphviz's in-label line break).

    Args:
        in_filename: Path of the .dot file to read.
        out_filename: Path of the .dot file to write (may equal in_filename).
        node_id: Identifier of the node whose label is extended.
        info: Text to append to the label.

    Raises:
        Exception: If no line defines ``node_id``.
        ValueError: If the node's line has no ``'", fillcolor='`` marker. The
            output file is not written in either error case.
    """
    node_prefix = '{} [label='.format(node_id)
    label_end_marker = '", fillcolor='

    with open(in_filename) as f:
        linelist = f.readlines()

    for i, s in enumerate(linelist):
        if s.startswith(node_prefix):
            idx = s.find(label_end_marker)
            if idx == -1:
                # find() returning -1 would otherwise be used as a slice index
                # and splice the text in front of the line's last character.
                raise ValueError(
                    'label of node {} has no fillcolor marker'.format(node_id))
            linelist[i] = s[:idx] + "\\n" + info + s[idx:]
            break
    else:
        raise Exception('node {} not found'.format(node_id))

    with open(out_filename, "w+") as f:
        f.writelines(linelist)

def tree2dot(decision_tree, path):
    """Export ``decision_tree`` to a Graphviz .dot file written at ``path``.

    Nodes are drawn filled and rounded, and sample counts are shown as
    proportions.
    """
    # NOTE(review): export_graphviz documents class_names as listed in ascending
    # class order, so class 0 is rendered as 'positive' here - confirm this
    # matches the label encoding the trees were trained with.
    export_options = {
        'out_file': path,
        'class_names': ['positive', 'negative'],
        'filled': True,
        'rounded': True,
        'proportion': True,
    }
    tree.export_graphviz(decision_tree, **export_options)

In [None]:
def update_tree(skewRankIdx):
    """Render one annotated tree diagram per Y_o attribute for a saved model.

    Loads the latent variables and the pickled decision tree stored under
    ``./saved_model/SkewRank<idx>_<name>/``. For every attribute Y_o it exports
    the tree to a temporary .dot file, appends the train/test negative:positive
    counts of Y_o to each node label, renders ``tree_<Y_o name>.png`` with the
    ``dot`` command and deletes the .dot file.

    Relies on the notebook-level ``skew_rank``, ``attributes_names``,
    ``train_label``, ``test_label``, ``tree2dot`` and ``add_info_to_node_label``.

    Args:
        skewRankIdx: Position in ``skew_rank`` of the model to process.

    Returns:
        tuple: ``(train_count_list, test_count_list)``. Each list has one entry
        per Y_o attribute of the form ``[node_neg, node_pos]``, where both are
        lists of per-node instance counts (plain ints).
    """
    initIdx = skew_rank[skewRankIdx]
    path = "./saved_model/SkewRank{:02d}_{}/".format(skewRankIdx, attributes_names[initIdx])
    train_latent_variable = np.reshape(np.load(os.path.join(path, 'post_latent_var-train.npy')), [-1, 50])
    test_latent_variable = np.reshape(np.load(os.path.join(path, 'post_latent_var-test.npy')), [-1, 50])
    # NOTE(review): pickle.load can execute arbitrary code; only open model
    # files produced by this project.
    with open(os.path.join(path, 'saved_model/decision_tree.pkl'), 'rb') as dt_file:
        decision_tree = pickle.load(dt_file)

    # The path an instance takes through the tree does not depend on Y_o, so
    # compute the (sparse) node-indicator matrices once instead of once per
    # attribute, and keep them sparse instead of densifying them.
    train_decision_path = decision_tree.decision_path(train_latent_variable)
    test_decision_path = decision_tree.decision_path(test_latent_variable)

    def count(decision_path, label, class_other_idx):
        """Count, per node, the instances reaching it with positive / negative Y_o."""
        instances_num, nodes_num = decision_path.shape
        # the y_o here is in initial index
        is_positive = np.asarray(label)[:instances_num, class_other_idx].astype(bool)
        # Column sums of the indicator matrix give the visits per node; the
        # product with the positive mask gives the positive visits per node.
        visits_per_node = np.asarray(decision_path.sum(axis=0)).ravel()
        positives_per_node = np.asarray(
            decision_path.T.dot(is_positive.astype(np.int64))).ravel()
        node_pos = [int(p) for p in positives_per_node]
        node_neg = [int(v) - p for v, p in zip(visits_per_node, node_pos)]
        return node_pos, node_neg, nodes_num

    def draw_tree_for_class_other(class_other_idx):
        """Export, annotate and render the tree for one Y_o attribute."""
        target_dot_path = os.path.join(path, 'tree__{}.dot'.format(attributes_names[class_other_idx]))
        target_png_path = os.path.join(path, 'tree_{}.png'.format(attributes_names[class_other_idx]))
        tree2dot(decision_tree, target_dot_path)

        train_node_pos, train_node_neg, _ = count(train_decision_path, train_label, class_other_idx)
        test_node_pos, test_node_neg, nodes_num = count(test_decision_path, test_label, class_other_idx)
        for i in range(nodes_num):
            info = "train-Y_o[neg:pos]={}:{}\\ntest-Y_o[neg:pos]={}:{}".format(
                train_node_neg[i], train_node_pos[i], test_node_neg[i], test_node_pos[i])
            add_info_to_node_label(target_dot_path, target_dot_path, i, info)
        os.system("dot -Tpng {} -o {}".format(target_dot_path, target_png_path))
        os.remove(target_dot_path)
        return [train_node_neg, train_node_pos], [test_node_neg, test_node_pos]

    train_count_list = []
    test_count_list = []
    # skew_rank lists every attribute index once, so its length is the number
    # of Y_o attributes (40 for the list defined in this notebook).
    for i in range(len(skew_rank)):
        print('- {}'.format(i))
        train_count, test_count = draw_tree_for_class_other(i)
        train_count_list.append(train_count)
        test_count_list.append(test_count)
    return train_count_list, test_count_list


## Update Tree Diagram with Distribution of Y_o

In [None]:
# Annotate and render the diagrams for the first 10 skew ranks only; each
# update_tree call renders one diagram per Y_o attribute. The returned counts
# are discarded here.
for i in range(10):
    print(i)
    update_tree(i)

## Generate Original Tree Diagrams

In [None]:
# Render the unannotated decision tree of every skew rank to
# decision_tree_CPVAE_best.png, using a temporary .dot file that is deleted
# once the `dot` command has run.
for i, initIdx in zip(range(40), skew_rank):
    path = "./saved_model/SkewRank{:02d}_{}/".format(i, attributes_names[initIdx])
    pickle_path = os.path.join(path, 'saved_model/decision_tree.pkl')
    with open(pickle_path, 'rb') as dt_file:
        decision_tree = pickle.load(dt_file)

    target_dot_path = os.path.join(path, 'decision_tree_CPVAE_best.dot')
    tree2dot(decision_tree, target_dot_path)

    target_png_path = os.path.join(path, 'decision_tree_CPVAE_best.png')
    render_command = "dot -Tpng {} -o {}".format(target_dot_path, target_png_path)
    os.system(render_command)
    os.remove(target_dot_path)

## Save node_count to .npy

In [None]:
# Collect the per-node Y_o counts of every skew rank. After the loop the nested
# lists are indexed [skew rank][Y_o][0 = negatives, 1 = positives][node].
train_count = []
test_count = []
for i in tqdm(range(40)):
    train_count_with_yo_i, test_count_with_yo_i = update_tree(i)
    train_count.append(train_count_with_yo_i)
    test_count.append(test_count_with_yo_i)

# Persist the counts: PhiCoefficientReportGenerator loads exactly these two
# files, so without the saves a fresh run has nothing to read.
np.save('train_node_count.npy', train_count)
np.save('test_node_count.npy', test_count)

## Generate .csv Report for Phi Coefficients of Roots

In [35]:
# Very high Decimal precision so products of four instance counts and their
# square roots are evaluated without rounding issues.
getcontext().prec = 500
# Y_c indices used by this class are skew-rank positions (see _skew_rank);
# Y_o indices are initial attribute indices.
class PhiCoefficientReportGenerator:
    """Build tables of phi coefficients between one tree split and every Y_o.

    For the tree of each attribute Y_c and a chosen internal node, the split
    (go left / go right) is cross-tabulated against each attribute Y_o using
    the saved per-node counts, and the phi coefficient of that 2x2 table is
    reported.

    Attributes:
        train_count, test_count: Count arrays loaded from
            ``train_node_count.npy`` / ``test_node_count.npy`` and indexed as
            ``[y_c skew rank, y_o, 0 = negatives / 1 = positives, node]``.
    """

    # Original machine-specific location of celeba_attr.npz.
    DEFAULT_DATA_DIR = 'H:\\CodeRange\\CelebA\\npy\\'
    # SKEW_RANK[k] is the initial attribute index of the attribute at skew rank k.
    SKEW_RANK = [24, 39, 2, 21, 31, 36, 19, 20, 18, 33, 25, 27, 1, 6, 8, 7, 32, 3, 11, 34, 5, 9, 12, 37, 23, 0, 28, 38, 29, 15, 16, 13, 30, 10, 35, 14, 26, 17, 22, 4]

    def __init__(self, path='./', data_dir=None):
        """Load the node counts and the attribute names.

        Args:
            path: Directory containing the two ``*_node_count.npy`` files and
                the ``saved_model`` folder.
            data_dir: Directory containing ``celeba_attr.npz``; defaults to
                ``DEFAULT_DATA_DIR``.
        """
        self._path = path

        self._train_path = os.path.join(self._path, 'train_node_count.npy')
        self._test_path = os.path.join(self._path, 'test_node_count.npy')
        self.train_count = np.load(self._train_path)
        self.test_count = np.load(self._test_path)

        if data_dir is None:
            data_dir = self.DEFAULT_DATA_DIR
        # Close the .npz archive once the names have been read.
        with np.load(os.path.join(data_dir, 'celeba_attr.npz')) as info_pak:
            self._attributes_names = info_pak['attribute_names']
        self._skew_rank = list(self.SKEW_RANK)

    def generate(self, node_id):
        """Return ``(train_table, test_table)`` of phi coefficients at ``node_id``.

        Each table is a DataFrame with columns ``Y_c_skew_idx``, ``Y_c_name``,
        ``Y_o``, ``node`` and ``Phi``; rows are grouped by Y_c in skew-rank
        order and, within a group, sorted by decreasing ``|Phi|``.
        """
        def get_table(node_count):
            rows = []
            for skew_rank_id in range(len(self._skew_rank)):
                print('skew rankd: {}'.format(skew_rank_id))
                rows.extend(self.get_table_for_y_c(skew_rank_id, node_id, node_count))
            return pd.DataFrame({
                'Y_c_skew_idx': [row[0] for row in rows],
                'Y_c_name': [row[1] for row in rows],
                'Y_o': [row[2] for row in rows],
                'node': [node_id] * len(rows),
                'Phi': [row[3] for row in rows]
            })

        return get_table(self.train_count), get_table(self.test_count)

    @staticmethod
    def _to_decimal(count):
        """Convert one count (NumPy scalar or plain number) to ``Decimal``."""
        return Decimal(count.item() if hasattr(count, 'item') else count)

    def get_table_for_y_c(self, y_c_i, node_id, node_count):
        '''
        Phi coefficient of the split at ``node_id`` against every Y_o.

                     | y_o=1 | y_o=0 |
            go_left  | n11   | n10   |
            go_right | n01   | n00   |

        Args:
            y_c_i: Skew-rank position of the attribute whose tree is used.
            node_id: Internal node whose left/right split is analysed.
            node_count: Count array indexed as
                ``[y_c skew rank, y_o, 0 = negatives / 1 = positives, node]``.

        Returns:
            list: Rows ``[y_c_i, y_c_name, y_o_name, phi]`` sorted by
            decreasing ``abs(phi)``. When the phi denominator is zero, the
            add-one smoothed coefficient is used instead and the case is
            printed.

        Raises:
            ValueError: If ``node_id`` is a leaf (it has no split to analyse).
        '''
        tree_struct = self.load_tree_by_skew_rank(y_c_i).tree_
        left_node = tree_struct.children_left[node_id]
        right_node = tree_struct.children_right[node_id]
        if left_node < 0 or right_node < 0:
            # A negative child id would otherwise silently index the last
            # node of node_count and produce meaningless coefficients.
            raise ValueError('node {} has no children to compare'.format(node_id))

        y_c_name = self._attributes_names[self._skew_rank[y_c_i]]
        rows = []
        for y_o_i in range(len(self._skew_rank)):
            y_o_name = self._attributes_names[y_o_i]
            yo_neg = self._to_decimal(node_count[y_c_i, y_o_i, 0, node_id])
            yo_pos = self._to_decimal(node_count[y_c_i, y_o_i, 1, node_id])

            n11 = self._to_decimal(node_count[y_c_i, y_o_i, 1, left_node])
            n10 = self._to_decimal(node_count[y_c_i, y_o_i, 0, left_node])
            n01 = self._to_decimal(node_count[y_c_i, y_o_i, 1, right_node])
            n00 = self._to_decimal(node_count[y_c_i, y_o_i, 0, right_node])

            n_left = n11 + n10
            n_right = n01 + n00
            denominator = n_left * n_right * yo_pos * yo_neg
            if denominator == 0:
                # An empty row or column makes phi undefined; report it and
                # fall back to the add-one smoothed table.
                print('y_c: {}'.format(y_c_name))
                print('y_o: {}'.format(y_o_name))
                print(n11, n10, n01, n00)
                phi = float(((n11+1)*(n00+1) - (n10+1)*(n01+1))/((n_left+2)*(n_right+2)*(yo_pos+2)*(yo_neg+2)).sqrt())
                print(phi)
                print('')
            else:
                phi = float((n11*n00 - n10*n01)/denominator.sqrt())
            rows.append([y_c_i, y_c_name, y_o_name, phi])

        rows.sort(key=lambda row: abs(row[3]), reverse=True)
        return rows

    def load_tree_by_skew_rank(self, i):
        """Unpickle and return the decision tree saved for skew rank ``i``."""
        initIdx = self._skew_rank[i]
        path_yc = "saved_model/SkewRank{:02d}_{}/".format(i, self._attributes_names[initIdx])
        path_yc = os.path.join(self._path, path_yc)
        # NOTE(review): pickle.load can execute arbitrary code; only open model
        # files produced by this project.
        with open(os.path.join(path_yc, 'saved_model/decision_tree.pkl'), 'rb') as dt_file:
            decision_tree = pickle.load(dt_file)
        return decision_tree

In [36]:
# Build the train/test phi-coefficient tables for the split at node 1 and write
# each of them to a CSV file.
# NOTE(review): the section title speaks of "Roots", but node_id=1 is not node 0;
# the confirmation cell at the end of the notebook asserts node 1 is the left
# child of node 0. Confirm that node 1 is the node intended here.
gen = PhiCoefficientReportGenerator()
table_tr, table_te = gen.generate(node_id=1)
# Optional: print the full training table without pandas truncating rows/columns.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(table_tr)
table_tr.to_csv('train_phi_coefficients.csv', index=False)
table_te.to_csv('test_phi_coefficients.csv', index=False)

skew rankd: 0
skew rankd: 1
skew rankd: 2
skew rankd: 3
skew rankd: 4
skew rankd: 5
skew rankd: 6
skew rankd: 7
skew rankd: 8
skew rankd: 9
skew rankd: 10
skew rankd: 11
skew rankd: 12
skew rankd: 13
skew rankd: 14
skew rankd: 15
skew rankd: 16
skew rankd: 17
skew rankd: 18
skew rankd: 19
skew rankd: 20
skew rankd: 21
skew rankd: 22
skew rankd: 23
skew rankd: 24
skew rankd: 25
skew rankd: 26
skew rankd: 27
skew rankd: 28
skew rankd: 29
skew rankd: 30
skew rankd: 31
skew rankd: 32
skew rankd: 33
skew rankd: 34
skew rankd: 35
skew rankd: 36
skew rankd: 37
skew rankd: 38
skew rankd: 39
skew rankd: 0
skew rankd: 1
skew rankd: 2
skew rankd: 3
skew rankd: 4
skew rankd: 5
skew rankd: 6
skew rankd: 7
skew rankd: 8
skew rankd: 9
skew rankd: 10
skew rankd: 11
skew rankd: 12
skew rankd: 13
skew rankd: 14
skew rankd: 15
skew rankd: 16
skew rankd: 17
skew rankd: 18
skew rankd: 19
skew rankd: 20
skew rankd: 21
skew rankd: 22
skew rankd: 23
skew rankd: 24
skew rankd: 25
skew rankd: 26
skew rankd: 27


## Get a Tree and Test

In [None]:
# Load the pickled decision tree of skew rank 0 into `decision_tree` for the
# ad-hoc inspection cells below.
i=0
initIdx = skew_rank[i]
path = "./saved_model/SkewRank{:02d}_{}/".format(i, attributes_names[initIdx])
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(os.path.join(path, 'saved_model/decision_tree.pkl'), 'rb') as dt_file:
    decision_tree = pickle.load(dt_file)

In [None]:
# Exploratory cell: print the built-in documentation of sklearn's low-level
# Tree class (presumably the type of `decision_tree.tree_`).
# NOTE(review): `_tree` is a private sklearn module and this import sits outside
# the top import cell - consider dropping this cell from the final notebook.
import sklearn.tree
help(sklearn.tree._tree.Tree)

In [None]:
# Display the n_node_samples entry of node 1 for the tree loaded above
# (per sklearn's Tree documentation, the number of training samples reaching that node).
decision_tree.tree_.n_node_samples[1]

## Confirm that nodes #1 and #2 are the children of node #0 in every tree

In [41]:
# Sanity check: in every saved tree, node 0 has node 1 as its left child and
# node 2 as its right child.
for i in range(40):
    # Deliberately not named `tree`: that would rebind the `sklearn.tree` module
    # imported at the top of the notebook, and tree2dot() calls
    # tree.export_graphviz, so any later diagram export would fail.
    tree_struct = gen.load_tree_by_skew_rank(i).tree_
    assert tree_struct.children_left[0]==1
    assert tree_struct.children_right[0]==2