In [1]:
import sys
sys.path.append("../")
import consts
import api
from subset import addr_operation
from Anonymizer import anonymizer
import pickle
from copy import deepcopy

# Attribute (column) names used by the evaluation, extended with 'seq'.
# NOTE: attr_list is the very same list object as consts.ATTR_LIST, so the
# append below also changes consts.ATTR_LIST. The membership guard keeps the
# cell idempotent: re-running it in a live kernel no longer appends a second
# (third, ...) 'seq' entry to the shared list.
attr_list = consts.ATTR_LIST
if 'seq' not in attr_list:
    attr_list.append('seq')

# Input files: the untouched data set and the two processed variants.
original_file = '../evaluation/original_data.csv'
anonymized_file = '../' + consts.ORIGIN_FILE
watermarked_file = '../' + consts.MODIFIED_FILE

# api.parsed_list returns a (header, records) pair; only the header of the
# original file is kept.
# NOTE(review): the meaning of the second argument (True) is defined in
# api.parsed_list and is not visible here -- presumably "file has a header".
csv_header, org_list = api.parsed_list(original_file, True)
_, anonymized_list = api.parsed_list(anonymized_file, True)
_, watermarked_list = api.parsed_list(watermarked_file, True)

In [2]:
import unicodedata
def left(digit, msg):
    """Left-justify ``msg`` by padding it with spaces to ``digit`` columns.

    Characters whose East Asian width class is Fullwidth ('F'), Wide ('W')
    or Ambiguous ('A') are counted as two columns, every other character as
    one. If ``msg`` already occupies ``digit`` columns or more, it is
    returned without padding.
    """
    double_width = ('F', 'W', 'A')
    columns_used = sum(
        2 if unicodedata.east_asian_width(ch) in double_width else 1
        for ch in msg
    )
    # A non-positive repeat count yields an empty string, i.e. no padding.
    return msg + ' ' * (digit - columns_used)

# refer to http://d.hatena.ne.jp/naoya/20090412/btree
class Node:
    """One node of a general (n-ary) tree.

    Attributes:
        value: label of the node (``None`` until assigned).
        leaf_num: counter used as the weight of the node; it is incremented
            by the tree's insert logic, not by this class.
        children: child ``Node`` objects in insertion order.
        is_leaf: flag marking the node as the end of an inserted path.
    """

    def __init__(self):
        self.value = None
        self.leaf_num = 0
        self.children = []
        self.is_leaf = False

    def insert(self, v):
        """Append a new child node labelled ``v`` (no duplicate check)."""
        s = Node()
        s.value = v
        self.children.append(s)

    def show(self, pad):
        """Print one line: label prefixed by ``pad`` '=' signs, leaf flag, leaf_num."""
        # str() keeps the output unchanged for string labels and avoids a
        # TypeError when the label is still None (or any non-string value).
        print(
            "%s, %s, %d" %
            (left(35, '=' * pad + str(self.value)),
             left(5, str(self.is_leaf)),
             self.leaf_num)
        )

    def show_all(self, pad):
        """Print this node and, unless it is flagged as a leaf, its whole subtree."""
        self.show(pad)
        if self.is_leaf:
            return
        for c in self.children:
            c.show_all(pad + 1)

    def show_children(self, pad):
        """Print this node and, unless it is flagged as a leaf, its direct children."""
        self.show(pad)
        if self.is_leaf:
            return
        for c in self.children:
            c.show(pad + 1)

    def ratio_inverse(self, child_v, child_index=False):
        """Return the inverse-leaf-count weight of the child labelled ``child_v``.

        The weight is ``(1 / child.leaf_num) / sum(1 / c.leaf_num for all
        children) * number_of_children``.

        Args:
            child_v: label of the child to look up. If several children carry
                the same label, the last one is used.
            child_index: when truthy, also return the position of the child
                in ``self.children``.

        Returns:
            ``weight`` or, when ``child_index`` is truthy, ``(weight, index)``.
            If no child is labelled ``child_v`` a diagnostic is printed and
            ``None`` is returned.

        Raises:
            ZeroDivisionError: if any child has ``leaf_num == 0``.
        """
        inverse_sum = 0
        child_inverse = None
        child_i = None
        for i, c in enumerate(self.children):
            inverse = 1 / c.leaf_num
            inverse_sum += inverse
            if c.value == child_v:
                child_inverse = inverse
                child_i = i

        if child_inverse is None:
            # Explicit "not found" handling instead of a bare ``except`` that
            # relied on a NameError and hid every other kind of failure.
            print(':::Error:::')
            if self.children:
                self.children[-1].show(1)
            print(child_v)
            return None

        result = child_inverse / inverse_sum * len(self.children)
        # ``child_index >= 0`` was always true for a bool, so the default
        # call (child_index=False) could never return the plain weight.
        if child_index:
            return result, child_i
        return result
    
class GeneralTree:
    """General (n-ary) tree of label paths built from ``Node`` objects.

    Every inserted path starts with the root label; each node on the path
    has its ``leaf_num`` counter incremented once per insertion.
    """

    def __init__(self):
        self.root = Node()
        self.root.value = None
        self.root.is_leaf = False

    def insert(self, value_l):
        """Insert the path ``value_l`` (root label first) into the tree.

        The first inserted path defines the root label. A later path with a
        different first element only triggers a printed error; the remaining
        elements are still inserted below the existing root.
        """
        r = self.root

        if r.value is None:
            r.value = value_l[0]
        if r.value != value_l[0]:
            print('Error: top value is not root value')
            print('Error value: ', value_l)

        r.leaf_num += 1

        for value in value_l[1:]:
            for child in r.children:
                if value == child.value:
                    r = child
                    break
            else:
                # No existing child carries this label: create it and descend.
                r.insert(value)
                r = r.children[-1]
            r.leaf_num += 1
        r.is_leaf = True

    def show(self):
        """Print the whole tree, one node per line."""
        self.root.show_all(1)

    def search(self, value_l):
        """Follow ``value_l`` (root label first) downwards and return the node reached.

        The root label itself is not checked. NOTE: a path element that
        matches no child of the current node is skipped silently and the
        walk continues from the same node, so for a path that is not fully
        contained in the tree the returned node is only a partial match.
        """
        r = self.root
        for value in value_l[1:]:
            for child in r.children:
                if value == child.value:
                    r = child
                    break
        return r

    def ncp(self, value_l):
        """Return ``leaf_num`` of the node found by ``search`` divided by the root's ``leaf_num``."""
        numerator = self.search(value_l).leaf_num
        denominator = self.root.leaf_num
        return numerator / denominator

    def ratio_inverse(self, value_l):
        """Sum ``Node.ratio_inverse`` along the path ``value_l`` below the root.

        Returns 0 for a path that consists of the root label only.
        """
        r = self.root
        ratio_sum = 0
        for child_v in value_l[1:]:
            ratio, ci = r.ratio_inverse(child_v, child_index=True)
            ratio_sum += ratio
            r = r.children[ci]
        return ratio_sum

    def IL_inverse(self, org_value_l, mod_value_l):
        """Return the relative loss ``(org - mod) / org`` of the path ratio sums.

        ``org`` and ``mod`` are ``ratio_inverse(org_value_l)`` and
        ``ratio_inverse(mod_value_l)``. When the original path is the root
        only, its ratio sum is 0; ``0.0`` is returned in that case instead of
        raising ``ZeroDivisionError``.
        """
        denominator = self.ratio_inverse(org_value_l)
        if denominator == 0:
            return 0.0
        numerator = denominator - self.ratio_inverse(mod_value_l)
        return numerator / denominator

In [3]:
def general_addr(addr_attr, addr):
    """Turn one address record into a list of labels, most general first.

    An address whose first element is '関東' is returned as the one-element
    list holding that element. Otherwise the address is masked repeatedly
    with ``anonymizer.address_masking``; after each step the text that the
    masking removed (the part of the joined previous address that follows
    the joined masked address, '*' stripped from both) is recorded. The loop
    stops when the first '*' of the current address sits at index 1; then
    the first element of the current address and the first element of its
    masked form are recorded. The recorded labels are returned in reverse
    order.

    NOTE(review): the exact semantics of ``anonymizer.address_masking`` are
    defined elsewhere -- presumably it generalizes the address by one level.
    """
    if addr[0] == '関東':
        return [addr[0]]

    labels = []
    current = addr
    while True:
        masked = anonymizer.address_masking(addr_attr, deepcopy(current))

        if '*' in current and current.index('*') == 1:
            labels.append(current[0])
            labels.append(masked[0])
            break

        current_text = ''.join(current).strip('*')
        masked_text = ''.join(masked).strip('*')
        # Drop as many leading characters as the masked text is long.
        labels.append(current_text[len(masked_text):])

        current = masked

    labels.reverse()
    return labels

In [4]:
def general_addrs(addr_attr, addr_l):
    """Apply ``general_addr`` to a deep copy of every address in ``addr_l``.

    Returns a new list with one result per input address, in input order;
    the addresses in ``addr_l`` are not handed to ``general_addr`` directly.
    """
    return [general_addr(addr_attr, deepcopy(one_addr)) for one_addr in addr_l]

In [5]:
def IL_calc(org_l, mod_l, attr_list, IL_method='inverse'):
    """Compute the per-record information loss (IL) of the address columns.

    Args:
        org_l: original records; the last element of each record is its
            sequence number.
        mod_l: modified records in the same layout; records may be missing.
        attr_list: attribute names, passed to
            ``addr_operation.addr_range_catcher`` to locate the address
            columns.
        IL_method: ``'inverse'`` (default, ``GeneralTree.IL_inverse``) or
            ``'NCP'`` (``GeneralTree.ncp``).

    Returns:
        ``(IL_list, mod_addr_l)``: the IL value of every record and the
        address part of every modified record (placeholders included).

    Raises:
        ValueError: if ``IL_method`` is not one of the supported names.

    Side effects:
        ``org_l`` and ``mod_l`` are modified in place: sequence numbers are
        converted to ``int``, both lists are sorted by sequence number and
        ``['empty', seq]`` placeholders are added to ``mod_l`` for missing
        records.

    NOTE(review): the placeholder logic assumes sequence numbers are the
    0-based positions of the records in the sorted original list -- confirm
    against the data files.
    """
    if IL_method not in ('NCP', 'inverse'):
        raise ValueError("IL_method must be 'NCP' or 'inverse': %r" % (IL_method,))

    # convert the sequential number (last element) to int
    for org_r in org_l:
        org_r.append(int(org_r.pop()))
    for mod_r in mod_l:
        mod_r.append(int(mod_r.pop()))

    # sort by sequential number
    org_l.sort(key=lambda x: x[-1])
    mod_l.sort(key=lambda x: x[-1])

    # insert an empty record into mod_l wherever a sequence number is missing;
    # inserting while enumerating is intended: the shifted record is checked
    # again at the next index, so gaps of any length are filled
    for i, mod_r in enumerate(mod_l):
        if i != mod_r[-1]:
            mod_l.insert(i, ['empty', i])

    # records missing at the END of mod_l are not seen by the loop above;
    # without this padding zip() below would silently drop the tail of org_l
    for i in range(len(mod_l), len(org_l)):
        mod_l.append(['empty', i])

    # IL of addr
    addr_first, addr_last = addr_operation.addr_range_catcher(attr_list)

    ## get original addr list
    org_addr_l = [x[addr_first:addr_last+1] for x in org_l]

    ## get modified addr list; a missing record counts as fully generalized
    mod_addr_l = list()
    for mod_r in mod_l:
        if mod_r[0] == 'empty':
            mod_addr_l.append(['関東', '*', '*', '*', '*'])
        else:
            mod_addr_l.append(mod_r[addr_first:addr_last+1])

    ## address tree from original addr list
    addr_tree = GeneralTree()

    addr_attr = ['addr0', 'addr1', 'addr2', 'addr3', 'addr4']

    org_general_addr_l = general_addrs(addr_attr, org_addr_l)
    for addr in org_general_addr_l:
        addr_tree.insert(addr)

    ## IL calculation
    mod_general_addr_l = general_addrs(addr_attr, mod_addr_l)

    IL_list = list()

    if IL_method == 'NCP':
        # NCP
        for mod_addr in mod_general_addr_l:
            IL_list.append(addr_tree.ncp(mod_addr))
    else:
        # IL_inverse
        for org_addr, mod_addr in zip(org_general_addr_l, mod_general_addr_l):
            IL_list.append(addr_tree.IL_inverse(org_addr, mod_addr))

    return IL_list, mod_addr_l

In [6]:
import numpy as np

# Information loss of the anonymized data relative to the original data.
IL_list, anonym_addr_l = IL_calc(org_list, anonymized_list, attr_list)

# Report the extremes first, then the mean over all records.
for label, reducer in (('max: ', max), ('min: ', min), ('IL: ', np.mean)):
    print(label, reducer(IL_list))

# Disabled per-record dump. It is kept as a string literal (not comments)
# so that the value displayed by the cell stays exactly the same.
'''
for mod_addr, IL in zip(anonym_addr_l, IL_list):
    print('mod: ', mod_addr)
    print('IL : ', IL)
'''

max:  1.0
min:  0.5893312471048857
IL:  0.8093013184715786


"\nfor mod_addr, IL in zip(anonym_addr_l, IL_list):\n    print('mod: ', mod_addr)\n    print('IL : ', IL)\n"