# Evaluation ranking between cskg and USF-FAN


This notebook performs evaluation ranking between cksg and USF-FAN, it will calculate MAP for different entity embedding gz files 

In [1]:
import json
import copy
import faiss
import gzip
import os
import numpy as np
from lxml import etree

### Parameters for invoking the notebook

- `cue_target`: a xml file contains the grounding truth of USF-FAN dataset
- `cskg_connected`: a tsv file contains the raw cskg entity information
- `embedding_folder`: a folder contains all of the embedding gz files
- `MAP_res`: a json file contains the MAP result for each cskg embedding gz file

In [14]:
### Parameters
cue_target = '../tmp/cue-target.xml'
cskg_connected = '../input/cskg_connected.tsv'
embedding_folder = '../output/embeddings'
MAP_res = '../output/MAP_res.json'

###  Utils

- dict_to_json(dict_,output_file):  convert dictionary to json file
- get_file_path(embedding_folder):  get all of embedding gz files 
- res_to_json(result,MAP_res_file):     dump MAP result into json file


In [3]:
def dict_to_json(dict_,output_file):
    with open(output_file,'w') as f:
        json.dump(dict_,f)
         
def get_file_path(embedding_folder):
    gz_list = []
    for gz_file in os.listdir(embedding_folder):
        file_path = os.path.join(embedding_folder, gz_file)
        gz_list.append(file_path)   
    return gz_list

def res_to_json(result,MAP_res_file):
    res_dict = {}
    for i in result:
        gz_file = i[0]
        MAP = i[1]
        res_dict[gz_file] = MAP
    with open(MAP_res_file,'w') as f:
        json.dump(res_dict,f)

###  Prepara data

- xml_load(input_file): load USF-FAN dataset and convert it into dictionary format
- create_cskg_index(tsv_file): load cskg_connected.tsv and convert it into dictionary format
- load_ent_embeddings(input_file): load entity embeddings and create two dictionaries to store node index and embeddings

In [4]:
def xml_load(input_file):   # cue-target.xml'
    tree = etree.parse(input_file)
    root = tree.getroot()
    # create a dict to store ground truth sets, 
    # example : `p={'car': ['wheel', 'driver', ...], 'book`: [...]}`
    ground_truth = {}
    for cue_ele in root:
        key = cue_ele.get('word')
        ground_truth[key] = []
        for word_ele in cue_ele:
            ground_truth[key].append(word_ele.get('word'))
    return ground_truth


def create_cskg_index(tsv_file): # cskg_connected.tsv
    cskg_index_dict = {}
    #  create a dict to store cskg data set   label: node_list
    #  example : `p={'turtle':  ['Q1705322', '/c/en/turtle', ...], 'book`: [...]}`
    with open(tsv_file) as f:
        for line in f:
            content = line.split('\t')
            if content[0]!='id': # ignore the first time 
                node1_id = content[1]
                node2_id = content[3]
                node1_lbl = content[4]
                node2_lbl = content[5]
                cskg_index_dict[node1_lbl] = cskg_index_dict.get(node1_lbl,set())
                cskg_index_dict[node1_lbl].add(node1_id)
                cskg_index_dict[node2_lbl] = cskg_index_dict.get(node2_lbl,set())
                cskg_index_dict[node2_lbl].add(node2_id)
    
    # convert set to list
    for k in cskg_index_dict:
        cskg_index_dict[k] = list(cskg_index_dict[k])

    return cskg_index_dict

def load_ent_embeddings(input_file):
    # input file folder path :/nas/home/binzhang/backup_data/embeddings 
    #  create a dict to store cskg embeddings   node: embedding example: '/c/en/turtle': [0.01,0.02....]

    ix_node_dict = {} # { node_index: node_name,... node_name:node_index... }
    node_embedding_dict = {} # {node_name:embedding, ....}
    with gzip.open(input_file,'rt') as f:
        for index,line in enumerate(f):
            line = line.split('\t')
            entity_name = line[0]
            entity_vec =  [ float(i) for i in line[1:]]
            ix_node_dict[entity_name] = index
            ix_node_dict[index] = entity_name
            node_embedding_dict[entity_name] = entity_vec
    
    return ix_node_dict,node_embedding_dict

#### Process data

- cal_avg_embeddings(node_embedding_dict,cskg_index_dict): calculate entity's average embedding
-  build_fassi_index(avg_embeddings): build a fassi index for embedding matrix and a label dictionary for each entity label
- create_queryset(ground_truth_dict,label_dict,avg_embeddings): build query set for fassi index
- neighbor_searching: find neighbors of query set
- evaluate_ranking: compare traing set(cskg) and ground truth(USF-FAN) to get MAP

In [5]:
def cal_avg_embeddings(node_embedding_dict,cskg_index_dict):
    # node_embedding_dict's key is node's name (example: '/c/en/joke') and the value is the embedding vectors
    # example: '/c/en/turtle': [0.01,0.02...]

    # cskg_index_dict's key is the label for a node , value is a list recording the node's name
    # example : joke: ['/c/en/joke', '/c/en/joke/n', '/c/en/joke/n/wn/act',...]
    avg_embeddings = {}
    for label in cskg_index_dict:
        entity_names = cskg_index_dict[label]
        size = len(entity_names)
        sum_embedding =  node_embedding_dict[entity_names[0]]
        for entity in entity_names[1:]:
            embedding = node_embedding_dict[entity] # embeddings list 
            sum_embedding = list(map(lambda x,y : x+y ,sum_embedding,embedding))
            
        avg_emb = [i/size for i in sum_embedding]
        avg_embeddings[label] = avg_emb
        
    return avg_embeddings

def build_fassi_index(avg_embeddings):
    # avg_embeddings is a dictionary which key is the node label and value is lable's embedding
    
    label_dict = {}         # build a entity label-index bi dictionary
    entity_embeddings = []  # all the embeddings 

    index = 0
    for key,value in avg_embeddings.items():
        label_dict[index] = key
        label_dict[key] = index
        index+=1    
        entity_embeddings.append(value)

    # entity_embeddings => matrix  X contains  all labels' embeddings 
    X = np.array(entity_embeddings).astype(np.float32) # float32
    dimension = X.shape[1]

    # build index (METRIC_INNER_PRODUCT => cos )
    vec_index = faiss.index_factory(dimension, "Flat", faiss.METRIC_INNER_PRODUCT)
    # # normalize all vectors in order to get cos sim 
    faiss.normalize_L2(X)  
    # add vectors to inde 
    vec_index.add(X) 
    
    return vec_index,label_dict

def create_queryset(ground_truth_dict,label_dict,avg_embeddings):
    query_ent_vecs = []
    macth_concept = 0
    miss_concept = 0
    miss_concept_list = []
    for key in ground_truth_dict:
        if key.lower() in label_dict:
            macth_concept+=1
            query_ent_vecs.append(avg_embeddings[key.lower()])
        else:
            miss_concept_list.append(key)
            miss_concept+=1
    # print(f'match label num from cskg and USF-FAN: {macth_concept}')
    # print(f'miss label num from cskg and USF-FAN: {miss_concept}, they are {miss_concept_list}')
    
    query_ent_mat = np.array(query_ent_vecs).astype(np.float32)
    faiss.normalize_L2(query_ent_mat) 
    
    return query_ent_mat

def neighbor_searching(vec_index,query_ent_mat,label_dict,topk):
    cos_sim, index = vec_index.search(query_ent_mat, topk) # both of them are matrices
    neighbors_dict = {}
    for neighbors in index:
        tmp_list = []
        for ix in neighbors:
            tmp_list.append(label_dict[ix].upper())   # ix refers to the label's index 
        
        neighbors_dict[tmp_list[0]] = tmp_list[1:]
                
    return neighbors_dict  

def evaluate_ranking(test_dict,grouding_dict):
    MAP = 0
    set_size = len(test_dict)
    
    for label in test_dict:
        values = test_dict[label]
        ground_values = grouding_dict[label]
        item_size = len(values)
        correct_preditions = 0
        running_sum = 0
        
        for k in range(0,item_size):
            if values[k] in ground_values:
                correct_preditions +=1
                running_sum+=correct_preditions/(k+1)
        if correct_preditions ==0 : # no one match:
            AP =0
        else:
            AP = running_sum/correct_preditions
        MAP+=AP
    
    return MAP/set_size


### Procedures
1. prepares  USF-FAN dataset into dictionary  result: ground_truth_dict
2. construct an index of CSKG from label to node id  result: cskg_index_dict
3. get all possible possible embeddings from embedding folder
4. For each possible embedding:
    - obtain the embeddings for all entity.
    - compute an average embedding.
    - create a fassi vector index.
    - create query sets based on grounding truth(USF-FAN).
    - set topk and do calculate MAP.

In [10]:
# prepares  USF-FAN dataset into dictionary  result: ground_truth_dict
ground_truth_dict = xml_load(cue_target)

# construct an index of CSKG from label to node id  result: cskg_index_dict
cskg_index_dict = create_cskg_index(cskg_connected)

# get all possible possible embeddings 
gz_list = get_file_path(embedding_folder)


###  Here we take an embedding gz file as an example, do MAP calculation
print(f'using embedding gz file: {gz_list[0]}')

# obtain the embeddings for all concepts 
ix_node_dict,node_embedding_dict = load_ent_embeddings(gz_list[0])
# compute an average embedding. 
avg_embeddings = cal_avg_embeddings(node_embedding_dict,cskg_index_dict)

# fassi: create vector index
vec_index,label_dict= build_fassi_index(avg_embeddings)

# create query sets based on grounding truth(USF-FAN)
query_ent_mat = create_queryset(ground_truth_dict,label_dict,avg_embeddings)

topk = 11
neighbors_dict = neighbor_searching(vec_index,query_ent_mat,label_dict,topk)
print(f"neigbors for 'A' on cskg: {neighbors_dict['A']}")
print(f"neighbors for 'A' on USF-FAN: {ground_truth_dict['A']}")      

MAP = evaluate_ranking(neighbors_dict,ground_truth_dict)
MAPs.append(MAP)

print(f'MAP: {MAP}')

using embedding gz file: ../output/embeddings/trans_rank_dot_0.1.tsv.gz
neigbors for 'A' on cskg: ['MONO', 'THROW', 'LOW', 'FOIL', 'BULL', 'HOST', 'SECOND', 'FIRST', 'BORN', 'AB']
neighbors for 'A' on USF-FAN: ['B', 'ALPHABET', 'THE', 'GRADE', 'LETTER', 'PLUS', 'AN', 'Z', 'AND', 'APPLE', 'GREAT', 'ONE', 'WORD']
MAP: 0.09796898266958144


In [12]:
# Here if you want to get all possible MAPs, then you can declare an MAP list and make a for loop
MAPs  = []
for ent_embedding_path in gz_list:
    # obtain the embeddings for all concepts 
    ix_node_dict,node_embedding_dict = load_ent_embeddings(ent_embedding_path)
    # compute an average embedding. 
    avg_embeddings = cal_avg_embeddings(node_embedding_dict,cskg_index_dict)

    # fassi: create vector index
    vec_index,label_dict= build_fassi_index(avg_embeddings)

    # create query sets based on grounding truth(USF-FAN)
    query_ent_mat = create_queryset(ground_truth_dict,label_dict,avg_embeddings)

    topk = 11
    neighbors_dict = neighbor_searching(vec_index,query_ent_mat,label_dict,topk)

    print(f"neigbors for 'A' on cskg: {neighbors_dict['A']}")
    print(f"neighbors for 'A' on USF-FAN: {ground_truth_dict['A']}")      

    MAP = evaluate_ranking(neighbors_dict,ground_truth_dict)
    MAPs.append(MAP)

    print(f'embedding gz file: {ent_embedding_path}')
    print(f'MAP: {MAP}')
    print('='*100)

neigbors for 'A' on cskg: ['MONO', 'THROW', 'LOW', 'FOIL', 'BULL', 'HOST', 'SECOND', 'FIRST', 'BORN', 'AB']
neighbors for 'A' on USF-FAN: ['B', 'ALPHABET', 'THE', 'GRADE', 'LETTER', 'PLUS', 'AN', 'Z', 'AND', 'APPLE', 'GREAT', 'ONE', 'WORD']
embedding gz file: ../output/embeddings/trans_rank_dot_0.1.tsv.gz
MAP: 0.09796898266958144
neigbors for 'A' on cskg: ['ON', 'CORE', 'PORT', 'C', 'AS', 'PROPER TRAINING', 'PAP', 'DEFENCE', 'PUT OUT', 'SUPER']
neighbors for 'A' on USF-FAN: ['B', 'ALPHABET', 'THE', 'GRADE', 'LETTER', 'PLUS', 'AN', 'Z', 'AND', 'APPLE', 'GREAT', 'ONE', 'WORD']
embedding gz file: ../output/embeddings/comp_rank_dot_0.1.tsv.gz
MAP: 0.06212578810632696
neigbors for 'A' on cskg: ['AS', 'X', 'RAY', 'HACKEE', 'SPUR', 'S', 'SUPER', 'JNA', 'BATON', 'JAMES WILSON']
neighbors for 'A' on USF-FAN: ['B', 'ALPHABET', 'THE', 'GRADE', 'LETTER', 'PLUS', 'AN', 'Z', 'AND', 'APPLE', 'GREAT', 'ONE', 'WORD']
embedding gz file: ../output/embeddings/comp_soft_dot_0.05.tsv.gz
MAP: 0.2093668219117

In [16]:
# write result to json file
result = list(zip(gz_list,MAPs))  # [(gz_file,MAP),(...)]
res_to_json(result,MAP_res) 

### best performance

neigbors for 'A' on cskg: ['S', 'MORE', 'C', 'BINGO', 'ACE', 'MOD', 'MONO', 'E', 'REAL', 'SUPER']

neighbors for 'A' on USF-FAN: ['B', 'ALPHABET', 'THE', 'GRADE', 'LETTER', 'PLUS', 'AN', 'Z', 'AND', 'APPLE', 'GREAT', 'ONE', 'WORD']

embedding gz file: /nas/home/binzhang/backup_data/embeddings/trans_log_dot_0.1.tsv.gz

MAP: 0.2507032495854852