In [111]:
# Project-local helper module; every `ea.*` call in this notebook comes from it.
import entity_analysis as ea
import importlib
# Reload so that edits made to entity_analysis.py are picked up without restarting the kernel.
importlib.reload(ea)

<module 'entity_analysis' from '/data/cskg/examples/entity_analysis.py'>

## Parameters for invoking the notebook

- `cue_target`: file path of cue-target.xml (contains the ground truth of the USF-FAN dataset)
- `cskg_connected`: file path of cskg_connected.tsv (contains the raw cskg edge information)
- `bert_embs`: file path of bert-nli-large-embeddings.tsv.gz (contains the text embeddings for nodes)
- `kgtk_embs`: file path of trans_log_dot_0.1.tsv.gz (contains the graph embeddings for nodes)

In [2]:
# Default input locations (relative paths); override them when invoking the notebook.
cue_target = "../input/cue-target.xml"  # USF-FAN cue/target ground truth
cskg_connected = "../input/cskg_connected.tsv"  # raw CSKG edge information
bert_embs = "../input/bert-nli-large-embeddings.tsv.gz"  # text embeddings of the nodes
kgtk_embs = "../input/trans_log_dot_0.1.tsv.gz"  # graph embeddings of the nodes

## Load Datasets that we need 

- `USF_FAN_dict` : A dictionary whose key is a cue's label, value is a list containing cue's similar targets in decreasing order of similarity <br>e.g. 'turtle': ['slow','shell','tortoise','animal',...]
- `CSKG_label_dict` : A dictionary whose key is a node label, value is the list of IDs of all nodes that carry this label. <br> e.g. 'turtle': ['Q1705322', '/c/en/turtle', ...]
- `CSKG_inv_dict` : An inverted index dictionary that maps each node ID to its label. The key is the node ID, the value is the label of that node <br> e.g. 'Q1705322':'turtle', '/c/en/turtle':'turtle', 'Q997698':'book'
- `ground_truth`: A dictionary whose keys are the cues that appear both in USF_FAN and in CSKG; each value is the same target list as in `USF_FAN_dict`. It is used as the gold list.

In [3]:
# Data shared by the graph-embedding analysis and the text-embedding analysis.
USF_FAN_dict = ea.xml_load(cue_target)
CSKG_label_dict, CSKG_inv_dict = ea.cskg_load(cskg_connected)
ground_truth = ea.get_ground_truth(USF_FAN_dict, CSKG_label_dict)

# Spot-check one entry of each structure, then compare the two dictionary sizes.
print(f"Targets for 'turtle' in USF_FAN: {USF_FAN_dict['turtle'][:5]}...\n")
print(f"Nodes with the label 'turtle': {CSKG_label_dict['turtle'][:5]}...\n")
print(f"Label for node 'Q32945370': {CSKG_inv_dict['Q32945370']}\n")
print(f"Length of ground_truth: {len(ground_truth)}, length of USF_FAN_dict: {len(USF_FAN_dict)}")

Targets for 'turtle' in USF_FAN: ['slow', 'shell', 'tortoise', 'animal', 'green']...

Nodes with the label 'turtle': ['/c/en/turtle/n/wn/animal', '/c/en/turtle/n/wikt/en_1', '/c/en/turtle/n/wn/artifact', '/c/en/turtle/n/wp/entourage', '/c/en/turtleneck/n/wn/artifact']...

Label for node 'Q32945370': turtle

Length of ground_truth: 5011, length of USF_FAN_dict: 5018


## With Regard to Graph Embedding Analysis

- `graph_node_emb`: A dictionary whose key is the Node ID, value is the graph embeddings for such node.
- `graph_label_emb`: A dictionary whose key is the Node label , value is the average graph embeddings for such node.
- `graph_index`: A faiss index keeps the index for the graph label embeddings.
- `graph_label_ix`:  A dictionary whose key is the graph_index's number, value is the label. This dictionary is aimed at recording each label's order for future mapping.
- `graph_query_dict`: A dictionary whose keys are the labels present in both ground_truth and CSKG; each value is the graph embedding of that label, used as the query vector
- `neighbor_dict`:   A dictionary whose key is a label in CSKG, value is a list containing the label's similar targets in decreasing order of cosine similarity, each item in the list is a tuple, first item is the similar target, and second one is the similarity to the label.<br>
example: {'a': [('s', 0.9048489),('more', 0.88388747),('c', 0.8800387)...]...}
- `pred_dict`: A dictionary with the same keys as ground_truth; each value is the list of neighbors returned by the faiss search on the label embeddings

### load graph embedding for each node id

In [4]:
%%time
## Load the graph embedding of every node ID from the KGTK embedding file.
graph_node_emb = ea.graph_emb_load(kgtk_embs) 
# Show the first five dimensions of one node as a sanity check.
print(f"Graph embeddings for node 'Q32945370': {graph_node_emb['Q32945370'][:5]}...\n")

Graph embeddings for node 'Q32945370': [0.254152298, -0.446585357, 0.152848288, 0.144540176, 0.129683152]...

CPU times: user 1min 30s, sys: 4.24 s, total: 1min 34s
Wall time: 1min 34s


### get the embedding for each label

In [5]:
%%time
# A label can be shared by several nodes, so each label is represented by
# the average of its nodes' graph embeddings.
graph_label_emb = ea.get_label_emb(graph_node_emb, CSKG_label_dict)
print(f"Graph embeddings for lable 'turtle': {graph_label_emb['turtle'][:5]}...\n")

Graph embeddings for lable 'turtle': [0.15604649623076922, -0.14148455738461538, -0.0034780889230769056, -0.17627298076923076, 0.1646368569230769]...

CPU times: user 32.4 s, sys: 2.73 s, total: 35.2 s
Wall time: 35.1 s


### build a faiss index for graph embeddings

In [6]:
%%time
# Index the per-label graph embeddings with faiss; graph_label_ix maps an
# index position back to the label stored at that position.
graph_index, graph_label_ix = ea.build_index(graph_label_emb)
print(f"The index 10000 points to the label : {graph_label_ix[10000]}\n")

The index 10000 points to the label : wee

CPU times: user 20.4 s, sys: 1.24 s, total: 21.7 s
Wall time: 8.31 s


### create query set

In [7]:
%%time
# Build the query set from the ground-truth cues: one graph-embedding query per cue label.
graph_query_dict = ea.create_queryset(ground_truth, CSKG_label_dict, graph_label_emb)
print(f" Graph embddings for query label 'black',\n {graph_query_dict['black']}\n{graph_query_dict['black'].shape}\n")

 Graph embddings for query label 'black',
 [[ 0.17928712 -0.13591672  0.01267633  0.05109588  0.12160839  0.13548541
  -0.05430475 -0.19000961  0.08617514  0.0871693   0.04200997 -0.0019516
   0.01005572 -0.05972502  0.11564244  0.05589715  0.0324163   0.01095609
  -0.11914153  0.09515327 -0.11379138  0.02665885 -0.03155296 -0.06613164
  -0.07865904  0.14886895 -0.00828494  0.07173332 -0.08187395 -0.02142016
  -0.00032857  0.07197008 -0.14334884 -0.03807215  0.01257057 -0.09659239
  -0.0765395   0.13656841 -0.12027948  0.05086154 -0.13277571 -0.01514891
   0.14886029  0.04981698  0.15932456 -0.19051534  0.09474476 -0.06964613
  -0.14163026  0.12131403 -0.00805583 -0.06236116 -0.02707643 -0.17910491
   0.00504959  0.0686815  -0.11555821  0.00990293  0.06747081 -0.07370351
  -0.07396634 -0.07172409 -0.04467475 -0.02662679  0.12883261  0.17289415
   0.08157898 -0.16289166 -0.0800919   0.10152796  0.1769218  -0.0878875
  -0.11570306 -0.02765672  0.15355319  0.00549699  0.07113624  0.150103

### A simple example of neighbor searching on CSKG labels

In [8]:
# Example: the 5 nearest labels of 'person', first excluding and then
# including the query label itself.
tmp1 = ea.get_label_neighbor(graph_query_dict['person'], graph_index, graph_label_ix, 5, include=False)
tmp2 = ea.get_label_neighbor(graph_query_dict['person'], graph_index, graph_label_ix, 5, include=True)
print(f"Searching result for label 'person'(not include itself): {tmp1}")
print(f"Searching result for label 'person'(include itself):     {tmp2}")

Searching result for label 'person'(not include itself): [('man', 0.98969), ('boy', 0.98480916), ('girl', 0.9805683), ('people', 0.969992), ('black', 0.96583045)]
Searching result for label 'person'(include itself):     [('person', 1.0), ('man', 0.98969), ('boy', 0.98480916), ('girl', 0.9805683), ('people', 0.969992)]


### Neighbor Searching for all ground truth's labels

In [11]:
# Search neighbours for every ground-truth cue. Per the "@X" convention used in
# this notebook, each cue gets as many neighbours as it has ground-truth targets.
# NOTE(review): the final argument (1) presumably scales that count -- confirm
# in ea.neighbor_search.
neighbor_dict = ea.neighbor_search(graph_query_dict, ground_truth, graph_index, graph_label_ix, 1)
# Keep only the neighbour labels (drop the similarity scores) for the metric functions.
pred_dict = ea.get_pred_dict(neighbor_dict)

100%|███████████████████████████████████████| 5011/5011 [07:00<00:00, 11.93it/s]


In [15]:
# Compare the gold targets of 'turtle' with the labels retrieved from the graph embeddings.
print(f"Searching result for label 'turtle'on ground turth:   {ground_truth['turtle'][:5]}...")
print(f"Searching result for label 'turtle'on CSKG:           {pred_dict['turtle'][:5]}...")

print()
# neighbor_dict keeps (label, similarity) tuples, whereas pred_dict keeps the labels only.
print(f"The format for neighbor_dict(use an key as an example) neighbor_dict['turtle']:{neighbor_dict['turtle'][:5]}...")

Searching result for label 'turtle'on ground turth:   ['slow', 'shell', 'tortoise', 'animal', 'green']...
Searching result for label 'turtle'on CSKG:           ['skeleton', 'rock', 'style', 'frog', 'channel']...

The format for neighbor_dict(use an key as an example) neighbor_dict['turtle']:[('skeleton', 0.84700084), ('rock', 0.84658915), ('style', 0.8444997), ('frog', 0.8442888), ('channel', 0.842693)]...


### Evaluations based on ground truth and CSKG results

In [17]:
# Score the current pred_dict against the ground truth: Hits (micro and macro), MAP and MRR.
hit_micro = ea.cal_hits(ground_truth, pred_dict, level='micro')
hit_macro = ea.cal_hits(ground_truth, pred_dict, level='macro')
MAP = ea.cal_map(ground_truth, pred_dict)
MRR = ea.cal_mrr(ground_truth, pred_dict)

print(f"hit_micro@X: {hit_micro}")
print(f"hit_macro@X: {hit_macro}")
print(f"MAP@X: {MAP}")
print(f"MRR: {MRR}")

hit_micro@X: 0.0729793935344536
hit_macro@X: 0.06919887395820333
MAP@X: 0.2418847995261804
MRR: 0.07591513776394637


In [100]:
# Retrieve a much longer neighbour list (last argument 50 instead of 1) so that
# as many similar targets as possible are available for the NDCG calculation.
# idea1: assume the faiss result can include all of the ground truth.
neighbor_dict = ea.neighbor_search(graph_query_dict, ground_truth, graph_index, graph_label_ix, 50)
pred_dict = ea.get_pred_dict(neighbor_dict)

# idea2: derive the relevance from embedding similarity -- rejected, because the NDCG would then always be 1.

100%|███████████████████████████████████████| 5011/5011 [07:05<00:00, 11.79it/s]


In [109]:
# NDCG of whichever pred_dict is currently in the kernel; run the 50x search
# cell directly before this one so the score refers to the enlarged lists.
NDCG = ea.cal_ndcg(ground_truth, pred_dict)
print(f"NDCG: {NDCG}")

NDCG: 0.8655184738071138


### Use different X to do neighbors searching 

In [None]:
# Sweep over several search sizes (kept commented out: one search pass over all cues takes minutes).
# pred_dict is rebuilt from neighbor_dict on every iteration so that the metric
# functions receive plain label lists, as in the live evaluation cells.
# for X in [1,2,3,5,10]:
#     neighbor_dict = ea.neighbor_search(graph_query_dict,ground_truth,graph_index,graph_label_ix,X)
#     pred_dict = ea.get_pred_dict(neighbor_dict)
#     hit_micro = ea.cal_hits(ground_truth,pred_dict,level='micro')
#     hit_macro = ea.cal_hits(ground_truth,pred_dict,level='macro')
#     MAP = ea.cal_map(ground_truth,pred_dict)
#     MRR = ea.cal_mrr(ground_truth,pred_dict)
#     print(f"hit_micro@{X}X: {hit_micro}")
#     print(f"hit_macro@{X}X: {hit_macro}")
#     print(f"MAP@{X}X: {MAP}")
#     print(f"MRR: {MRR}")

## With Regard to Text Embedding Analysis

- `text_node_emb`: A dictionary whose key is the Node ID, value is the text embeddings for such node.
- `text_label_emb`: A dictionary whose key is the Node label , value is the average text embeddings for such node.
- `text_index`: A faiss index keeps the index for the text label embeddings.
- `text_label_ix`:  A dictionary whose key is the text_index's number, value is the label. This dictionary is aimed at recording each label's order for future mapping.
- `text_query_dict`: A dictionary whose keys are the labels present in both ground_truth and CSKG; each value is the text embedding of that label, used as the query vector
- `neighbor_dict`:   A dictionary whose key is a label in CSKG, value is a list containing the label's similar targets in decreasing order of cosine similarity, each item in the list is a tuple, first item is the similar target, and second one is the similarity to the label.<br>
example: {'a': [('s', 0.9048489),('more', 0.88388747),('c', 0.8800387)...]...}
- `pred_dict`: A dictionary with the same keys as ground_truth; each value is the list of neighbors returned by the faiss search on the label embeddings

### load text embedding for each node id

In [112]:
## Load the text embedding of every node ID from the BERT embedding file.
text_node_emb = ea.txt_emb_load(bert_embs)

100%|███████████████████████████████| 2161048/2161048 [16:20<00:00, 2203.23it/s]


### get the embedding for each label

In [113]:
%%time
# A label can be shared by several nodes, so each label is represented by
# the average of its nodes' text embeddings.
text_label_emb = ea.get_label_emb(text_node_emb, CSKG_label_dict)

CPU times: user 4min 49s, sys: 1min 7s, total: 5min 56s
Wall time: 5min 55s


### build a faiss index for text embeddings

In [115]:
%%time
# Index the per-label text embeddings with faiss; text_label_ix maps an
# index position back to the label stored at that position.
text_index, text_label_ix = ea.build_index(text_label_emb)

CPU times: user 2min 7s, sys: 31.8 s, total: 2min 39s
Wall time: 2min 13s


### create query set

In [116]:
%%time
# Build the query set from the ground-truth cues: one text-embedding query per cue label.
text_query_dict = ea.create_queryset(ground_truth, CSKG_label_dict, text_label_emb)

CPU times: user 1min 3s, sys: 2.69 s, total: 1min 6s
Wall time: 945 ms


### A simple example of neighbor searching on CSKG labels

In [117]:
# Example: the 5 nearest labels of 'person' in the text-embedding index,
# first excluding and then including the query label itself.
tmp1 = ea.get_label_neighbor(text_query_dict['person'], text_index, text_label_ix, 5, include=False)
tmp2 = ea.get_label_neighbor(text_query_dict['person'], text_index, text_label_ix, 5, include=True)
print(f"Searching result for label 'person'(not include itself): {tmp1}")
print(f"Searching result for label 'person'(include itself):     {tmp2}")

Searching result for label 'person'(not include itself): [('man', 0.9726254), ('men', 0.96999484), ('boy', 0.96972156), ('board', 0.9528501), ('area', 0.95093006)]
Searching result for label 'person'(include itself):     [('person', 1.0000002), ('man', 0.9726254), ('men', 0.96999484), ('boy', 0.96972156), ('board', 0.9528501)]


### Neighbor Searching for all ground truth's labels

In [118]:
# Search neighbours for every ground-truth cue in the text-embedding index. Per
# the "@X" convention used in this notebook, each cue gets as many neighbours as
# it has ground-truth targets.
# NOTE(review): the final argument (1) presumably scales that count -- confirm
# in ea.neighbor_search.
neighbor_dict = ea.neighbor_search(text_query_dict, ground_truth, text_index, text_label_ix, 1)
# Keep only the neighbour labels (drop the similarity scores) for the metric functions.
pred_dict = ea.get_pred_dict(neighbor_dict)

100%|█████████████████████████████████████| 5011/5011 [1:05:53<00:00,  1.27it/s]


In [119]:
# Compare the gold targets of 'turtle' with the labels retrieved from the text embeddings.
print(f"Searching result for label 'turtle'on ground turth:   {ground_truth['turtle'][:5]}...")
print(f"Searching result for label 'turtle'on CSKG:           {pred_dict['turtle'][:5]}...")

print()
# neighbor_dict keeps (label, similarity) tuples, whereas pred_dict keeps the labels only.
print(f"The format for neighbor_dict(use an key as an example) neighbor_dict['turtle']:{neighbor_dict['turtle'][:5]}...")

Searching result for label 'turtle'on ground turth:   ['slow', 'shell', 'tortoise', 'animal', 'green']...
Searching result for label 'turtle'on CSKG:           ['spiny turtle', 'box turtle', 'spotted turtle', 'painted turtle', 'lute turtle']...

The format for neighbor_dict(use an key as an example) neighbor_dict['turtle']:[('spiny turtle', 0.95888114), ('box turtle', 0.9561493), ('spotted turtle', 0.9481943), ('painted turtle', 0.9479501), ('lute turtle', 0.946333)]...


### Evaluations based on ground truth and CSKG results

In [120]:
# Score the current pred_dict against the ground truth: Hits (micro and macro), MAP and MRR.
hit_micro = ea.cal_hits(ground_truth, pred_dict, level='micro')
hit_macro = ea.cal_hits(ground_truth, pred_dict, level='macro')
MAP = ea.cal_map(ground_truth, pred_dict)
MRR = ea.cal_mrr(ground_truth, pred_dict)

print(f"hit_micro@X: {hit_micro}")
print(f"hit_macro@X: {hit_macro}")
print(f"MAP@X: {MAP}")
print(f"MRR: {MRR}")

hit_micro@X: 0.02748779102733233
hit_macro@X: 0.0272773917987547
MAP@X: 0.09015168588218885
MRR: 0.021420157065854165


In [121]:
# Retrieve a much longer neighbour list (last argument 50 instead of 1) so that
# as many similar targets as possible are available for the NDCG calculation.
# idea1: assume the faiss result can include all of the ground truth.
neighbor_dict = ea.neighbor_search(text_query_dict, ground_truth, text_index, text_label_ix, 50)
pred_dict = ea.get_pred_dict(neighbor_dict)

NDCG = ea.cal_ndcg(ground_truth, pred_dict)
print(f"NDCG: {NDCG}")

100%|█████████████████████████████████████| 5011/5011 [1:01:16<00:00,  1.36it/s]


NDCG: 0.8629002252951639


### Use different X to do neighbors searching

In [None]:
# Sweep over several search sizes (kept commented out: one search pass over all cues takes about an hour).
# pred_dict must be rebuilt from neighbor_dict on every iteration; otherwise each
# X would be scored against the same stale pred_dict left over from an earlier cell.
# for X in [1,2,3,5,10]:
#     neighbor_dict = ea.neighbor_search(text_query_dict,ground_truth,text_index,text_label_ix,X)
#     pred_dict = ea.get_pred_dict(neighbor_dict)
#     hit_micro = ea.cal_hits(ground_truth,pred_dict,level='micro')
#     hit_macro = ea.cal_hits(ground_truth,pred_dict,level='macro')
#     MAP = ea.cal_map(ground_truth,pred_dict)
#     MRR = ea.cal_mrr(ground_truth,pred_dict)
#     print(f"hit_micro@{X}X: {hit_micro}")
#     print(f"hit_macro@{X}X: {hit_macro}")
#     print(f"MAP@{X}X: {MAP}")
#     print(f"MRR: {MRR}")

## Testing Evaluation metrics
- MAP will inform us how many of the top candidates are good. 
- Hits@X, Hits@2X, etc. will inform us about the number of good results in flexible number of top results
- MRR will inform us about the ranking by the system for the top answer
- NDCG will inform us about the relative ranking of the positive results between the system and the ground truth.

In [None]:
# Toy gold lists and two sets of toy predictions (a short and a longer one)
# for exercising the metric functions by hand.
ground_truth1 = {'good': ['bad', 'well', 'better'], 'book': ['read', 'paper']}
neighbor_dict1 = {'good': ['hello', 'bad', 'well'], 'book': ['paper', 'fine']}
neighbor_dict2 = {
    'good': ['hello', 'bad', 'well', 'fine', 'kid', 'better'],
    'book': ['paper', 'fine', 'pen', 'magazin'],
}

# Score each prediction set in turn; every iteration prints one line: hits, MRR, MAP.
for toy_pred in (neighbor_dict1, neighbor_dict2):
    hit_ratio = ea.cal_hits(ground_truth1, toy_pred, level='micro')
    mrr = ea.cal_mrr(ground_truth1, toy_pred)
    map_ = ea.cal_map(ground_truth1, toy_pred)
    print(hit_ratio, mrr, map_)