## Introduction
Relation prediction on WebChild property data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
import random, os, time, faiss, torch

In [None]:
from util import *

## Setup directories

In [None]:
# Input file: the WebChild comparative edge file read by load_file below
kgtk_webchild = "./data/wc/kgtk_webchild_comparative.tsv"

# Output file paths (entity/relation vocabularies and the train/dev/test splits)
wc_gold_file = "./data/wc/wc_gold_500k.tsv"
wc_entity_file = "./data/wc/entities.txt"
wc_entity2id_file = "./data/wc/entity2id.txt"
wc_entity2text_file = "./data/wc/entity2text.txt"
wc_entity2textlong_file = "./data/wc/entity2textlong.txt"
wc_relation_file = "./data/wc/relations.txt"
wc_relation2id_file = "./data/wc/relation2id.txt"
wc_relation2text_file = "./data/wc/relation2text.txt"
wc_train_500k = "./data/wc/train.tsv"
wc_dev_500k = "./data/wc/dev.tsv"
wc_test_500k = "./data/wc/test.tsv"
wc_train2id_500k = "./data/wc/train2id.txt"
wc_dev2id_500k = "./data/wc/valid2id.txt"
wc_test2id_500k = "./data/wc/test2id.txt"

## Data Preparation

In [None]:
def generate_gold_file(lines):
    """Build the gold rows used for relation classification.

    Each input row is read positionally: columns 0-2 are the node1 id,
    relation id and node2 id, columns 3-5 are the node1 labels, node2
    labels and relation label, and column 8 is the sentence.

    Returns a list of
    ``[node1_label, relation_label, node2_label, node1_id, node2_id,
    relation_id, sentence]`` rows, where the node labels are whatever
    ``multiple_labels`` returns for the (labels, id) pair.
    """
    gold_rows = []
    for count, row in enumerate(lines, start=1):
        subject_id, predicate_id, object_id = row[0], row[1], row[2]
        subject_labels, object_labels, predicate_label = row[3], row[4], row[5]

        # The sentence is stored with square brackets; drop them.
        sentence = row[8].replace("[", "").replace("]", "")

        # NOTE(review): the original comment says multiple_labels picks a
        # label using Levenshtein distance -- the helper is not visible here.
        gold_rows.append([
            multiple_labels(subject_labels, subject_id),
            predicate_label,
            multiple_labels(object_labels, object_id),
            subject_id,
            object_id,
            predicate_id,
            sentence,
        ])

        # Progress report on the 1st, 10001st, 20001st, ... row.
        if count % 10000 == 1:
            print(f"\r {count}/{len(lines)}", end="")

    return gold_rows
        
def write_split_file(filename1, fielename2, lines, entity2detail, relation2detail):
    """Write one data split as a triple file and as an id file.

    ``filename1`` receives one tab-separated ``entity1, relation, entity2``
    row per triple. ``fielename2`` receives a first row holding the number
    of triples, followed by one ``entity1_id, entity2_id, relation_id`` row
    per triple (note the order: both entity ids come before the relation id).

    ``entity2detail`` and ``relation2detail`` map a key to a 3-item
    sequence whose first item is the numeric id.
    """
    with open(filename1, "w", newline='') as triples_out:
        with open(fielename2, "w", newline='') as ids_out:
            triple_writer = csv.writer(triples_out, delimiter='\t')
            id_writer = csv.writer(ids_out, delimiter='\t')

            # Header of the id file: the number of triples that follow.
            id_writer.writerow([len(lines)])

            for head, predicate, tail in lines:
                head_idx, _head_text, _head_long = entity2detail[head]
                tail_idx, _tail_text, _tail_long = entity2detail[tail]
                predicate_idx, _pred_text, _pred_long = relation2detail[predicate]

                triple_writer.writerow([head, predicate, tail])
                id_writer.writerow([head_idx, tail_idx, predicate_idx])

In [None]:
# Load the WebChild edge file with load_file (a helper not defined in this
# notebook), decoding it as latin1.
# Presumably `head` is the header row and `lines` the list of data rows -- verify against util.
head, lines = load_file(kgtk_webchild,encoding="latin1")

In [None]:
# Display the first loaded row as an example of the raw triple layout
lines[0]

**Data Description**

In [None]:
# Drop triples that have no predicate and collect corpus statistics:
# number of relations, unique nodes, total edges, relation frequencies
# and the most frequent node / relation.

valid_lines = []
invalid_lines_count = 0
unique_nodes = {}      # node id -> number of edges touching it
relations_dict = {}    # relation id -> number of edges using it

node2text = {}         # node id -> label column of the last row seen
relation2text = {}     # relation id -> label column of the last row seen

for row in lines:
    subject_id, predicate_id, object_id = row[0], row[1], row[2]
    subject_label, object_label, predicate_label = row[3], row[4], row[5]

    # A row without a relation id is not a usable triple.
    if not predicate_id:
        invalid_lines_count += 1
        continue

    valid_lines.append(row)

    # Count both endpoints (subject first, so insertion order is kept)
    # and remember their labels.
    for node_id, node_label in ((subject_id, subject_label), (object_id, object_label)):
        unique_nodes[node_id] = unique_nodes.get(node_id, 0) + 1
        node2text[node_id] = node_label

    relations_dict[predicate_id] = relations_dict.get(predicate_id, 0) + 1
    relation2text[predicate_id] = predicate_label

# Summary
print(f"The number of triples without relations: {invalid_lines_count}")
print(f"The number of triples with relation: {len(valid_lines)}")
print(f"The number of unique nodes: {len(unique_nodes)}")
print(f"The number of unique relations: {len(relations_dict)}")
max_fre_nodes = max(unique_nodes, key=unique_nodes.get)
print(f"The most frequent node: {max_fre_nodes}")
max_fre_relation = max(relations_dict, key=relations_dict.get)
print(f"The most frequent relation: {max_fre_relation}")

In [None]:
# Histogram over relations: x = edges per relation, y = number of relations (log scale)
ax = plt.gca()
ax.hist(relations_dict.values(), bins=20, log=True)
ax.set_title('Frequency Distribution')
ax.set_xlabel('Number of Edges Having One Specified Relation')
ax.set_ylabel('Number of Relations')
plt.show()

In [None]:
# For every edge, record how many edges share its relation.
# Iterate over valid_lines rather than lines: relations_dict only contains
# relations of rows that passed the "has a relation" filter, so a row with
# an empty relation id would raise KeyError here.
relation2fre = [relations_dict[line[1]] for line in valid_lines]

In [None]:
# Histogram over edges: each edge contributes the frequency of its relation
ax = plt.gca()
ax.hist(relation2fre, bins=20)
ax.set_title('Frequency Distribution')
ax.set_xlabel('Number of Edges Having One Specified Relation')
ax.set_ylabel('Number of Edges for the Relations Having Same Number of Edges')
plt.show()

In [None]:
# Randomly select up to 500k distinct rows from the valid triples.
# random.sample draws WITHOUT replacement; random.choices would return
# duplicate rows, and a duplicated triple can end up in both the train and
# the test split after shuffling.
# The variable name is kept as-is even though it holds 500k rows, not 10k.
sample_size = min(500000, len(valid_lines))
valid_lines_10k = random.sample(valid_lines, k=sample_size)

# Generate the gold rows for the sampled triples
wc_gold_500k = generate_gold_file(valid_lines_10k)

**Generate Data File**

1. Generate the train, dev and test datasets.
2. Generate the basic data files used by KG-BERT.

In [None]:
entity2detail = {}
relation2detail = {}

# Entity details: running id, label text and WordNet definition (long text).
# The synset name is the part of the node id after the first ":".
for idx, node_id in enumerate(unique_nodes):
    definition = wn.synset(node_id.split(":")[1]).definition()
    entity2detail[node_id] = [idx, node2text[node_id], definition]

# Relation details, built the same way; spaces in the relation's synset
# name are replaced by underscores before the lookup.
for idx, relation_id in enumerate(relation2text):
    synset_name = relation_id.split(":")[1].replace(" ", "_")
    definition = wn.synset(synset_name).definition()
    relation2detail[relation_id] = [idx, relation2text[relation_id], definition]

# Reorder the gold rows into the KG-BERT layout:
# [node1_id, relation_id, node2_id, sentence]
kgBert_data_sent = [[row[3], row[5], row[4], row[6]] for row in wc_gold_500k]

In [None]:
# Write the entity vocabulary files:
# entities.txt, entity2id.txt, entity2text.txt, entity2textlong.txt

with open(wc_entity_file, "w", newline='') as names_out, \
        open(wc_entity2id_file, "w", newline='') as ids_out, \
        open(wc_entity2text_file, "w", newline='') as text_out, \
        open(wc_entity2textlong_file, "w", newline='') as long_out:
    names_writer = csv.writer(names_out, delimiter='\t')
    ids_writer = csv.writer(ids_out, delimiter='\t')
    text_writer = csv.writer(text_out, delimiter='\t')
    long_writer = csv.writer(long_out, delimiter='\t')

    # The id file starts with the number of entities.
    ids_writer.writerow([len(entity2detail)])
    for entity, (entity_id, entity_text, entity_textlong) in entity2detail.items():
        names_writer.writerow([entity])
        ids_writer.writerow([entity, entity_id])
        text_writer.writerow([entity, entity_text])
        long_writer.writerow([entity, entity_textlong])

# Write the relation vocabulary files:
# relations.txt, relation2id.txt, relation2text.txt
wc_relation_file = "./data/wc/relations.txt"
wc_relation2id_file = "./data/wc/relation2id.txt"
wc_relation2text_file = "./data/wc/relation2text.txt"
with open(wc_relation_file, "w", newline='') as names_out, \
        open(wc_relation2id_file, "w", newline='') as ids_out, \
        open(wc_relation2text_file, "w", newline='') as text_out:
    names_writer = csv.writer(names_out, delimiter='\t')
    ids_writer = csv.writer(ids_out, delimiter='\t')
    text_writer = csv.writer(text_out, delimiter='\t')

    # The id file starts with the number of relations.
    ids_writer.writerow([len(relation2detail)])
    for relation, (relation_id, relation_text, relation_textlong) in relation2detail.items():
        names_writer.writerow([relation])
        ids_writer.writerow([relation, relation_id])
        text_writer.writerow([relation, relation_text])

In [None]:
# Split the shuffled rows into train, dev and test sets

random.shuffle(kgBert_data_sent)

# Ratios of train and dev; test receives the remainder
train_ratio = 0.8
dev_ratio = 0.1
test_ratio = 1 - train_ratio - dev_ratio

len_gold = len(kgBert_data_sent)
train_end = int(train_ratio * len_gold)
dev_end = int((train_ratio + dev_ratio) * len_gold)

# Plain list slicing with the same boundaries np.split would use.
# np.split would first coerce the rows into one fixed-width unicode array,
# padding every cell to the longest sentence, which is very memory-hungry
# for 500k rows.
train = kgBert_data_sent[:train_end]
dev = kgBert_data_sent[train_end:dev_end]
test = kgBert_data_sent[dev_end:]
print("length of train, dev, test: ", len(train), len(dev), len(test))

# Each row is [node1_id, relation_id, node2_id, sentence]; the sentence
# (last column) is dropped before writing the triple files.

# write train & train2id
write_split_file(wc_train_500k, wc_train2id_500k, [row[:-1] for row in train], entity2detail, relation2detail)

# write dev & dev2id
write_split_file(wc_dev_500k, wc_dev2id_500k, [row[:-1] for row in dev], entity2detail, relation2detail)

# write test & test2id
write_split_file(wc_test_500k, wc_test2id_500k, [row[:-1] for row in test], entity2detail, relation2detail)

In [None]:
# Example row of the train split: [node1_id, relation_id, node2_id, sentence]
train[0]

## Baseline Running

**Random Baseline**

In [None]:
def generate_synsets(labels):
    """Return the adjective synsets of the first label that has any.

    The labels are tried in order with ``wn.synsets(label, pos="a")``.

    Returns:
        ``(synsets, label)`` for the first label with a non-empty result.
        If no label matches, ``([], last_label)``; if ``labels`` is empty,
        ``([], None)``.
    """
    # Pre-bind so that an empty `labels` does not hit an unbound name at
    # the final return.
    label = None
    for label in labels:
        synsets = list(wn.synsets(label, pos="a"))

        if synsets:
            return synsets, label

    return [], label

def generate_candidates(label):
    """Return the candidate synsets for a relation label.

    The label is lemmatized as an adjective, expanded by ``transfer_words``
    into label variants, and the synsets of the first variant that has any
    are returned (an empty list if none does).
    """
    lemma = WordNetLemmatizer().lemmatize(label, pos="a")
    variants = transfer_words(lemma)
    synsets, _matched_variant = generate_synsets(variants)
    return synsets

def MRS(wn_gold):
    """Random baseline: pick a random candidate synset for each relation.

    For every gold row the relation label (column 3) is reduced with
    ``relation_transfer`` and its candidate synsets are looked up; one of
    them is chosen uniformly at random, or ``""`` when there are none.

    Returns a list of ``[entity1, chosen_synset, entity2, reduced_label]``.
    """
    predictions = []
    for row in wn_gold:
        head, tail = row[0], row[2]
        reduced_label = relation_transfer(row[3])

        # Candidate synsets come from the label variants that WordNet knows.
        options = generate_candidates(reduced_label)
        chosen = random.choice(options) if options else ""

        predictions.append([head, chosen, tail, reduced_label])

    return predictions

def relation_transfer(label, indexs=(0, 1, 1, 1), str_=(" than", "be ", "more ", "less ")):
    """Strip comparative scaffolding from a relation label.

    The separators in ``str_`` are applied in order, each paired with the
    index at the same position in ``indexs``. The label is split on the
    separator and replaced by the piece at that index:

    * index 0 always keeps the text before the first occurrence;
    * any other index is applied only when the split produced enough
      pieces, otherwise the label is left unchanged for that step.

    With the defaults, ``"be more quiet than"`` becomes ``"quiet"``.

    Args:
        label: the relation label to reduce.
        indexs: one piece index per separator (immutable default tuple).
        str_: the separators to split on, in order.

    Returns:
        The reduced label with surrounding whitespace stripped.

    Raises:
        IndexError: if ``indexs`` is shorter than ``str_``.
    """
    for position, separator in enumerate(str_ or ()):
        index = indexs[position]
        pieces = label.split(separator)
        if index == 0 or len(pieces) > index:
            label = pieces[index]
    return label.strip()
    
def modify_data(lines, relation2detail):
    """Lazily add the relation label to every data row.

    Each input row is ``(entity1, relation, entity2, sent)``; the yielded
    tuple is ``(entity1, relation, entity2, relation_label, sent)`` where
    the label is item 1 of ``relation2detail[relation]``.
    """
    yield from (
        (head, relation, tail, relation2detail[relation][1], sentence)
        for head, relation, tail, sentence in lines
    )

def validation(wn_predict, wn_gold):
    """Return the fraction of rows whose predicted relation synset is correct.

    ``wn_predict`` rows carry the predicted synset (or ``""`` for "no
    prediction") at index 1; ``wn_gold`` rows carry the gold relation id at
    index 1. The gold synset name is the part of the id after the first
    ":", with spaces replaced by underscores. Rows without a prediction
    count as wrong.

    Returns:
        ``correct / len(wn_predict)``, or ``0.0`` when ``wn_predict`` is
        empty (instead of raising ZeroDivisionError).
    """
    total = len(wn_predict)
    if total == 0:
        return 0.0

    correct = 0
    for predict, actual in zip(wn_predict, wn_gold):
        predicted_synset = predict[1]
        if not predicted_synset:
            # Nothing was predicted: skip the WordNet lookup entirely.
            continue

        gold_synset = wn.synset(actual[1].split(":")[1].replace(" ", "_"))
        if predicted_synset == gold_synset:
            correct += 1

    return correct / total

In [None]:
# Attach the relation label to every row of the train, dev and test splits
train_modify, dev_modify, test_modify = (
    list(modify_data(split_rows, relation2detail))
    for split_rows in (train, dev, test)
)

In [None]:
# Random baseline on the train split: predict with MRS, score with validation
wc_ran_train_predict = MRS(train_modify)
accuracy = validation(wc_ran_train_predict, train_modify)
print("Accuracy of train dataset (random baseline): ", accuracy)

# Same procedure on the dev split (`accuracy` is overwritten each time)
wc_ran_dev_predict = MRS(dev_modify)
accuracy = validation(wc_ran_dev_predict, dev_modify)
print("Accuracy of dev dataset (random baseline): ", accuracy)

# Same procedure on the test split
wc_ran_test_predict = MRS(test_modify)
accuracy = validation(wc_ran_test_predict, test_modify)
print("Accuracy of test dataset (random baseline): ", accuracy)

**MFS Baseline**

In [None]:
def MFS(wn_gold):
    """Frequency baseline: always take the first candidate synset.

    For every gold row the relation label (column 3) is reduced with
    ``relation_transfer`` and its candidate synsets are looked up; the
    first candidate is used, or ``""`` when there are none.

    Returns a list of ``[entity1, chosen_synset, entity2, reduced_label]``.
    """
    predictions = []
    for row in wn_gold:
        head, tail = row[0], row[2]
        reduced_label = relation_transfer(row[3])

        # Candidate synsets come from the label variants that WordNet knows.
        options = generate_candidates(reduced_label)
        chosen = options[0] if options else ""

        predictions.append([head, chosen, tail, reduced_label])
    return predictions

In [None]:
# Frequency baseline on the train split: predict with MFS, score with validation.
# NOTE: the results are stored in the wc_ran_* names, which overwrites the
# random-baseline predictions computed earlier.
wc_ran_train_predict = MFS(train_modify)
accuracy = validation(wc_ran_train_predict, train_modify)
print("Accuracy of train dataset (frequency baseline): ", accuracy)

# Same procedure on the dev split
wc_ran_dev_predict = MFS(dev_modify)
accuracy = validation(wc_ran_dev_predict, dev_modify)
print("Accuracy of dev dataset (frequency baseline): ", accuracy)

# Same procedure on the test split
wc_ran_test_predict = MFS(test_modify)
accuracy = validation(wc_ran_test_predict, test_modify)
print("Accuracy of test dataset (frequency baseline): ", accuracy)

**STBert Baseline**

In [None]:
# Sentence encoder for the STBert baseline.
# NOTE(review): SentenceTransformer is not among this notebook's visible imports --
# presumably it is provided by `from util import *`; verify.
model_STB = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
def vector_normalize(vector):
    """Scale a numpy array to unit Euclidean length.

    The norm is taken over all elements of the array. A zero vector is not
    special-cased (the division by zero is left to numpy).
    """
    squared_total = (vector ** 2).sum()
    length = squared_total ** 0.5
    return vector / length

def max_candidate_faiss(label_, sent_embedding_, label_embeddings):
    """Return the candidate synset nearest to a sentence embedding.

    Args:
        label_: the reduced relation label to look up.
        sent_embedding_: numpy embedding of the sentence.
        label_embeddings: mapping ``label -> (synsets, index)`` where
            ``index`` offers ``search(queries, k)`` and holds one vector
            per entry of ``synsets``, in the same order.

    Returns:
        The synset whose vector is the top-1 search hit for the normalized
        sentence embedding, or ``""`` when the label has no entry (i.e. no
        synsets were found for it).
    """
    # Cheap membership check first: no need to normalize when nothing can
    # be searched.
    if label_ not in label_embeddings:
        return ""

    entry = label_embeddings[label_]
    synsets, index_ = entry[0], entry[1]

    query = np.array([vector_normalize(sent_embedding_)])
    top_n = 1
    _, nearest = index_.search(query, top_n)

    # `nearest` has shape (1, top_n). Index it explicitly: calling int() on
    # an array with ndim > 0 is deprecated in NumPy.
    return synsets[int(nearest[0][0])]

def candidates_embeddings_faiss(wn_gold, model):
    """Build one faiss index of candidate embeddings per relation label.

    For each gold row the relation label (column 3) is reduced with
    ``relation_transfer`` and handed to ``label2sentence2sent``, which
    accumulates ``(label, synset)`` pairs and the matching sentences to
    encode. All sentences are encoded in one ``model.encode`` call; the
    first embedding seen for each (label, synset) is kept, normalized to
    unit length and added to an ``IndexFlatL2`` for that label.

    Returns:
        ``{label: (synsets, index)}`` where ``synsets`` is the list of
        candidate synsets and ``index`` the faiss index holding their
        vectors in the same order. An empty dict is returned when there is
        nothing to encode.
    """
    # (label, synset) pairs and, aligned with them, the sentences to encode
    label_synsets = []
    sents_combine = []

    for line in wn_gold:
        relation_label = relation_transfer(line[3])
        label_synsets, sents_combine = label2sentence2sent(
            relation_label, model, label_synsets, sents_combine
        )

    # Without candidate sentences, sents_embed[0] below would raise IndexError.
    if len(sents_combine) == 0:
        return {}

    sents_embed = model.encode(sents_combine)
    # Dimension of the vectors stored in faiss
    d = len(sents_embed[0])

    # label -> {synset: embedding}; the first embedding of a synset wins
    embeddings = {}
    for (label, synset), embed in zip(label_synsets, sents_embed):
        embeddings.setdefault(label, {}).setdefault(synset, embed)

    # One flat L2 index per label, filled with unit-length vectors
    indexed = {}
    for label, label_embeds in embeddings.items():
        index_ = faiss.IndexFlatL2(d)
        sub_labelId = list(label_embeds)
        sub_embeds = [vector_normalize(embed) for embed in label_embeds.values()]
        index_.add(np.array(sub_embeds))
        indexed[label] = (sub_labelId, index_)

    return indexed

def sentence_embedding_faiss(wn_gold, model, label_embeddings = None):
    """Predict the relation synset of each row from its sentence embedding.

    The sentence (column 4) of every row is encoded with ``model.encode``.
    The relation label (column 3) is reduced with ``relation_transfer`` and
    ``max_candidate_faiss`` returns the candidate nearest to the sentence
    embedding, or ``""`` when the label has no candidates.

    Args:
        wn_gold: rows of ``(entity1, relation, entity2, label, sentence)``.
        model: encoder offering ``encode(list_of_sentences)``.
        label_embeddings: ``{label: (synsets, index)}``. ``None`` is
            treated as "no candidates at all" (every prediction is ``""``)
            rather than crashing in the lookup.

    Returns:
        A list of ``[entity1, predicted_synset, entity2, label, sentence]``.
    """
    if label_embeddings is None:
        label_embeddings = {}

    # Nothing to encode for an empty chunk.
    if len(wn_gold) == 0:
        return []

    sents_embedding = model.encode([line[4] for line in wn_gold])

    wn_predict = []
    for line, sent_embedding in zip(wn_gold, sents_embedding):
        relation_label = relation_transfer(line[3])

        # Most similar candidate synset for this relation label
        relation_id = max_candidate_faiss(relation_label, sent_embedding, label_embeddings)

        wn_predict.append([line[0], relation_id, line[2], line[3], line[4]])
    return wn_predict

def chunks_divide(data, num=10000):
    """Yield consecutive slices of ``data`` holding at most ``num`` items."""
    yield from (data[start:start + num] for start in range(0, len(data), num))

def process_data_inChunk(data, model, chunk_num = 10000):
    """Run the sentence-embedding baseline over ``data`` chunk by chunk.

    For every chunk of at most ``chunk_num`` rows the candidate indexes are
    built and the predictions made; timing and progress are printed per
    chunk. Accuracy is computed once over all predictions at the end.

    Returns:
        ``(predictions, accuracy)``.
    """
    run_started = time.time()
    processed = 0
    collected = []

    for batch in chunks_divide(data, num=chunk_num):
        batch_started = time.time()

        # Candidate indexes are rebuilt from the labels of this chunk only.
        batch_index = candidates_embeddings_faiss(batch, model)
        collected += sentence_embedding_faiss(batch, model, label_embeddings=batch_index)

        processed += len(batch)
        usedtime = time.time() - batch_started
        print(f"The time used for this iteration: {usedtime}, finished lines {processed}/{len(data)}")

    accuracy = validation(collected, data)

    total_time = time.time() - run_started
    print(f"Process finished! Total time: {total_time}")
    return collected, accuracy

In [None]:
#wc_ran_train_predict, accuracy = process_data_inChunk(dev_modify[:20000], model_STB, chunk_num = 10000)

In [None]:
# STBert baseline on the train split, processed in chunks of 10000 rows
wc_stb_train_predict, accuracy = process_data_inChunk(train_modify, model_STB, chunk_num = 10000)
print("Accuracy of train dataset (STB baseline): ", accuracy)
print("\n")
# Same procedure on the dev split (`accuracy` is overwritten each time)
wc_stb_dev_predict, accuracy = process_data_inChunk(dev_modify, model_STB, chunk_num = 10000)
print("Accuracy of dev dataset (STB baseline): ", accuracy)
print("\n")
# Same procedure on the test split
wc_stb_test_predict, accuracy = process_data_inChunk(test_modify, model_STB, chunk_num = 10000)
print("Accuracy of test dataset (STB baseline): ", accuracy)

**STRoberta Baseline**

In [None]:
# Sentence encoder for the STRoberta baseline.
# NOTE(review): SentenceTransformer is not among this notebook's visible imports --
# presumably it is provided by `from util import *`; verify.
model_STR = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [None]:
# STRoberta baseline on the train split, processed in chunks of 10000 rows
wc_str_train_predict, accuracy = process_data_inChunk(train_modify, model_STR, chunk_num = 10000)
print("Accuracy of train dataset (STR baseline): ", accuracy)
print("\n")
# Same procedure on the dev split (`accuracy` is overwritten each time)
wc_str_dev_predict, accuracy = process_data_inChunk(dev_modify, model_STR, chunk_num = 10000)
print("Accuracy of dev dataset (STR baseline): ", accuracy)
print("\n")

# Same procedure on the test split
wc_str_test_predict, accuracy = process_data_inChunk(test_modify, model_STR, chunk_num = 10000)
print("Accuracy of test dataset (STR baseline): ", accuracy)