## Setup file paths

In [None]:
# Input files: KGTK edge files read by the notebook.
kgtk_wn_file = "kgtk_wordnet.tsv"
kgtk_cn_file = "kgtk_conceptnet.tsv"

# Output files: gold data and the per-method prediction files written below.
wn_gold_file = "wn_gold_all.tsv"
wn_gold_200_file = "wn_gold_200.tsv"
wn_mrs_prediction_file = "wn_MRS_200.tsv"
wn_mfs_prediction_file = "wn_MFS_200.tsv"
wn_stb_prediction_file = "wn_STB_200.tsv"
wn_str_prediction_file = "wn_STR_200.tsv"
cn_test_1k_file = "cn_test_1k.tsv"
cn_prediction_file = "cn_predict_1k.tsv"

## Generate wn_gold_all.tsv file

In [None]:
from util import *
import matplotlib.pyplot as plt

In [None]:
head, lines = load_file(kgtk_wn_file)

In [None]:
# what does the head look like
head

In [None]:
wn_gold_all = generate_gold_file(lines)
# example of new dataset
wn_gold_all[0]

In [None]:
#write gold data into file
write_gold(wn_gold_file,wn_gold_all)

## statistics calculation

In [None]:
def distribution(wn_gold_all):
    """Count how many distinct node ids each label is attached to.

    Every record is read positionally: index 0 and 2 hold the two labels,
    index 3 and 4 hold the ids belonging to those labels.

    Args:
        wn_gold_all: iterable of indexable records with at least 5 fields.

    Returns:
        dict mapping each label to the number of distinct ids seen with it,
        in order of first appearance.
    """
    ids_by_label = {}
    for line in wn_gold_all:
        # setdefault hands back the set already stored for the label, so a
        # record whose two labels are identical contributes both of its ids
        # to the same set instead of one set overwriting the other.
        ids_by_label.setdefault(line[0], set()).add(line[3])
        ids_by_label.setdefault(line[2], set()).add(line[4])

    return {label: len(ids) for label, ids in ids_by_label.items()}

In [None]:
# Histogram (log-scaled counts) of the number of distinct ids per label,
# followed by summary statistics of the gold data.
distri = distribution(wn_gold_all)
# plt.hist expects an array or a sequence; a dict view is neither, so it is
# materialised as a list before plotting.
plt.hist(list(distri.values()), log=True)
print("mean ambiguity of label:", sum(distri.values())/len(distri), "size of records:", len(wn_gold_all), "num of distinct labels:", len(distri))

## Random Pick

In [None]:
# randomly pick 200 distinct records
# random.sample draws without replacement, so no record can appear twice in
# the evaluation set; min() keeps it valid when fewer than 200 records exist.
wn_gold_200 = random.sample(wn_gold_all, k=min(200, len(wn_gold_all)))

In [None]:
#write gold data into file
write_gold(wn_gold_200_file,wn_gold_200)

## Get synsets
**The detailed implementation is in util.py ("get synsets part")**

In [None]:
# obtain the sysets of one phrase by WordNet interface
generate_candidates("far cry")

## Random Baseline

In [None]:
def MRS(wn_gold):
    """Random baseline: pick one candidate at random for each label.

    For every record the two labels (index 0 and 2) are passed to
    generate_candidates; a random element of each returned candidate list
    becomes the predicted id, or "" when the list is empty.

    Returns:
        list of [label1, relationship, label2, node1_id, node2_id].
    """
    predictions = []
    for record in wn_gold:
        subject, relation, obj = record[0], record[1], record[2]

        # Both candidate lists are fetched before any random draw is made.
        subject_pool = generate_candidates(subject)
        object_pool = generate_candidates(obj)

        subject_id = random.choice(subject_pool) if subject_pool else ""
        object_id = random.choice(object_pool) if object_pool else ""

        predictions.append([subject, relation, obj, subject_id, object_id])

    return predictions

In [None]:
wn_predict_200 = MRS(wn_gold_200)
wn_predict_200[0]

In [None]:
def validation(wn_predict,wn_gold):
    """Score predictions against the gold records.

    Predictions and gold records are paired positionally. The predicted ids
    (index 3 and 4) are converted with synset2str and compared with the gold
    ids at the same positions.

    Returns:
        (accuracy1, accuracy2) where
        accuracy1 is the fraction of individual labels predicted correctly
        (two labels per record), and
        accuracy2 is the fraction of records whose two labels are both correct.
        Both are 0.0 when there are no predictions.
    """
    total = len(wn_predict)
    if total == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0, 0.0

    correct1 = 0
    correct2 = 0
    for predict, actual in zip(wn_predict, wn_gold):
        first_ok = synset2str(predict[3]) == actual[3]
        second_ok = synset2str(predict[4]) == actual[4]

        if first_ok:
            correct1 += 1
        if second_ok:
            correct1 += 1
        # a record only counts as correct when both of its labels match
        if first_ok and second_ok:
            correct2 += 1

    return correct1/(total*2), correct2/total

In [None]:
write_prediction(wn_mrs_prediction_file, wn_predict_200)
accuracy1, accuracy2 = validation(wn_predict_200,wn_gold_200)
accuracy1, accuracy2

## Frequent Baseline

In [None]:
def MFS(wn_gold):
    """Frequent baseline: take the first candidate for each label.

    For every record the two labels (index 0 and 2) are passed to
    generate_candidates; the first element of each returned list becomes the
    predicted id, or "" when the list is empty.
    NOTE(review): presumably generate_candidates orders candidates by sense
    frequency, which is what makes the first one the "frequent" choice - confirm
    in util.py.

    Returns:
        list of [label1, relationship, label2, node1_id, node2_id].
    """
    predictions = []
    for record in wn_gold:
        subject, relation, obj = record[0], record[1], record[2]

        subject_pool = generate_candidates(subject)
        object_pool = generate_candidates(obj)

        subject_id = subject_pool[0] if subject_pool else ""
        object_id = object_pool[0] if object_pool else ""

        predictions.append([subject, relation, obj, subject_id, object_id])

    return predictions

In [None]:
wn_predict_200 = MFS(wn_gold_200)
wn_predict_200[:3]

In [None]:
write_prediction(wn_mfs_prediction_file, wn_predict_200)
accuracy1, accuracy2 = validation(wn_predict_200,wn_gold_200)
accuracy1, accuracy2

## sentence-transformer-bert

In [None]:
# STB stands for Sentence-Transformer BERT, so this must be the BERT checkpoint.
model_STB = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

In [None]:
# collect the distinct relationship values (second field) in the gold data
relationships = {line[1] for line in wn_gold_all}

In [None]:
relationships

In [None]:
word2sentence = {'/r/IsA':"is a", '/r/MadeOf': "is made of",'/r/PartOf':"is part of"}

In [None]:
def sentence_embedding(wn_gold, model=None, label_embeddings=None, word2sentence=None):
    """Predict ids by comparing a record's sentence embedding with candidates.

    Each record is turned into a sentence with line_sentence, all sentences
    are encoded in one model.encode call, and for both labels of a record
    (index 0 and 2) max_candidate chooses the id using that record's
    sentence embedding and label_embeddings.

    Args:
        wn_gold: records to predict for; iterated twice.
        model: object exposing encode(list_of_sentences).
        label_embeddings: passed through unchanged to max_candidate.
        word2sentence: passed through unchanged to line_sentence.

    Returns:
        list of [label1, relationship, label2, node_id1, node_id2].
    """
    sentences = [line_sentence(record, word2sentence) for record in wn_gold]
    encoded = model.encode(sentences)

    predictions = []
    for record, vector in zip(wn_gold, encoded):
        subject, obj = record[0], record[2]

        # the same sentence embedding is used to rank candidates of both labels
        subject_id = max_candidate(subject, vector, label_embeddings)
        object_id = max_candidate(obj, vector, label_embeddings)

        predictions.append([subject, record[1], obj, subject_id, object_id])

    return predictions

In [None]:
label_embeddings = candidates_embeddings(wn_gold_200, model_STB)

In [None]:
wn_predict_200 = sentence_embedding(wn_gold_200, model = model_STB, label_embeddings = label_embeddings, word2sentence = word2sentence)

In [None]:
write_prediction(wn_stb_prediction_file, wn_predict_200)
accuracy1, accuracy2 = validation(wn_predict_200,wn_gold_200)
accuracy1, accuracy2

## sentence-transformer-roberta

In [None]:
# STR stands for Sentence-Transformer RoBERTa, so this must be the RoBERTa checkpoint.
model_STR = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [None]:
label_embeddings = candidates_embeddings(wn_gold_200, model_STR)

In [None]:
wn_predict_200 = sentence_embedding(wn_gold_200, model = model_STR, label_embeddings = label_embeddings, word2sentence = word2sentence)

In [None]:
write_prediction(wn_str_prediction_file, wn_predict_200)
accuracy1, accuracy2 = validation(wn_predict_200,wn_gold_200)
accuracy1, accuracy2

## Compare

In [None]:
# Repeat the comparison n times on fresh random samples and record, for each
# method, the per-label accuracy (first value returned by validation).
accuracy_MRS = []
accuracy_MFS = []
accuracy_STB = []
accuracy_STR = []
n = 10
for i in range(n):
    print("\r", i, end="")
    # Sample without replacement so that no record is scored twice within a
    # run; min() keeps the call valid when fewer than 200 records exist.
    wn_gold_200 = random.sample(wn_gold_all, k=min(200, len(wn_gold_all)))

    # random baseline
    wn_predict_200 = MRS(wn_gold_200)
    accuracy1, accuracy2 = validation(wn_predict_200, wn_gold_200)
    accuracy_MRS.append(accuracy1)

    # frequent baseline
    wn_predict_200 = MFS(wn_gold_200)
    accuracy1, accuracy2 = validation(wn_predict_200, wn_gold_200)
    accuracy_MFS.append(accuracy1)

    # sentence embeddings with model_STB
    label_embeddings = candidates_embeddings(wn_gold_200, model_STB)
    wn_predict_200 = sentence_embedding(wn_gold_200, model = model_STB, label_embeddings = label_embeddings, word2sentence = word2sentence)
    accuracy1, accuracy2 = validation(wn_predict_200, wn_gold_200)
    accuracy_STB.append(accuracy1)

    # sentence embeddings with model_STR
    label_embeddings = candidates_embeddings(wn_gold_200, model_STR)
    wn_predict_200 = sentence_embedding(wn_gold_200, model = model_STR, label_embeddings = label_embeddings, word2sentence = word2sentence)
    accuracy1, accuracy2 = validation(wn_predict_200, wn_gold_200)
    accuracy_STR.append(accuracy1)
    

In [None]:
# One x position per recorded iteration. The count is taken from the results
# themselves so the plot cannot fall out of step with the comparison loop.
n = len(accuracy_MRS)
x_axis = range(n)
plt.plot(x_axis, accuracy_MRS, color='green', label='MRS')
plt.plot(x_axis, accuracy_MFS, color='red', label='MFS')
plt.plot(x_axis, accuracy_STB, color='skyblue', label='STB')
plt.plot(x_axis, accuracy_STR, color='blue', label='STR')
plt.legend()

plt.xlabel('iteration times')
plt.ylabel('accuracy')
plt.show()

## ConceptNet graph

In [None]:
head, lines = load_file(kgtk_cn_file)
# Pick 1000 distinct lines: random.sample draws without replacement, so no
# line is included twice; min() keeps it valid for files with fewer lines.
lines_1k = random.sample(lines, k=min(1000, len(lines)))

In [None]:
cn_test_1k = generate_gold_file(lines_1k)

In [None]:
cn_predict_1k = MFS(cn_test_1k)
# write prediction
write_prediction(cn_prediction_file, cn_predict_1k)
cn_predict_1k[0]

# Frequency Method

In [None]:
def no_synset_prob(cn_predict_1k):
    """Count predictions that carry no synset.

    A predicted id (index 3 or 4 of a record) is treated as missing when
    synset2str of it equals "wn:".

    Returns:
        (count1, count2) where
        count1 is the number of labels without a synset (0-2 per record), and
        count2 is the number of records with at least one such label.
        These are raw counts; callers divide them to obtain rates.
    """
    count1 = 0
    count2 = 0
    for record in cn_predict_1k:
        missing = [synset2str(record[3]) == "wn:", synset2str(record[4]) == "wn:"]

        count1 += sum(missing)
        if any(missing):
            count2 += 1

    return count1, count2


# The notebook cells call this helper as no_synset_count; expose that name too
# so those calls do not raise NameError.
no_synset_count = no_synset_prob

In [None]:
# missing probability
# no_synset_prob is the name under which the counting helper is defined.
count1, count2 = no_synset_prob(cn_predict_1k)
# count1 counts labels and every record has two of them, so the per-label
# rate is normalised by 2 * number of records; count2 is per record.
count1/(len(cn_predict_1k)*2), count2/len(cn_predict_1k)

## sentence-transformer-roberta for ConceptNet

In [None]:
label_embeddings = candidates_embeddings(cn_test_1k, model_STR)

In [None]:
# distinct relationship values (second field) over all loaded lines
relationships = {line[1] for line in lines}

relationships

In [None]:
# Maps each relation identifier to the English phrase that is handed to
# sentence_embedding (and from there to line_sentence) as word2sentence.
# NOTE(review): some phrases carry leading/trailing spaces or a trailing comma
# (e.g. " is defined as", "has genre,"); whether that affects the generated
# sentence depends on how line_sentence joins the parts - confirm in util.py.
word2sentence = {'/r/Antonym':"is antonym for", 
                 '/r/AtLocation': "is located at",
                 '/r/CapableOf':"is capable of",
                '/r/Causes':"causes",
                '/r/CausesDesire':"causes the desire of",
                '/r/CreatedBy':"is created by",
                '/r/DefinedAs': " is defined as",
                '/r/DerivedFrom': "is derived from",
                '/r/Desires':"desires",
                '/r/DistinctFrom':"is distinct from",
                "/r/Entails":"entails",
                '/r/EtymologicallyDerivedFrom':"is etymologically derived from",
                '/r/EtymologicallyRelatedTo': "is etymologically related to",
                '/r/FormOf':"is form of",
                '/r/HasA': "has a",
                '/r/HasContext': "has the context of",
                '/r/HasFirstSubevent': "has first subevent, ",
                '/r/HasLastSubevent':"has last subevent, ",
                '/r/HasPrerequisite': "has prerequisite, ",
                '/r/HasProperty': "has property, ",
                '/r/HasSubevent': "has subevent, ",
                '/r/InstanceOf': " is an instance of",
                '/r/IsA': "is a",
                '/r/LocatedNear': "is located nearby",
                '/r/MadeOf': "is made of",
                '/r/MannerOf':"has a manner of",
                '/r/MotivatedByGoal': "is motivated by goal",
                '/r/NotCapableOf': "is not capable of",
                '/r/NotDesires':"does not desire",
                '/r/NotHasProperty':"does not have property, ",
                '/r/PartOf': "is part of",
                '/r/ReceivesAction':"receives the action, ",
                '/r/RelatedTo':"is related to",
                '/r/SimilarTo':"is similar to",
                '/r/SymbolOf':"is a symbol of",
                '/r/Synonym':"is synonym for",
                '/r/UsedFor':"is used for",
                '/r/dbpedia/capital': "is the capital of",
                '/r/dbpedia/field':" is the field of",
                '/r/dbpedia/genre':"has genre,",
                '/r/dbpedia/genus':"has genus, ",
                '/r/dbpedia/influencedBy':"is influenced by",
                '/r/dbpedia/knownFor': "is known for",
                '/r/dbpedia/language':"is the language ",
                '/r/dbpedia/leader':"has the leader, ",
                '/r/dbpedia/occupation':"has the occupation, ",
                '/r/dbpedia/product':"has the product, "}

# Predict with the sentence-embedding method, store the predictions and
# report how often no synset was found.
cn_predict_1k = sentence_embedding(cn_test_1k, model = model_STR, label_embeddings = label_embeddings, word2sentence = word2sentence)
write_prediction(cn_prediction_file, cn_predict_1k)
# Both results are bound to the names used on the next line, so the reported
# rates come from these predictions and not from an earlier cell's variables.
# no_synset_prob is the name under which the counting helper is defined.
count1, count2 = no_synset_prob(cn_predict_1k)
# count1 counts labels (two per record), hence the 2 * records denominator.
count1/(len(cn_predict_1k)*2), count2/len(cn_predict_1k)

In [None]:
# Histogram (log-scaled counts, 30 bins) of the number of distinct ids per
# label in the sampled records, followed by summary statistics.
distri = distribution(cn_test_1k)
# plt.hist expects an array or a sequence; a dict view is neither, so it is
# materialised as a list before plotting.
plt.hist(list(distri.values()), log=True, bins=30)
print("mean ambiguity of label:", sum(distri.values())/len(distri), "size of records:", len(cn_test_1k),"num of distinct labels:", len(distri))