In [462]:
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss, math
from numpy import dot
from numpy.linalg import norm
import random

## Folder Set Up

In [514]:
# input
cskg_embeddings_file="./cskg_embedding/cskg_embeddings.txt"
cskg_connected_file="../kg-bert/data/cskg/cskg_connected.tsv"
RICA_file="./RICA/RICA_material_KnowledgeTable.csv"

# output
Most5_RICA_line = "./cskg_embedding/Most5_RICA_line.txt"
cskg_embedding_bert="./cskg_embedding/cskg_embedding_bert.tsv"
sample_1k_lines="./RICA/sampled_1k_sentences.txt"

## Data Process

In [86]:
# load bert model
model = SentenceTransformer('nli-bert-large')

In [4]:
# load cskg embedding file
with open(cskg_embeddings_file,"r") as f:
    head= f.readline().strip().split("\t")
    
    # obtain embedding_sentence in file
    cskg_word_embeddings=dict()
    
    for item in tqdm(f):
        # obtain list of line
        line=item.strip().split("\t")
        
        # only property is text embedding can obtain embeddings
        word=line[0]
        prop=line[1]
        embedding=line[2]
        
        if prop=="text_embedding":
            cskg_word_embeddings[word]=embedding

4322096it [00:52, 82092.47it/s]


In [5]:
# load cskg file

with open(cskg_connected_file, "r") as f:
    head = f.readline().strip().split("\t")
    
    # load lines only contain relation==HasProperty
    lines_HasProperty=[]
    
    for item in f:
        line = item.strip().split("\t")
        relation_id=line[2]
        
        if relation_id == "/r/HasProperty":
            lines_HasProperty.append(line)

In [6]:
# obtain embedding for each edges
lines_HasProperty_embeddings=[]

for line in lines_HasProperty:
    node1=line[1]
    node2=line[2]
    
    node1_embedding=np.array(eval("["+cskg_word_embeddings[node1]+"]"))
    node2_embedding=np.array(eval("["+cskg_word_embeddings[node2]+"]"))
    E=(node1_embedding+node2_embedding)/2
    # normalize
    normalize_E=E/(math.sqrt(sum(E**2)))
    lines_HasProperty_embeddings.append(normalize_E)

# edges embedding for lines having relation, HasProperty
lines_HasProperty_embeddings=np.array(lines_HasProperty_embeddings).astype('float32')

In [7]:
# check nomalized result
sum(lines_HasProperty_embeddings[0]**2)

1.0000000007181535

In [8]:
# load RICA dataset and build sentence
# load RICA dataset
with open(RICA_file,"r") as f:
    head = f.readline().strip().split(",")
    RICA_lines=[]
    
    for item in f:
        line = item.strip().split(",")
        RICA_lines.append(line)
        
# build sentence
# example : Glas is [MASK] transparent than wood
ModifySent_lines=[]
for line in RICA_lines:
    sent=f"{line[0]} is [mask] {line[3]} than {line[1]}"
    ModifySent_lines.append(sent)

In [9]:
# example of first line
ModifySent_lines[0]

'glass is [mask] transparent than plastic'

In [10]:
# obtain the embeddings
model_embeddings=model.encode(ModifySent_lines)

In [11]:
# normalize the embeddings
model_embeddings=np.array([S/(math.sqrt(sum(S**2))) for S in model_embeddings])

## Node Embedding vs Sentence embedding Similarity

In [12]:
# use faiss to find neareast
d= model_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(lines_HasProperty_embeddings)

In [14]:
# Take the most similar 5 edges
k = 5
D, I = index.search(model_embeddings, k) 

In [15]:
# calculate the result
edges_result=[]
for loc in I:
    idx=loc[0]
    edge=lines_HasProperty[idx]
    edges_result.append(edge[1:4])

In [16]:
# calculate accuracy
count=0
for i in range(len(RICA_lines)):
    
    line = RICA_lines[i]
    predict_result=edges_result[i]
    
    if line[0] in predict_result[0] and line[3] in predict_result[2]:
        count +=1
        
print(count/len(RICA_lines))

0.1111111111111111


In [17]:
with open("./cskg_embedding/result.txt","w") as f:
    for i in range(len(RICA_lines)):
        line = RICA_lines[i]
        predict_result=edges_result[i]
        status=0
        if line[0] in predict_result[0] and line[3] in predict_result[2]:
            status=1
        cskg_embed=lines_HasProperty_embeddings[i]
        model_embed=model_embeddings[i]
        similar = dot(cskg_embed, model_embed)/(norm(cskg_embed)*norm(model_embed))
        f.write(",".join(line[:4])+"\n"+",".join(predict_result)+"\n"+f"similar:{similar}\n"+f"result:{status}\n")
        f.write("\n")

In [18]:
len(cskg_word_embeddings)

2161048

## Method:
create two sentences (C1: glass is transparent, C2: sand is transparent), then compute similarity between each of them with CSKG edges, and assign 1 to either C1 or C2 (whichever has a higher probability)

In [27]:
RICA_lines[0]

['glass',
 'plastic',
 'More',
 'transparent',
 '"Material(A',
 ' glass) and Material(B',
 ' plastic)',
 ' so More(transparent(A)',
 ' transparent(B))"']

In [28]:
# build two sentence:
C1_sents=[]
C2_sents=[]

for line in RICA_lines:
    sub=line[0]
    obj=line[1]
    prop=line[3]
    C1=f"{sub} is {prop}"
    C2=f"{obj} is {prop}"
    
    C1_sents.append(C1)
    C2_sents.append(C2)

In [29]:
C1_sents[:5]

['glass is transparent',
 'glass is transparent',
 'glass is transparent',
 'glass is transparent',
 'glass is transparent']

In [30]:
# transfer sents to embedding

C1_sents_embedding=model.encode(C1_sents)
C2_sents_embedding=model.encode(C2_sents)

In [31]:
C1_sents_embedding=np.array([S/(math.sqrt(sum(S**2))) for S in C1_sents_embedding])
C2_sents_embedding=np.array([S/(math.sqrt(sum(S**2))) for S in C2_sents_embedding])

In [32]:
# find the closest edges

k = 1
D_C1, I_C1 = index.search(C1_sents_embedding, k)

k = 1
D_C2, I_C2 = index.search(C2_sents_embedding, k)

In [33]:
# check each sentence (More or Less)
MoreOrLess=[]
edges_c1=[]
edges_c2=[]

for idx_c1, idx_c2, c1_embed, c2_embed in zip(I_C1,I_C2,C1_sents_embedding,C2_sents_embedding):
    edge1=lines_HasProperty_embeddings[idx_c1[0]]
    edge2=lines_HasProperty_embeddings[idx_c2[0]]
    
    edges_c1.append(edge1)
    edges_c2.append(edge2)
    
    # claculate similarity
    similar1=dot(edge1, c1_embed)/(norm(edge1)*norm(c1_embed))
    
    similar2=dot(edge2, c2_embed)/(norm(edge2)*norm(c2_embed))
    
    if similar1 > similar2:
        MoreOrLess.append("More")
        
    else:
        MoreOrLess.append("Less")

In [34]:
# check accuracy
count = 0

for line, res in zip(RICA_lines, MoreOrLess):
    ground = line[2]
    
    if ground == res:
        count +=1
        
count/len(RICA_lines)

0.5396825396825397

In [35]:
RICA_lines[0]

['glass',
 'plastic',
 'More',
 'transparent',
 '"Material(A',
 ' glass) and Material(B',
 ' plastic)',
 ' so More(transparent(A)',
 ' transparent(B))"']

## Method
take each HasProperty edge from CSKG

Lexicalize it into a sentence

Embed it with bert-nli-large

Then use this embedding instead of the average of the node embeddings


In [36]:
# build sentence
sents=[]
for line in lines_HasProperty:
    sent=f"{line[4]} {line[6]} {line[5]}"
    sents.append(sent)

In [37]:
model_HasProperty_embed=model.encode(sents)

In [38]:
# normalize the embeddings
model_HasProperty_embed=np.array([S/(math.sqrt(sum(S**2))) for S in model_HasProperty_embed])

In [39]:
model_HasProperty_embed[0]

array([-0.00923568, -0.01059276,  0.02301931, ..., -0.02621119,
       -0.02785043, -0.03425661], dtype=float32)

In [40]:
# use faiss to find neareast
d= model_HasProperty_embed.shape[1]
index = faiss.IndexFlatL2(d)
index.add(model_HasProperty_embed)

In [41]:
# find the closest edges

k = 1
D_C1, I_C1 = index.search(C1_sents_embedding, k)

k = 1
D_C2, I_C2 = index.search(C2_sents_embedding, k)

In [42]:
with open("./cskg_embedding/result.txt","w") as f:
    MoreOrLess=[]
    for idx1, idx2,sent_embed1, sent_embed2, rica_line in zip(I_C1,I_C2,C1_sents_embedding,C2_sents_embedding,ModifySent_lines):
        idx1=idx1[0]
        idx2=idx2[0]
        sent1=sents[idx1]
        sent2=sents[idx2]
        
        edge1_embed=model_HasProperty_embed[idx1]
        edge2_embed=model_HasProperty_embed[idx2]
        
        # claculate similarity
        similar1=dot(edge1_embed, sent_embed1)/(norm(edge1_embed)*norm(sent_embed1))
        similar2=dot(edge2_embed, sent_embed2)/(norm(edge2_embed)*norm(sent_embed2))
        
        if similar1 > similar2:
            status="More"
        else:
            status="Less"
        
        MoreOrLess.append(status)
        f.write(f"Original Sentence: {rica_line}\n")
        f.write(sent1+"\n"+sent2+"\n")
        f.write(f"Similar1:{similar1}, Similar2:{similar2}\n")
        f.write(status+"\n")
        f.write("\n")

In [43]:
# check accuracy
count = 0

for line, res in zip(RICA_lines, MoreOrLess):
    ground = line[2]
    
    if ground == res:
        count +=1
        
count/len(RICA_lines)

0.8650793650793651

## Method

Take the entire RICA with probes P

Lexicalize the entire graph and compute embeddings with bert nli large: you should do this once, and maybe you have already done it, and just save the file

Load the graph edge embeddings into a faiss index

For a given probe, take the most similar 5 edges


In [44]:
# use faiss to find 5 neareast
d= model_HasProperty_embed.shape[1]
index = faiss.IndexFlatL2(d)
index.add(model_HasProperty_embed)
k = 5

In [45]:
# save embedding to desk
with open(cskg_embedding_bert,"w") as f:
    for line, embed in zip(lines_HasProperty,model_HasProperty_embed):
        f.write(",".join([line[1],line[2],line[3]])+":"+"\t"+",".join([str(_) for _ in embed])+"\n")

In [46]:
D, I = index.search(model_embeddings, k)

In [47]:
with open(Most5_RICA_line,"w") as f:
    for i in range(len(RICA_lines)):
        RICA_line=ModifySent_lines[i]
        idxs_=I[i]
        f.write("RICA line: "+RICA_line+"\n")
        
        for idx in idxs_:
            Similar_line=",".join(lines_HasProperty[idx][1:4])
            f.write(Similar_line+"\n")
            
        f.write("\n")

## Method 
Spacy

In [48]:
import spacy

In [49]:
nlp = spacy.load("en_core_web_sm")

In [50]:
# example of first line
ModifySent_lines[0]

'glass is [mask] transparent than plastic'

In [474]:
def elements_extraction(line):
    line = line.split(",")[-1].strip()
    doc=nlp(line)
    sent= list(doc.sents)[0]
    
    obj1=""
    obj2=""
    aspect=""
    root = sent.root
    lefts= root.lefts
    for left in lefts:
        rel=left.dep_
        if rel in ["nsubj","acomp"]:
            obj1= left
            break
    if not obj1:
        temp=list(sent.noun_chunks)
        if len(temp)>0:
            obj1=temp[0]
        else:
            obj1=""
        
    for token in sent:
        if token.text == "than":
            break
    
    rights=token.rights
    
    for right in rights:
        rel=right.dep_
        if rel in ["pobj","amod"]:
            obj2= right
            break
            
    if not obj2:
        temp=list(sent.noun_chunks)
        if len(temp)>0:
            obj2=temp[-1]
        else:
            obj2=""
        
    rights = root.rights
    
    for right in rights:
        rel=right.dep_
        if rel in ["acomp","attr"]:
            aspect= right
            break
    
    if aspect:
        temp=aspect.i
        for left in aspect.lefts:
            rel=left.dep_
            if rel not in ["advmod"] and left.i < temp:
                temp= left.i
                
            
    try:
        aspect_span= sent[temp:]
        aspect_text=aspect_span.text.split(" than ")[0]
    except:
        aspect_text=""
    
    if obj1:
        obj1=obj1.text
        
    if obj2:
        obj2=obj2.text
    return obj1,obj2, aspect_text

In [475]:
elements_extraction("transparent glass is more transparent than plastic")

('glass', 'plastic', 'transparent')

In [479]:
# check accuracy
count =0
C1s=[]
C2s=[]
for line in RICA_lines:
    sent= f"{line[0]} is {line[2].lower()} {line[3]} than {line[1]}"
    obj1,obj2, aspect=elements_extraction(sent)
    
    if obj1== line[0] and obj2== line[1] and aspect==line[3]:
        count +=1
    else:
        print(obj1, obj2, aspect,sent)
        
    # build two sentence:
    C1_sentence=f"{obj1} is {aspect}"
    C2_sentence=f"{obj2} is {aspect}"
    
    C1s.append(C1_sentence)
    C2s.append(C2_sentence)

glass metal transparent glass is more transparent than metal and not plastic
glass milk good for drink glass is less good for drink than ferment milk
glass gold reflective glass is more very reflective than gold
glass plastic reflective glass is more very reflective than plastic
glass sand reflective glass is more very reflective than sand
glass metal reflective glass is more very reflective than metal
glass silver reflective glass is more very reflective than silver
paper plastic flat paper is more flat than plastic but plastic with rubber band
paper ceramic durable paper is more durable than ceramic or plastic
paper blood lightweight paper is more lightweight than cow blood
metal metal hard metal is more hard than lead


In [480]:
# check accuracy
count/len(RICA_lines)

0.9126984126984127

In [484]:
# transfer sents to embedding

C1_sents_embedding=model.encode(C1_sents)
C2_sents_embedding=model.encode(C2_sents)

C1_sents_embedding=np.array([S/(math.sqrt(sum(S**2))) for S in C1_sents_embedding])
C2_sents_embedding=np.array([S/(math.sqrt(sum(S**2))) for S in C2_sents_embedding])

In [485]:
# use faiss to find neareast
d= model_HasProperty_embed.shape[1]
index = faiss.IndexFlatL2(d)
index.add(model_HasProperty_embed)

In [486]:
# find the closest edges

k = 100
D_C1, I_C1 = index.search(C1_sents_embedding, k)

k = 100
D_C2, I_C2 = index.search(C2_sents_embedding, k)

In [490]:
MoreOrLess=[]
for idx1_, idx2_,sent_embed1, sent_embed2, rica_line in zip(I_C1,I_C2,C1_sents_embedding,C2_sents_embedding,ModifySent_lines):
    idx1=idx1_[0]
    idx2=idx2_[0]
    sent1=sents[idx1]
    sent2=sents[idx2]

    edge1_embed=model_HasProperty_embed[idx1]
    edge2_embed=model_HasProperty_embed[idx2]

    # claculate similarity
    similar1=dot(edge1_embed, sent_embed1)/(norm(edge1_embed)*norm(sent_embed1))
    similar2=dot(edge2_embed, sent_embed2)/(norm(edge2_embed)*norm(sent_embed2))

    if similar1 > similar2:
        status="More"
    else:
        status="Less"

    MoreOrLess.append(status)

In [491]:
# check accuracy
count = 0

for line, res in zip(RICA_lines, MoreOrLess):
    ground = line[2]
    
    if ground == res:
        count +=1
        
count/len(RICA_lines)

0.8650793650793651

In [510]:
# discard edges that do not contain the object name
MoreOrLess=[]
for i in range(len(RICA_lines)):
    # find predict result for C1
    idx1s_ = I_C1[i]
    
    status=False
    for idx1 in idx1s_:
        line_HasEmbed= lines_HasProperty[idx1]
        subject_=line_HasEmbed[1]
        
        object_name = RICA_lines[i][0]
        
        # whether discard
        if object_name in subject_:
            status = True
            break
            
    # if no satisfied one, choose the first one.
    if status == True:
        idx1_predict=idx1 
    else:
        idx1_predict= idx1s_[0]
        
    # find predict result for C2
    idx2s_ = I_C2[i]
    
    status=False
    for idx2 in idx2s_:
        line_HasEmbed= lines_HasProperty[idx2]
        subject_=line_HasEmbed[1]
        
        object_name = RICA_lines[i][1]
        
        if object_name in subject_:
            status = True
            break
            
    if status ==True:
        idx2_predict=idx2 
    else:
        idx2_predict= idx2s_[0]
        
    # find the edges embed and name
    # embed
    edge1_embed=model_HasProperty_embed[idx1]
    edge2_embed=model_HasProperty_embed[idx2]
    
    # name
    edge1_name= lines_HasProperty[idx1_predict][1:4]
    edge2_name= lines_HasProperty[idx2_predict][1:4]
    
    # C1 and C2 sent embed
    C1_sent_embed=C1_sents_embedding[i]
    C2_sent_embed=C2_sents_embedding[i]
    
    # claculate similarity
    similar1=dot(edge1_embed, C1_sent_embed)/(norm(edge1_embed)*norm(C1_sent_embed))
    similar2=dot(edge2_embed, C2_sent_embed)/(norm(edge2_embed)*norm(C2_sent_embed))

    if similar1 > similar2:
        status="More"
    else:
        status="Less"

    MoreOrLess.append(status)

In [512]:
# check accuracy
count = 0

for line, res in zip(RICA_lines, MoreOrLess):
    ground = line[2]
    
    if ground == res:
        count +=1
        
count/len(RICA_lines)

0.8412698412698413

In [460]:
with open(sample_1k_lines, "r") as f:
    sample_lines=[]
    for line in f:
        sample_lines.append(line.strip())

In [516]:
for line in random.sample(sample_lines, 10):
    obj1,obj2, aspect=elements_extraction(line)
    print(f"Sentence: {line}")
    print(f"Extraction Result: {obj1}, {obj2}, {aspect}")
    print()

Sentence: bgciwx tries not to panic while jksxzurzdjmd does not, so bgciwx is more scared than jksxzurzdjmd
Extraction Result: bgciwx, jksxzurzdjmd, scared

Sentence: nxeftqopgsyn is a librarian, htqpn is not, so htqpn is not more likely to book book than nxeftqopgsyn
Extraction Result: htqpn, nxeftqopgsyn, likely to book book

Sentence: txytawms is a bear, xbulrzzhkn is a fly, so xbulrzzhkn is less likely to fish with it's paws than txytawms
Extraction Result: xbulrzzhkn, txytawms, likely to fish with it's paws

Sentence: wnedzjjwgqb grabs vbb's arms, so wnedzjjwgqb is more forceful than vbb
Extraction Result: wnedzjjwgqb, vbb, forceful

Sentence: kctedtc buys euhwmkfkau's son, so euhwmkfkau is less likely to giving than kctedtc
Extraction Result: euhwmkfkau, kctedtc, likely to giving

Sentence: kau takes it to a veterinarian more than pusqbpmkbyt, so pusqbpmkbyt is less worried than kau
Extraction Result: pusqbpmkbyt, kau, worried

Sentence: ilhybfsuk leaves in a huff more than wegdb