# Data Pre for RICA

In [1]:
import pickle,os, faiss, spacy, math
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
from collections import defaultdict
import pandas as pd

In [2]:
# input:
cskg_connected_file="../kg-bert/data/cskg/cskg_connected.tsv"
RICA_file="./RICA/RICA_material_KnowledgeTable.csv"
cskg_embed_file="cskg_model_embed.pickle"

In [3]:
# design dependency rule
class token_text():
    def __init__(self, text):
        self.text=text

def subitem_depCheck(subs, require={}):
    output_token=token_text("")
    for sub in subs:
        rel=sub.dep_
        
        if rel in require:
            output_token=sub
            break
            
    return output_token

def walk_tree(node, depth, depths={}):
    depths[node] = depth
    if node.n_lefts + node.n_rights > 0:
        return [walk_tree(child, depth + 1,depths=depths) for child in node.children]

def find_end_tree(root,res=[], left_=False, right_=False):
    # find the end token of dependency True.
    # left_ means find left hand side children
    # right_ means find right hand side children
    if root.n_lefts*left_ + root.n_rights*right_ > 0:
        if left_ and right_:
            for child in root.children:
                find_end_tree(child, res=res, left_=left_,right_=right_)
        elif left_:
            for child in root.lefts:
                find_end_tree(child, res=res, left_=left_,right_=right_)
        elif right_:
            for child in root.rights:
                find_end_tree(child, res=res, left_=left_,right_=right_)
    else:
        res.append(root)
    
    return res

def elements_extraction(sent):
    # new information extraction rule
    if sent[-1]!=".":
        sent+="."
    
    doc=nlp(sent)
    sent=list(doc.sents)[0]
    part_roots=set()
    
    spanLength_list=[]
    for token in sent:
        token_index=token.i
        for child in token.children:
            child_index = child.i
            
            if child.dep_=="punct":
                continue
            spanLength_list.append((abs(token_index-child_index),(token_index, child_index)))
            
    spanLength_list.sort(reverse=True)
    part_pos=set()
    
    depths={}
    [walk_tree(sent.root, 0, depths=depths) for sent in doc.sents]
    #print(depths)
    for span_length, tokens in spanLength_list:
        token1_index, token2_index=tokens
        token1=sent[token1_index]
        token2=sent[token2_index]
        #print(token1, token2, token2.dep_, part_pos)
        if len(list(token2.children))<1 or depths[token2]>2:
            continue
        
        if token2.dep_ in {"conj","ccomp","advcl","dep"}:
            part_pos.add(token1_index)
            part_pos.add(token2_index)
        
        if len(part_pos)>=3:
            break
            
            
    part_pos=sorted(list(part_pos))
    
    # final compaare part
    # print(part_pos)
    compare_root=sent[part_pos[2]]
    
    # find reasoning part
    reasoning_first_root=sent[part_pos[0]]
    reasoning_second_root=sent[part_pos[1]]
    
    # find object1 and object2
    leftEnd_compare_tokens=find_end_tree(compare_root,left_=True,right_=False, res=[])
    leftEnd_first_tokens=find_end_tree(reasoning_first_root,left_=True,right_=False, res=[])
    leftEnd_second_tokens=find_end_tree(reasoning_second_root,left_=True,right_=False, res=[])
    
    leftEnd_first_tokens_text=[_.text for _ in leftEnd_first_tokens]
    leftEnd_second_tokens_text=[_.text for _ in leftEnd_second_tokens]
    
    object1, object2=token_text(""),token_text("")
    
    if len(leftEnd_first_tokens)==1:
        object2=leftEnd_first_tokens[0]
        
    else:
        for token in leftEnd_first_tokens:
            if token.pos_ == "NOUN":
                object2 = token
                
            if token.dep_ in {"nsubj","acomp"}:
                object2=token
                
            if list(token.ancestors)[0].text == "is":
                object2 = token
            
    if len(leftEnd_second_tokens)==1:
        object1=leftEnd_second_tokens[0]
        
    else:
        for token in leftEnd_second_tokens:
            if token.i < part_pos[0]:
                continue
                
            if token.pos_ == "NOUN":
                object1 = token
            
            if token.dep_ in {"nsubj","acomp"}:
                object1=token
                
            if list(token.ancestors)[0].text == "is":
                object1 = token
            
    # use than find one object candidates
    for token in sent:
        if token.text == "than":
            than_token=token
            
            than_child = list(than_token.children)
            than_anc= list(than_token.ancestors)
            
            if than_child:
                object_temp =than_child[0]
                
            else:
                object_temp=than_anc[0]
                
    objects_text=[object1.text, object2.text]
    
    # replace empty object result with object candidates found by than
    if object_temp.text in objects_text:
        pass
    else:
        if objects_text[0]=="":
            object1=object_temp
        else:
            object2=object_temp
            
    # find the property of each objects
    rightEnd_first_tokens=find_end_tree(reasoning_first_root,left_=False,right_=True, res=[])
    rightEnd_second_tokens=find_end_tree(reasoning_second_root,left_=False,right_=True, res=[])
    
    max_dep_obj2=0
    
    for token in rightEnd_first_tokens:
        idx = token.i
        if token.i > part_pos[1]:
            continue
            
        else:
            max_dep_obj2=max(max_dep_obj2, idx)
            
    max_dep_obj1=0
    for token in rightEnd_second_tokens:
        idx = token.i
        if token.i > part_pos[2]:
            continue
            
        else:
            max_dep_obj1=max(max_dep_obj1, idx)
            
    obj2_property=sent[part_pos[0]+1:max_dep_obj2+1].text
    obj1_property=sent[part_pos[1]+1:max_dep_obj1+1].text
    
    # find compafre aspect
    aspect_token=subitem_depCheck(compare_root.rights, require={"acomp","attr"})
    
    if aspect_token.text:
        aspect_index=aspect_token.i
        for left in aspect_token.lefts:
            rel=left.dep_
            if rel not in ["advmod","amod"] and left.i < aspect_index:
                aspect_index= left.i

        aspect_span= sent[aspect_index:]
        aspect_text=aspect_span.text.split(" than ")[0]
    else:
        aspect_span = sent[part_pos[2]+1:]
        aspect_text=aspect_span.text.split(" than ")[0]
        
    # find ground truth is more or less:
    more=0
    less=0
    reverse= 1
    MoreOrLess=0
    for token in sent[part_pos[2]:]:
        if token.text == "more":
            more +=1
            
        elif token.text == "less":
            less += 1
            
        elif token.text == "not" or token.text == "no":
            reverse=-1
        
    if "er " in sent[part_pos[2]:].text:
        more += 1
    
    MoreOrLess = reverse*(more-less)>0
    return object1,object2,obj1_property.split(",")[0],obj2_property.strip(",").split(",")[0],aspect_text, MoreOrLess

def embed_normalize(model_embeddings):
    # normalize embedding
    model_embeddings=np.array([S/(math.sqrt(sum(S**2))) for S in tqdm(model_embeddings)])
    return model_embeddings

In [4]:
# load model
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('nli-bert-large')

In [5]:
# cskg lines
cskg_lines=[]
with open(cskg_connected_file,"r") as f:
    head = f.readline().strip().split("\t")
    
    for item in f:
        line = item.strip().split("\t")
        cskg_lines.append(line)

In [6]:
# relation distinct label set
rels=set()
for line in cskg_lines:
    rel_id=line[2].split("/")[-1].split(":")[-1]
    rels.add(rel_id)

In [7]:
# load cskg embedding
# cskg embedding is generated by sentence transformer bert model on hypo sentence "subject label relation lebale object label"
isfile_=os.path.isfile(cskg_embed_file)
if isfile_:
    with open(cskg_embed_file, 'rb') as handle:
        cskg_embed = pickle.load(handle)
    
    cskg_lines=[]
    
    with open(cskg_connected_file,"r") as f:
        head = f.readline().strip().split("\t")
        
        for item in f:
            line = item.strip().split("\t")
            relation_id=line[2]

            cskg_lines.append(line)
else:
    model = SentenceTransformer('nli-bert-large')
    cskg_lines=[]
    
    with open(cskg_connected_file,"r") as f:
        head = f.readline().strip().split("\t")
        
        for item in f:
            line = item.strip().split("\t")
            relation_id=line[2]

            if relation_id == "/r/HasProperty":
                cskg_lines.append(line)
            
    # build sentence
    sents=[]
    for line in lines_HasProperty:
        sent=f"{line[4]} {line[6]} {line[5]}"
        sents.append(sent)
        
    cskg_embed=model.encode(sents)
    
    # store file into desk
    with open(cskg_embed_file, 'wb') as handle:
        pickle.dump(cskg_embed, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# normalize cskg embedding
cskg_embed=embed_normalize(cskg_embed)

100%|██████████| 6003237/6003237 [21:49<00:00, 4584.86it/s]


In [32]:
# load rica data
with open(RICA_file,"r") as f:
    head = f.readline().strip().split(",")
    RICA_lines=[]
    
    for item in f:
        line = item.strip().split(",")
        RICA_lines.append(line)

In [33]:
# check accuracy
count =0
C1s=[]
C2s=[]
extracted_info=[]
for line in RICA_lines:
    temp1=line[1].replace(" ","_")
    temp2=line[0].replace(" ","_")
    item1=f"Item_{temp1}"
    item2=f"Item_{temp2}"
    sent= f"{item1} is {line[1]}, {item2} is {line[0]}, so {item1} is {line[2].lower()} {line[3]} than {item2}"
    obj1,obj2, obj1_property, obj2_property, aspect, truth=elements_extraction(sent)
    
    extracted_info.append([obj1.text,obj2.text,obj1_property,obj2_property,aspect])
    if obj1_property== line[0] and obj2_property== line[1] and aspect==line[3]:
        count +=1
    else:
        print(obj1_property)
        print(obj2_property)
        print(aspect)
        print(sent)
        print()
    # build two sentence:
    C1_sentence=f"{obj1_property} is {aspect}"
    C2_sentence=f"{obj2_property} is {aspect}"
    
    C1s.append(C1_sentence)
    C2s.append(C2_sentence)

glass
gas
liquid
Item_gas is gas, Item_glass is glass, so Item_gas is more contain liquid than Item_glass

glass
gold
reflective
Item_gold is gold, Item_glass is glass, so Item_gold is more very reflective than Item_glass

glass
plastic
reflective
Item_plastic is plastic, Item_glass is glass, so Item_plastic is more very reflective than Item_glass

glass
sand
reflective
Item_sand is sand, Item_glass is glass, so Item_sand is more very reflective than Item_glass

glass
metal
reflective
Item_metal is metal, Item_glass is glass, so Item_metal is more very reflective than Item_glass

glass
silver
reflective
Item_silver is silver, Item_glass is glass, so Item_silver is more very reflective than Item_glass

paper
blood
lightweight
Item_cow_blood is cow blood, Item_paper is paper, so Item_cow_blood is more lightweight than Item_paper



In [34]:
# transfer sents to embedding

C1_sents_embedding=model.encode(C1s,device=3)
C2_sents_embedding=model.encode(C2s,device=3)

C1_sents_embedding=embed_normalize(C1_sents_embedding)
C2_sents_embedding=embed_normalize(C2_sents_embedding)

100%|██████████| 126/126 [00:00<00:00, 5071.08it/s]
100%|██████████| 126/126 [00:00<00:00, 5060.30it/s]


In [35]:
# use faiss to find neareast
d= cskg_embed.shape[1]
index = faiss.IndexFlatL2(d)
index.add(cskg_embed)

In [36]:
# find the closest edges

k = 1
D_C1, I_C1 = index.search(C1_sents_embedding, k)

k = 1
D_C2, I_C2 = index.search(C2_sents_embedding, k)

In [37]:
for idx1_, idx2_,sent_embed1, sent_embed2,rica_line in zip(I_C1,I_C2,C1_sents_embedding,C2_sents_embedding,extracted_info):
    idx1=idx1_[0]
    idx2=idx2_[0]
    line1=cskg_lines[idx1]
    line2=cskg_lines[idx2]
    edge1_embed=cskg_embed[idx1]
    edge2_embed=cskg_embed[idx2]
    
    rel1=line1[2]
    rel2=line2[2]
    
    similar1=dot(edge1_embed, sent_embed1)/(norm(edge1_embed)*norm(sent_embed1))
    similar2=dot(edge2_embed, sent_embed2)/(norm(edge2_embed)*norm(sent_embed2))
    # item1 line
    rica_line.append([rel1,similar1])
    # item2 line
    rica_line.append([rel2,similar2])

In [38]:
max_sim=0
min_sim=1
for line in extracted_info:
    _, sim1=line[-2]
    _, sim2=line[-1]
    
    max_sim=max(max_sim,sim1)
    max_sim=max(max_sim,sim2)
    
    min_sim=min(min_sim,sim1)
    min_sim=min(min_sim,sim2)

In [39]:
min_sim, max_sim

(0.7802252, 0.98956645)

In [40]:
with open('extracted_info.pickle', 'wb') as handle:
    pickle.dump(extracted_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# rescall sim

scale=1/(max_sim-min_sim)
for i in range(len(extracted_info)):
    line=extracted_info[i]
    
    extracted_info[i][-1][-1]=(line[-1][-1]-min_sim)*scale
    extracted_info[i][-2][-1]=(line[-2][-1]-min_sim)*scale

In [42]:
# write data for psl:
psl_data=dict()
material_property=dict()
for rel in rels:
    psl_data[rel]=dict()
    material_property[rel]=defaultdict(dict)

compare_aspects=set()
items_=set()
for i in range(len(extracted_info)):
    line=extracted_info[i]
    item1_name,item2_name,item1_property,item2_property,compare_aspect,item1_info,item2_info=line
    
    item1_rel=item1_info[0].split("/")[-1].split(":")[-1]
    item2_rel=item2_info[0].split("/")[-1].split(":")[-1]
    sim1=item1_info[-1]
    sim2=item2_info[-1]
    compare_aspects.add(compare_aspect)
    items_.add(item1_name)
    items_.add(item2_name)
    
    psl_data[item1_rel][item1_name]=item1_property
    psl_data[item2_rel][item2_name]=item2_property
    material_property[item1_rel][item1_property][compare_aspect]=sim1
    material_property[item2_rel][item2_property][compare_aspect]=sim2
    
# write item and material relation obs file
with open(f"./RICA/material_knowledge/ItemMaterial_obs.txt","w") as f:
    item_material=set()
    for rel in psl_data:
        psl_rel_data=psl_data[rel]
        
        for item, material in psl_rel_data.items():
            item_material.add((item,material))
            
    f.write("\n".join(["\t".join(_) for _ in item_material]))
            
# write material and compare_aspect 1 or 0:
for rel in material_property:
    if not material_property[rel]:
        continue
    with open(f"./RICA/material_knowledge/{rel}_obs.txt","w") as f:
        material_data=material_property[rel]
        for material in material_data:
            property_dict=material_data[material]
            for aspect, sim in property_dict.items():
                f.write(f"{material}\t{aspect}\t{sim}\n")
                

# write predict result:
with open(f"./RICA/material_knowledge/more_targets.txt","w") as f1, open(f"./RICA/material_knowledge/more_truth.txt","w") as f2:
    for item1 in items_:
        for item2 in items_:
            if item1==item2:
                continue
            for aspect in compare_aspects:
                f1.write(f"{item1}\t{item2}\t{aspect}\t0\n")
                f2.write(f"{item1}\t{item2}\t{aspect}\t1\n")

In [43]:
# write PSL rules
# "10: Material(I1,M1) & HasProperty(M1, T) & Material(I2,M2) & !HasProperty(M2, T) & I1 != I2 -> More(I1, I2, T) ^2"
# "10: More(I1, I2, T) & I1 != I2 -> !More(I2, I1, T) ^2"

Rules=[]

for rel in rels:
    if not material_property[rel]:
        continue
    rule1=f"10: Material(I1,M1) & {rel}(M1, T) & Material(I2,M2) & !{rel}(M2, T) & I1 != I2 -> More(I1, I2, T) ^2"
    Rules.append(rule1)
Rules.append("1000: More(I1, I2, T) & I1 != I2 -> !More(I2, I1, T) ^2")
Rules.append("1000: More(I1, I2, T) + More(I2, I1, T) = 1")

In [44]:
# write psl rules into desk
with open("rules.txt", "w") as f:
    for rule in Rules:
        f.write(rule)
        f.write("\n")

In [45]:
# dump data into desk
with open('rel.pickle', 'wb') as handle:
    pickle.dump([rel for rel in rels if material_property[rel]], handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('rules.pickle', 'wb') as handle:
    pickle.dump(Rules, handle, protocol=pickle.HIGHEST_PROTOCOL)

Data Prepareation steps are finished  
Run MaterialKnowledge_1k.py to infere prediction  
bash: python MaterialKnowledge_1k.py

In [46]:
# The data preparation is finished
# the file PSL python file should be runed
# After running psl check the psl inference result 

with open("inferred-predicates/MORE.txt", "r") as f:
    psl_result=[]
    for line in f:
        line = line.strip().split("\t")
        line[-1]=round(eval(line[-1]))
        psl_result.append(line)

In [47]:
# buid dict of infere result
df=pd.DataFrame(psl_result, columns=['item1',"item2","aspect","result"])

In [48]:
df

Unnamed: 0,item1,item2,aspect,result
0,Item_silver,Item_plastic_but_plastic_with_rubber_band,crumple,1
1,Item_steel,Item_milk,liquid,1
2,Item_ferment_milk,Item_metal_and_not_plastic,durable,1
3,Item_silicon,Item_alcohol,opaque,1
4,Item_carbon,Item_rum,contain liquid,1
...,...,...,...,...
26671,Item_wood,Item_water,crumple,1
26672,Item_gas,Item_ceramic,fragile,0
26673,Item_crystal,Item_ferment_milk,reflective,0
26674,Item_sugar,Item_cow_blood,flat,0


In [50]:
tp = 0
tn = 0
fn = 0
fp = 0
for line1, line2 in zip(extracted_info, RICA_lines):
    item1, item2, property1, property2, aspect, _, _ = line1
    ground = line2[2]
    
    predict=df.loc[(df['item1'] == item1) & (df['item2'] == item2)& (df['aspect'] == aspect)]["result"].values[0]
    
    if ground=="More":
        ground=1
    else:
        ground=0
        
    if predict==ground:
        if ground==1:
            tp +=1
            
        elif ground==0:
            tn +=1
            
    else:
        if ground==1:
            fp +=1
        elif ground ==0:
            fn+=1

In [51]:
confusion matix
print("true positive:",tp)
print("true negative:",tn)
print("false positive:",fp)
print("false negative:",fn)

true positive: 87
true negative: 6
false positive: 26
false negative: 7
