In [1]:
from nltk.corpus import wordnet as wn
import random, csv,time

import run_bert_relation_prediction as relation_prediction
import run_bert_link_prediction as link_prediction
import numpy as np

## Set folder

In [2]:
# Input files (paths are relative to the notebook's working directory).
# CSKG edge table, read with load_file() in the data-preparation section.
cskg_file = "./data/cskg/cskg_connected.tsv"
# Gold annotations; default `filename` of validation(), compared row-by-row with the predictions.
ground_truth="./data/cskg/ground_truth_100.tsv"
# Training triples; read to rank relations by how often they occur (column 1 of each row).
web_child = "./data/wc/train.tsv"

# Output files written by later cells.
cskg_relation_prediction = "./data/cskg/relation_prediction.tsv"  # relation-only predictions
cskg_link_prediction = "./data/cskg/link_prediction.tsv"  # head-synset predictions
cskg_prediction = "./data/cskg/prediction.tsv"  # combined head/relation/tail predictions
cskg_100 = "./data/cskg/cskg_100.tsv"  # columns 1..6 of the 100 sampled rows

## Data Preparation

In [3]:
def load_file(filename,encoding=None,errors=None):
    """Read a tab-separated text file into a header row and a list of data rows.

    Args:
        filename: path of the file to read.
        encoding: text encoding forwarded to ``open``.
        errors: decoding error policy forwarded to ``open``.

    Returns:
        ``(head, lines)`` where ``head`` is the first line split on tabs and
        ``lines`` is a list with one list of fields per remaining non-blank line.
    """
    with open(filename,"r",encoding=encoding, errors=errors) as f:
        # Remove only the line terminator: previously the last header field
        # kept its "\n", so it never compared equal to the real column name.
        head = f.readline().rstrip("\r\n").split("\t")

        lines = []
        for line_str in f:
            # strip() also removes leading/trailing tabs, so empty columns at
            # either end of a row are dropped (unchanged from the original).
            line_str = line_str.strip()
            if line_str:
                lines.append(line_str.split("\t"))

    return head,lines

def line_filter(lines, label_="/r/HasProperty"):
    """Select the rows that are usable as disambiguation samples.

    A row is kept when its relation id (column 2) equals ``label_``, its head
    label (column 4) has at least two WordNet synsets and its tail label
    (column 5) has at least one. Labels are looked up with spaces replaced by
    underscores.

    Args:
        lines: rows as lists of fields.
        label_: relation id a row must carry to be considered.

    Returns:
        The kept rows, in their original order.
    """
    res = []

    for line in lines:
        # Rows with another relation are skipped before any other column is
        # touched, so they need no label columns and cost no WordNet lookup.
        if line[2] != label_:
            continue

        # subject: 2+ candidates
        if len(wn.synsets(line[4].replace(" ","_"))) < 2:
            continue

        # object: 1+ candidates
        if len(wn.synsets(line[5].replace(" ","_"))) >= 1:
            res.append(line)

    return res

def write_file(filename,lines,encoding=None,errors=None):
    """Write rows to ``filename`` as tab-separated values.

    Args:
        filename: path of the file to create or overwrite.
        lines: iterable of rows, each an iterable of fields.
        encoding: text encoding forwarded to ``open``.
        errors: encoding error policy forwarded to ``open``.
    """
    # newline="" lets csv.writer control the row terminator itself; without it
    # the "\r\n" it emits is translated again on Windows, giving blank rows.
    with open(filename,"w",encoding=encoding, errors=errors, newline="") as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(lines)

In [4]:
# Load the CSKG table: `head` is the first line split on tabs, `lines` holds
# one list of fields for every following non-blank line.
head, lines=load_file(cskg_file,encoding="utf8")

In [5]:
# Show one row as an example. line_filter reads column 2 as the relation id and
# columns 4 and 5 as the head and tail labels.
lines[15014]

['/c/en/about_ten_percent_of_people-/r/HasProperty-/c/en/left_handed-0000',
 '/c/en/about_ten_percent_of_people',
 '/r/HasProperty',
 '/c/en/left_handed',
 'about ten percent of people',
 'left handed',
 'has property',
 '',
 'CN',
 '[[About ten percent of people]] are [[left-handed]]']

In [None]:
# Keep only rows that pass line_filter with its default relation
# ("/r/HasProperty"): head label with 2+ WordNet synsets, tail label with 1+.
filter_lines = line_filter(lines)

# Display the first kept row together with the number of kept rows.
filter_lines[0], len(filter_lines)

In [None]:
# Draw 100 rows from filter_lines with a fixed seed so that a fresh kernel
# reproduces the same sample; a private Random instance leaves the global
# random state untouched.
# NOTE(review): ground_truth_100.tsv is presumably aligned row-by-row with the
# sample saved to cskg_100 - confirm before re-running and overwriting it.
SAMPLE_SEED = 0
lines_100=random.Random(SAMPLE_SEED).sample(filter_lines,100)

# generate triple: map entity ids and relation ids to the text fed to the models
ent2text={}
relation2text={}

for line in lines_100:
    # Each entity is described by the definition of the first synset returned
    # for its label.
    ent2text[line[1]]=wn.synsets(line[4].replace(" ","_"))[0].definition()
    ent2text[line[3]]=wn.synsets(line[5].replace(" ","_"))[0].definition()
    relation2text[line[2]] = line[6]

In [None]:
# Save columns 1..6 of every sampled row as a TSV file.
rows_to_save = [row[1:7] for row in lines_100]
write_file(cskg_100, rows_to_save)

In [None]:
# Read the saved sample back: every non-blank line becomes a list of its
# tab-separated fields. This replaces the full table held in `lines`.
with open(cskg_100,"r") as f:
    stripped_rows = (raw.strip() for raw in f)
    lines = [text.split("\t") for text in stripped_rows if text]

In [None]:
# Show the first reloaded row (six fields: columns 1..6 of the original row).
lines[0]

## Relation Classification

In [None]:
# basic parameter
# NOTE(review): restricting the visible GPU this way presumably only works if
# CUDA has not been initialised yet in this kernel - confirm.
relation_prediction.os.environ['CUDA_VISIBLE_DEVICES']= '2'
data_path = "./data/wc"
data_saved_path = "./output_wc_result2"
bert_model="bert-base-cased"
task_name="kg"
max_seq_length=100
eval_batch_size=32

device = relation_prediction.torch.device("cuda" if relation_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors = {"kg": relation_prediction.KGProcessor,}
processor = processors[task_name]()

# obtain label (read once; the original cell repeated this call further down)
label_list = processor.get_relations(data_path)
num_labels = len(label_list)

# obtain entity list
entity_list = processor.get_entities(data_path)

# load model
tokenizer = relation_prediction.BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model = relation_prediction.BertForSequenceClassification.from_pretrained(data_saved_path, num_labels=num_labels)
location_detail=model.to(device)
# The model is only used for inference: switch to eval mode so dropout cannot
# make the predictions vary from run to run.
model.eval()

# One example per sampled row: the head and tail definitions as a text pair.
examples = []
set_type="test"
for (i, line) in enumerate(lines_100):
    guid = "%s-%s" % (set_type, i)
    text_a = ent2text[line[1]]
    text_b = ent2text[line[3]]
    # Placeholder label: the model is later called with labels=None, so the
    # value does not influence the prediction.
    label = "wn:quality.n.1"

    examples.append(
        relation_prediction.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

examples[0]

In [None]:
# load data
eval_features = relation_prediction.convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)

all_input_ids = relation_prediction.torch.tensor([f.input_ids for f in eval_features], dtype=relation_prediction.torch.long)
all_input_mask = relation_prediction.torch.tensor([f.input_mask for f in eval_features], dtype=relation_prediction.torch.long)
all_segment_ids = relation_prediction.torch.tensor([f.segment_ids for f in eval_features], dtype=relation_prediction.torch.long)
all_label_ids = relation_prediction.torch.tensor([f.label_id for f in eval_features], dtype=relation_prediction.torch.long)

eval_data = relation_prediction.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# do predict (sequential sampler keeps predictions in the order of `examples`)
eval_sampler = relation_prediction.SequentialSampler(eval_data)
eval_dataloader = relation_prediction.DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Inference only: eval mode disables dropout so the scores are deterministic.
model.eval()

batch_logits = []
for input_ids, input_mask, segment_ids, label_ids in relation_prediction.tqdm(eval_dataloader, desc="Testing"):
    # label_ids travel with the dataset but are not needed: labels=None below.
    with relation_prediction.torch.no_grad():
        logits = model(input_ids.to(device), segment_ids.to(device), input_mask.to(device), labels=None)
    batch_logits.append(logits.detach().cpu().numpy())

# One row of relation scores per example.
preds = np.concatenate(batch_logits, axis=0)
print(preds, preds.shape)

all_label_ids = all_label_ids.numpy()

# Index (into label_list) of the highest-scoring relation for every example.
result = list(preds.argmax(axis=1))

In [None]:
# Write one row per sample: columns 1..5 of the sampled row with the relation
# (second field) replaced by the predicted label, followed by a "0" flag.
with open(cskg_relation_prediction, "w") as out_file:
    for sample, best_idx in zip(lines_100, result):
        fields = sample[1:6]
        fields[1] = label_list[best_idx]
        out_file.write("\t".join(fields + ["0"]) + "\n")

# Node Classification

In [None]:
# basic parameters
data_path = "./data/wn"
data_saved_path = "./output_wn_result"
bert_model="bert-base-cased"
task_name="kg"
max_seq_length=50
eval_batch_size=1500

device = link_prediction.torch.device("cuda" if link_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors = {"kg": link_prediction.KGProcessor,}
processor = processors[task_name]()

# obtain label: binary classifier, "1" marks a plausible triple
label_list = ["0","1"]
num_labels = len(label_list)

# obtain entity list
entity_list = processor.get_entities(data_path)

# load model (this replaces the relation model held in `model`)
tokenizer = link_prediction.BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model = link_prediction.BertForSequenceClassification.from_pretrained(data_saved_path, num_labels=num_labels)
location_detail=model.to(device)
# The model is only used for inference: switch to eval mode so dropout cannot
# make the predictions vary from run to run.
model.eval()

In [None]:
def _build_examples(line):
    """Build one link-prediction example per candidate head synset of a row.

    The relation text and the tail definition come from ``relation2text`` and
    ``ent2text``; the head text is replaced in turn by the definition of every
    WordNet synset found for the head label (``line[4]``).

    Args:
        line: a sampled row (ids in columns 1..3, head label in column 4).

    Returns:
        ``(examples, text_a_candits)``: the examples and the candidate synsets,
        index-aligned.
    """
    text_b = relation2text[line[2]]
    text_c = ent2text[line[3]]

    # corrupt head
    text_a_candits = wn.synsets(line[4].replace(" ","_"))

    # Only the first candidate is labelled "1"; the caller uses the label id of
    # the first example to pick which logit column to rank the candidates by.
    examples = [
        link_prediction.InputExample(guid=None, text_a=item.definition(), text_b=text_b, text_c=text_c,
                                     label="1" if idx == 0 else "0")
        for idx, item in enumerate(text_a_candits)
    ]

    return examples, text_a_candits

In [None]:
res = []   # best candidate synset per sampled row
temp = []  # index of that candidate within the row's candidate list

# Inference only: eval mode disables dropout so the scores are deterministic.
model.eval()

# Iterate over the sampled rows (the original referenced the undefined name
# `line_100`, which raised NameError).
for line in link_prediction.tqdm(lines_100, desc="Testing"):
    examples, candits=_build_examples(line)

    # load data
    eval_features = link_prediction.convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)

    all_input_ids = link_prediction.torch.tensor([f.input_ids for f in eval_features], dtype=link_prediction.torch.long)
    all_input_mask = link_prediction.torch.tensor([f.input_mask for f in eval_features], dtype=link_prediction.torch.long)
    all_segment_ids = link_prediction.torch.tensor([f.segment_ids for f in eval_features], dtype=link_prediction.torch.long)
    all_label_ids = link_prediction.torch.tensor([f.label_id for f in eval_features], dtype=link_prediction.torch.long)

    eval_data = link_prediction.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    # do predict (sequential sampler keeps scores aligned with `candits`)
    eval_sampler = link_prediction.SequentialSampler(eval_data)
    eval_dataloader = link_prediction.DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    batch_logits = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        with link_prediction.torch.no_grad():
            logits = model(input_ids.to(device), segment_ids.to(device), input_mask.to(device), labels=None)
        batch_logits.append(logits.detach().cpu().numpy())

    preds = np.concatenate(batch_logits, axis=0)

    # Rank candidates by the logit column of the first example's label ("1").
    all_label_ids = all_label_ids.numpy()
    rel_values = preds[:, all_label_ids[0]]

    idx_ = int(np.argmax(rel_values))
    temp.append(idx_)
    res.append(candits[idx_])

In [None]:
# Write one row per sample: columns 1..5 of the sampled row with the head id
# replaced by the predicted synset, then its definition and a "0" flag.
with open(cskg_link_prediction, "w") as out_file:
    for sample, synset in zip(lines_100, res):
        fields = sample[1:6]
        fields[0] = "wn:" + synset.name()
        fields += [synset.definition(), "0"]
        out_file.write("\t".join(fields) + "\n")

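After the prediction, open the output files and inspect whether the system was right: each row ends with a flag column that is written as 0, so change it to 1 for every correct prediction. The accuracy cells below average that last column.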

## Check Accuracy

In [None]:
# Accuracy of the relation predictions: the mean of the integer flag stored in
# the last tab-separated column of every row.
with open(cskg_relation_prediction, "r") as f:
    flags = [int(row.split("\t")[-1]) for row in f]

accuracy_count = sum(flags)
total = len(flags)

print("Accuracy: {}".format(accuracy_count/total))

In [None]:
# Accuracy of the link predictions: the mean of the integer flag stored in the
# last tab-separated column of every row.
with open(cskg_link_prediction, "r") as f:
    flags = [int(row.split("\t")[-1]) for row in f]

accuracy_count = sum(flags)
total = len(flags)

print("Accuracy: {}".format(accuracy_count/total))

# Combine Node & Relation Classification

In [4]:
# load_file treats the first line as a header; here it is put back as an
# ordinary row (presumably the training file has no header row - confirm).
head,lines=load_file(web_child)
# Drop any line terminator left on the fields of that first row so it looks
# like every other row.
lines.append([field.rstrip("\r\n") for field in head])
len(lines)

80000

In [6]:
# basic parameter
# NOTE(review): restricting the visible GPU this way presumably only works if
# CUDA has not been initialised yet in this kernel - confirm.
relation_prediction.os.environ['CUDA_VISIBLE_DEVICES']= '2'
data_path_relation = "./data/wc"
data_saved_path_relation = "./output_wc_result2"
bert_model="bert-base-cased"
task_name="kg"
max_seq_length_relation=100
eval_batch_size_relation=32

device_relation = relation_prediction.torch.device("cuda" if relation_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors_relation = {"kg": relation_prediction.KGProcessor,}
processor_relation = processors_relation[task_name]()

# obtain label
label_list_relation = processor_relation.get_relations(data_path_relation)
num_labels_relation = len(label_list_relation)

# obtain entity list
entity_list_relation = processor_relation.get_entities(data_path_relation)

# load model (the tokenizer is shared by the relation and the link model)
tokenizer = relation_prediction.BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model_relation = relation_prediction.BertForSequenceClassification.from_pretrained(data_saved_path_relation, 
                                                                                   num_labels=num_labels_relation)
location_detail_1=model_relation.to(device_relation)
# Inference only: eval mode disables dropout, so repeated runs with the same
# alpha give the same scores.
model_relation.eval()


# NOTE(review): the stand-alone link-prediction cell reads "./data/wn" here -
# confirm that "./data/wc" is intended.
data_path_link = "./data/wc"
data_saved_path_link = "./output_wn_result"
max_seq_length_link=50
eval_batch_size_link=1500

device_link = link_prediction.torch.device("cuda" if link_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors_link = {"kg": link_prediction.KGProcessor,}
processor_link = processors_link[task_name]()

# obtain label: binary classifier, "1" marks a plausible triple
label_list_link = ["0","1"]
num_labels_link = len(label_list_link)

# obtain entity list
entity_list_link = processor_link.get_entities(data_path_link)

# load model
model_link = link_prediction.BertForSequenceClassification.from_pretrained(data_saved_path_link, num_labels=num_labels_link)
location_detail=model_link.to(device_link)
# Inference only, same reason as for the relation model.
model_link.eval()

In [7]:
# Relation prior: count how often each relation (column 1) occurs in `lines`,
# rank the relations from most to least frequent, and score each as 1/rank.
fre = {}
for item in lines:
    fre[item[1]] = fre.get(item[1], 0) + 1

fre_sort = sorted(fre.items(), key=lambda pair: pair[1], reverse=True)
fre = {name: 1/rank for rank, (name, _count) in enumerate(fre_sort, start=1)}

# Prior scores in the same order as the relation classifier's labels.
fre_relation_list = [fre[name] for name in label_list_relation]

In [8]:
# Read the saved sample: every non-blank line becomes a list of its
# tab-separated fields. This replaces the training rows held in `lines`.
with open(cskg_100,"r") as f:
    stripped_rows = (raw.strip() for raw in f)
    lines = [text.split("\t") for text in stripped_rows if text]

In [9]:
def corrupt_examples_(ent_candits,text_a=None,text_b=None,text_c=None, pos="head"):
    """Build one link-prediction example per candidate synset.

    For every candidate, its definition replaces the head text (``pos ==
    "head"``) or otherwise the tail text; the other two texts stay as given.
    The first candidate is labelled "1", all later ones "0".

    Returns:
        The examples, in the order of ``ent_candits``.
    """
    replace_head = pos == "head"
    examples = []

    for position, candidate in enumerate(ent_candits):
        definition = candidate.definition()
        if replace_head:
            text_a = definition
        else:
            text_c = definition

        label = "0" if position else "1"
        examples.append(
            link_prediction.InputExample(guid=None,text_a=text_a, text_b=text_b, text_c=text_c, label=label))

    return examples

def combine_examples_generation(line):
    """Build candidates and model inputs for one (head label, tail label) pair.

    The head label is ``line[3]`` and the tail label ``line[4]``. The first
    WordNet synset of each label supplies the default definition; the relation
    text is fixed to "quality".

    Returns:
        ``(ent_1_candits, ent_2_candits, ent_1_examples, ent_2_examples,
        relation_examples)``: the candidate synsets of head and tail, the link
        examples obtained by substituting every head / tail candidate, and a
        one-element list with the relation-classification example.

    Raises:
        IndexError: if either label has no WordNet synset.
    """
    # Fixed relation used for every sample.
    relation_id = "wn:quality.n.1"
    relation_text = "quality"

    # corrupt head & tail
    ent_1_candits = wn.synsets(line[3].replace(" ","_"))
    ent_2_candits = wn.synsets(line[4].replace(" ","_"))

    # First synset of each label provides the text of the non-substituted side.
    ent_1_text = ent_1_candits[0].definition()
    ent_2_text = ent_2_candits[0].definition()

    ent_1_examples = corrupt_examples_(ent_1_candits,text_a=ent_1_text,text_b=relation_text,text_c=ent_2_text, pos="head")
    ent_2_examples = corrupt_examples_(ent_2_candits,text_a=ent_1_text,text_b=relation_text,text_c=ent_2_text, pos="tail")

    relation_examples = [relation_prediction.InputExample(guid=None, text_a=ent_1_text, text_b=ent_2_text, label=relation_id)]

    return ent_1_candits,ent_2_candits,ent_1_examples,ent_2_examples,relation_examples

def _synset_from_id(identifier):
    """Look up the synset named by the text after the first ':' of an id such as 'wn:orange.n.01'."""
    return wn.synset(identifier.split(":")[1])


def _matches_gold(predicted_id, gold_field):
    """Return True if the predicted synset equals any of the '|'-separated gold ids."""
    predicted = _synset_from_id(predicted_id)

    for item in gold_field.split("|"):
        if not item:
            continue
        try:
            gold = _synset_from_id(item)
        except Exception:
            # Malformed gold id: report it and move on. The original fell
            # through and compared against the previous (or an undefined) synset.
            print("split error: ",item)
            continue

        if predicted == gold:
            return True

    return False


def validation(predict_lines, filename=ground_truth):
    """Score predictions against the gold file, row by row.

    Args:
        predict_lines: rows whose first three fields are the predicted head,
            relation and tail ids; an empty field is not scored.
        filename: tab-separated gold file with head, relation and tail in the
            first three columns, each a '|'-separated list of accepted ids.

    Returns:
        ``(head_accuracy, tail_accuracy, relation_accuracy)`` - note the order -
        each divided by ``len(predict_lines)``.
    """
    accuracy_left = 0
    accuracy_relation = 0
    accuracy_right = 0

    with open(filename, "r") as f:
        for p_line,g_line_str in zip(predict_lines,f):
            # Split before trimming so that an empty first or last column
            # cannot shift the remaining columns.
            g_line=[field.strip() for field in g_line_str.rstrip("\r\n").split("\t")]

            # check head
            if p_line[0] and _matches_gold(p_line[0], g_line[0]):
                accuracy_left+=1

            # check tail
            if p_line[2] and _matches_gold(p_line[2], g_line[2]):
                accuracy_right+=1

            # check relation
            if p_line[1] and _matches_gold(p_line[1], g_line[1]):
                accuracy_relation+=1

    return accuracy_left/len(predict_lines),accuracy_right/len(predict_lines),accuracy_relation/len(predict_lines)

def inverse_fre(length):
    """Return the list [1/1, 1/2, ..., 1/length] (empty when length < 1)."""
    return [1/rank for rank in range(1, length+1)]

In [10]:
def _stack_logits(module, model, device, features, batch_size):
    """Run ``model`` over ``features`` in order and return all logits as one array.

    ``module`` is the script module (relation_prediction or link_prediction)
    whose torch / TensorDataset / SequentialSampler / DataLoader are used.
    """
    torch = module.torch
    dataset = module.TensorDataset(
        torch.tensor([f.input_ids for f in features], dtype=torch.long),
        torch.tensor([f.input_mask for f in features], dtype=torch.long),
        torch.tensor([f.segment_ids for f in features], dtype=torch.long),
    )
    # Sequential sampling keeps the rows aligned with `features`.
    loader = module.DataLoader(dataset, sampler=module.SequentialSampler(dataset), batch_size=batch_size)

    # Inference only: eval mode disables dropout so scores are deterministic.
    model.eval()

    batches = []
    for input_ids, input_mask, segment_ids in loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), segment_ids.to(device), input_mask.to(device), labels=None)
        batches.append(logits.detach().cpu().numpy())

    return np.concatenate(batches, axis=0)


def _pick_synset(examples, candidates, alpha):
    """Score the candidate synsets with the link model and return the id of the best one."""
    features = link_prediction.convert_examples_to_features(examples, label_list_link,
                                                            max_seq_length_link, tokenizer)
    logits = _stack_logits(link_prediction, model_link, device_link, features, eval_batch_size_link)

    # Rank by the logit column of the first example's label ("1").
    scores = logits[:, features[0].label_id]
    # Prior: candidate k (1-based, in the order WordNet returned them) gets 1/k.
    prior = np.array(inverse_fre(len(scores)))
    blended = scores*(1-alpha)+alpha*prior

    return "wn:"+candidates[int(np.argmax(blended))].name()


def alpha_test(alpha=0):
    """Predict head synset, relation and tail synset for every row of ``lines``.

    Each model score is blended with a prior as ``score*(1-alpha) + prior*alpha``:
    ``alpha=0`` uses the model alone, ``alpha=1`` the prior alone. The relation
    prior is ``fre_relation_list``; the synset prior is ``inverse_fre``.

    Args:
        alpha: weight of the prior.

    Returns:
        One ``[head_id, relation_id, tail_id, head_label, tail_label]`` list per
        row, where the labels are columns 3 and 4 of the row.
    """
    predict_lines = []
    input_lines = [[None,None,None, line[3],line[4]]for line in lines]
    for line in link_prediction.tqdm(input_lines, desc="Testing"):
        head_candits,tail_candits,head_examples,tail_examples,relation_examples=combine_examples_generation(line)

        # predict relation (a single example, so the first row holds its scores)
        relation_features = relation_prediction.convert_examples_to_features(relation_examples, label_list_relation,
                                                                             max_seq_length_relation, tokenizer)
        relation_logits = _stack_logits(relation_prediction, model_relation, device_relation,
                                        relation_features, eval_batch_size_relation)
        relation_scores = relation_logits[0]*(1-alpha)+np.array(fre_relation_list)*alpha
        predict_relation = label_list_relation[int(np.argmax(relation_scores))]

        # head prediction
        predict_head = _pick_synset(head_examples, head_candits, alpha)

        # tail prediction
        predict_tail = _pick_synset(tail_examples, tail_candits, alpha)

        predict_lines.append([predict_head,predict_relation,predict_tail,line[3],line[4]])
    # Presumably gives the progress bar time to finish drawing - confirm.
    time.sleep(1)
    # The original wrapped this in a second print(), which also printed "None".
    print("alpha value: {}".format(alpha))
    return predict_lines

In [295]:
# Predict every sampled row with the default alpha=0.
predict_lines = alpha_test()

Testing: 100%|██████████| 100/100 [00:04<00:00, 21.08it/s]


alpha value: 0
None


In [296]:
# Save the combined predictions, one tab-separated row per sample.
with open(cskg_prediction, "w") as out_file:
    for fields in predict_lines:
        out_file.write("\t".join(fields) + "\n")

In [297]:
# Accuracy tuple in the order returned by validation(): head, tail, relation.
validation(predict_lines)

(0.39, 0.37, 0.78)

In [327]:
# Sweep the prior weight from 0.0 to 1.0 in steps of 0.1 and print the
# (head, tail, relation) accuracies for each setting.
for step in range(11):
    alpha = step/10
    predict_lines = alpha_test(alpha=alpha)
    print(validation(predict_lines))

Testing: 100%|██████████| 100/100 [00:04<00:00, 20.84it/s]


alpha value: 0.0
None


Testing:   2%|▏         | 2/100 [00:00<00:07, 12.52it/s]

(0.4, 0.3, 0.78)


Testing: 100%|██████████| 100/100 [00:04<00:00, 20.97it/s]
Testing:   2%|▏         | 2/100 [00:00<00:07, 12.36it/s]

alpha value: 0.1
None
(0.37, 0.34, 0.78)


Testing: 100%|██████████| 100/100 [00:05<00:00, 19.87it/s]
Testing:   2%|▏         | 2/100 [00:00<00:07, 12.51it/s]

alpha value: 0.2
None
(0.52, 0.36, 0.78)


Testing: 100%|██████████| 100/100 [00:06<00:00, 15.76it/s]
Testing:   1%|          | 1/100 [00:00<00:11,  8.97it/s]

alpha value: 0.3
None
(0.53, 0.4, 0.74)


Testing: 100%|██████████| 100/100 [00:06<00:00, 16.47it/s]
Testing:   2%|▏         | 2/100 [00:00<00:06, 14.32it/s]

alpha value: 0.4
None
(0.64, 0.43, 0.79)


Testing: 100%|██████████| 100/100 [00:05<00:00, 17.47it/s]
Testing:   2%|▏         | 2/100 [00:00<00:07, 13.05it/s]

alpha value: 0.5
None
(0.65, 0.49, 0.77)


Testing: 100%|██████████| 100/100 [00:05<00:00, 18.63it/s]
Testing:   1%|          | 1/100 [00:00<00:11,  8.97it/s]

alpha value: 0.6
None
(0.7, 0.49, 0.74)


Testing: 100%|██████████| 100/100 [00:05<00:00, 16.74it/s]
Testing:   2%|▏         | 2/100 [00:00<00:07, 13.85it/s]

alpha value: 0.7
None
(0.7, 0.5, 0.77)


Testing: 100%|██████████| 100/100 [00:04<00:00, 20.05it/s]
Testing:   2%|▏         | 2/100 [00:00<00:08, 12.10it/s]

alpha value: 0.8
None
(0.7, 0.49, 0.78)


Testing: 100%|██████████| 100/100 [00:05<00:00, 17.60it/s]
Testing:   2%|▏         | 2/100 [00:00<00:06, 14.04it/s]

alpha value: 0.9
None
(0.7, 0.49, 0.81)


Testing: 100%|██████████| 100/100 [00:05<00:00, 19.24it/s]


alpha value: 1.0
None
(0.7, 0.49, 0.65)


In [28]:
# Walk one hand-written example through the combined pipeline and print the
# scores behind each of the three predictions.
line=[None,None,None,"mandarin orange","orange"]
alpha = 0

head_candits,tail_candits,head_examples,tail_examples,relation_examples=combine_examples_generation(line)


def _debug_logits(module, model, device, features, batch_size):
    """Run ``model`` over ``features`` in order and return all logits as one array."""
    torch = module.torch
    dataset = module.TensorDataset(
        torch.tensor([f.input_ids for f in features], dtype=torch.long),
        torch.tensor([f.input_mask for f in features], dtype=torch.long),
        torch.tensor([f.segment_ids for f in features], dtype=torch.long),
    )
    loader = module.DataLoader(dataset, sampler=module.SequentialSampler(dataset), batch_size=batch_size)

    # Inference only: eval mode disables dropout so scores are deterministic.
    model.eval()

    batches = []
    for input_ids, input_mask, segment_ids in loader:
        with torch.no_grad():
            logits = model(input_ids.to(device), segment_ids.to(device), input_mask.to(device), labels=None)
        batches.append(logits.detach().cpu().numpy())

    return np.concatenate(batches, axis=0)


# predict relation (a single example, so the first row holds its scores)
eval_features = relation_prediction.convert_examples_to_features(relation_examples, label_list_relation, 
                                                                 max_seq_length_relation, tokenizer)
preds = _debug_logits(relation_prediction, model_relation, device_relation, eval_features, eval_batch_size_relation)

# As in the original cell, no frequency prior is added to the relation scores.
rel_values = preds[0]*(1-alpha)
predict_relation = label_list_relation[int(np.argmax(rel_values))]

print(rel_values)

# head prediction, then tail prediction
predicted_ids = []
for examples, candidates in ((head_examples, head_candits), (tail_examples, tail_candits)):
    eval_features = link_prediction.convert_examples_to_features(examples, label_list_link, 
                                                                 max_seq_length_link, tokenizer)
    preds = _debug_logits(link_prediction, model_link, device_link, eval_features, eval_batch_size_link)

    # Rank by the logit column of the first example's label ("1"), blended
    # with the 1/position prior.
    rel_values = preds[:, eval_features[0].label_id]
    fre_list = np.array(inverse_fre(len(rel_values)))
    rel_values = rel_values*(1-alpha)+alpha*fre_list
    print(rel_values)

    predicted_ids.append("wn:"+candidates[int(np.argmax(rel_values))].name())

predict_head, predict_tail = predicted_ids

[-0.24500589  0.38136557 -0.31796426 15.985915   -1.5484285   1.1072263
  1.4132776   0.4049753  -0.32776767  0.1809184  -4.7585135  -0.62195414
  0.503321    0.27086622 -1.1540339  -0.781703   -0.24960321 -1.2921071
  0.43842727 -1.130424   -1.29697    -0.5626882  -0.7122333   0.529878
 -1.2836498  -1.662291   -0.19502428]
[3.42065907 3.47564721]
[ 3.09846354  0.01112205  3.34114218 -1.28109705 -4.65054989  0.80405474]


In [13]:
# Predicted head synset, relation and tail synset for the example above.
predict_head,predict_relation,predict_tail

('wn:mandarin.n.05', 'wn:color.n.1', 'wn:orange.n.01')