In [95]:
from nltk.corpus import wordnet as wn
import random, csv

import run_bert_relation_prediction as relation_prediction
import run_bert_link_prediction as link_prediction

## Set file paths

In [220]:
# Input: tab-separated CSKG edge file whose first line is a header (read by load_file).
cskg_file = "./data/cskg/cskg_connected.tsv"

# Outputs: the sampled subset and the two prediction files written further down.
cskg_100 = "./data/cskg/cskg_100.tsv"
cskg_link_prediction = "./data/cskg/link_prediction.tsv"
cskg_relation_prediction = "./data/cskg/relation_prediction.tsv"

## Data Preparation

In [96]:
def load_file(filename,encoding=None,errors=None):
    """Read a tab-separated file whose first line is a header.

    Args:
        filename: path of the file to read.
        encoding: text encoding passed to ``open``.
        errors: decoding error policy passed to ``open``.

    Returns:
        ``(head, lines)``: ``head`` is the list of header fields and ``lines``
        is a list of rows, each a list of string fields. Lines that contain
        only whitespace are skipped.
    """
    def split_fields(raw):
        # Remove only the line terminator. A full strip() would also eat
        # leading/trailing tabs, silently dropping empty first or last columns
        # and shifting the positional column indices used downstream.
        return raw.rstrip("\r\n").split("\t")

    with open(filename,"r",encoding=encoding, errors=errors) as f:
        # The header goes through the same splitting so its last field does
        # not keep a trailing newline.
        head = split_fields(f.readline())
        lines = [split_fields(raw) for raw in f if raw.strip()]

    return head,lines

def line_filter(lines, label_="/r/HasProperty", min_subject_senses=2, min_object_senses=1):
    """Keep the rows with relation ``label_`` whose labels WordNet can resolve.

    A row is kept when column 2 (relation id) equals ``label_``, the subject
    label (column 4) has at least ``min_subject_senses`` WordNet synsets and
    the object label (column 5) has at least ``min_object_senses``. Spaces in
    a label are replaced by underscores before the WordNet lookup.

    Args:
        lines: iterable of rows, each an indexable sequence of string fields.
        label_: relation id a row must carry to be considered.
        min_subject_senses: minimum number of synsets for the subject label.
        min_object_senses: minimum number of synsets for the object label.

    Returns:
        List of the kept rows (the same row objects, in input order).
    """
    res = []

    for line in lines:
        # Check the relation first and read only the columns actually needed:
        # non-matching rows then cost no WordNet lookup and are not required
        # to carry the label columns at all.
        if line[2] != label_:
            continue

        subject_synsets = wn.synsets(line[4].replace(" ","_"))
        if len(subject_synsets) < min_subject_senses:
            continue

        object_synsets = wn.synsets(line[5].replace(" ","_"))
        if len(object_synsets) >= min_object_senses:
            res.append(line)

    return res

def write_file(filename,lines,encoding=None,errors=None):
    """Write ``lines`` to ``filename`` as tab-separated rows.

    Args:
        filename: path of the file to create or overwrite.
        lines: iterable of rows, each an iterable of fields.
        encoding: text encoding passed to ``open``.
        errors: encoding error policy passed to ``open``.

    Returns:
        None.
    """
    # newline="" leaves line endings entirely to the csv module, as its
    # documentation requires. Without it, platforms that translate "\n" on
    # write turn the writer's "\r\n" into "\r\r\n" and readers see blank rows.
    with open(filename,"w",encoding=encoding, errors=errors, newline="") as f:
        csv.writer(f, delimiter='\t').writerows(lines)

In [4]:
# Read the full CSKG file: `head` receives the header fields, `lines` the data rows.
head, lines=load_file(cskg_file,encoding="utf8")

In [5]:
# Display one raw row as an example of the column layout
# (15014 is a hand-picked index — presumably arbitrary).
lines[15014]

['/c/en/about_ten_percent_of_people-/r/HasProperty-/c/en/left_handed-0000',
 '/c/en/about_ten_percent_of_people',
 '/r/HasProperty',
 '/c/en/left_handed',
 'about ten percent of people',
 'left handed',
 'has property',
 '',
 'CN',
 '[[About ten percent of people]] are [[left-handed]]']

In [6]:
# Keep only the /r/HasProperty rows whose labels resolve in WordNet
# (line_filter: subject label needs >= 2 synsets, object label >= 1).
filter_lines = line_filter(lines)

# Show the first kept row together with the number of rows kept.
filter_lines[0], len(filter_lines)

(['/c/en/1000-/r/HasProperty-/c/en/one_thousand-0000',
  '/c/en/1000',
  '/r/HasProperty',
  '/c/en/one_thousand',
  '1000',
  'one thousand',
  'has property',
  '',
  'CN',
  '[[1000]] is [[one thousand]]'],
 1788)

In [7]:
# Draw the evaluation subset from filter_lines.
# A dedicated, seeded generator makes the draw repeatable across kernel
# restarts, so the manually annotated prediction files written later keep
# matching the rows sampled here. The size is capped at the number of
# available rows so a small filter result does not raise.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
lines_100 = random.Random(SAMPLE_SEED).sample(filter_lines, min(SAMPLE_SIZE, len(filter_lines)))

# Text fed to the models: each entity id maps to the definition of the first
# WordNet synset of its label, each relation id to its human-readable label.
ent2text={}
relation2text={}

for line in lines_100:
    ent2text[line[1]]=wn.synsets(line[4].replace(" ","_"))[0].definition()
    ent2text[line[3]]=wn.synsets(line[5].replace(" ","_"))[0].definition()
    relation2text[line[2]] = line[6]

<run_bert_relation_prediction.InputExample at 0x7fb23f15eda0>

In [103]:
# Persist columns 1-6 of every sampled row
# (subject id, relation id, object id, followed by their three labels).
write_file(cskg_100, [row[1:7] for row in lines_100])

In [104]:
# Read the sampled subset back: every non-blank line is stripped and split on
# tabs; no line is treated as a header.
# NOTE: this rebinds `lines`, replacing the full CSKG rows loaded earlier.
with open(cskg_100,"r") as f:
    stripped_rows = (raw.strip() for raw in f)
    lines = [row.split("\t") for row in stripped_rows if row]

In [221]:
# Display the first row of `lines` as read back from the subset file.
lines[0]

['Q190024', '/r/HasProperty', 'Q39338', 'mandarin orange', 'orange', 'color']

## Relation Classification

In [244]:
# basic parameters
# NOTE(review): the GPU index is hard-coded; adjust it for the local machine.
relation_prediction.os.environ['CUDA_VISIBLE_DEVICES']= '2'
data_path = "./data/wc"
data_saved_path = "./output_wc_result2"
bert_model="bert-base-cased"
task_name="kg"
max_seq_length=100
eval_batch_size=32

device = relation_prediction.torch.device("cuda" if relation_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors = {"kg": relation_prediction.KGProcessor,}
processor = processors[task_name]()

# relation labels the classifier chooses from (loaded once)
label_list = processor.get_relations(data_path)
num_labels = len(label_list)

# obtain entity list
entity_list = processor.get_entities(data_path)

# load tokenizer and fine-tuned model
tokenizer = relation_prediction.BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model = relation_prediction.BertForSequenceClassification.from_pretrained(data_saved_path, num_labels=num_labels)
location_detail=model.to(device)
# Inference only: put the model in evaluation mode so dropout is disabled and
# predictions are deterministic. eval() is idempotent, so this is safe even if
# from_pretrained already switched the mode.
model.eval()

# One example per sampled row: text_a / text_b are the WordNet definitions of
# subject and object. The label is only a placeholder (presumably it has to be
# a member of label_list for feature conversion — TODO confirm).
examples = []
set_type="test"
for (i, line) in enumerate(lines_100):
    guid = "%s-%s" % (set_type, i)
    text_a = ent2text[line[1]]
    text_b = ent2text[line[3]]
    label = "wn:quality.n.1"
    
    examples.append(
        relation_prediction.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    
examples[0]

<run_bert_relation_prediction.InputExample at 0x7fb237cce5f8>

In [245]:
# Convert the examples into feature tensors.
eval_features = relation_prediction.convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)

all_input_ids = relation_prediction.torch.tensor([f.input_ids for f in eval_features], dtype=relation_prediction.torch.long)
all_input_mask = relation_prediction.torch.tensor([f.input_mask for f in eval_features], dtype=relation_prediction.torch.long)
all_segment_ids = relation_prediction.torch.tensor([f.segment_ids for f in eval_features], dtype=relation_prediction.torch.long)
all_label_ids = relation_prediction.torch.tensor([f.label_id for f in eval_features], dtype=relation_prediction.torch.long)

eval_data = relation_prediction.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# do predict
eval_sampler = relation_prediction.SequentialSampler(eval_data)
eval_dataloader = relation_prediction.DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Evaluation mode disables dropout so the logits are deterministic
# (idempotent, harmless if the model is already in eval mode).
model.eval()

batch_logits = []

for input_ids, input_mask, segment_ids, label_ids in relation_prediction.tqdm(eval_dataloader, desc="Testing"):
    # label_ids is not needed: the model is called with labels=None.
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    
    with relation_prediction.torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    batch_logits.append(logits.detach().cpu().numpy())

# Concatenate once at the end instead of re-copying the growing array per batch.
preds = relation_prediction.np.concatenate(batch_logits, axis=0)
print(preds, preds.shape)

all_label_ids = all_label_ids.numpy()

# Index of the highest-scoring label for each example, in example order.
result = preds.argmax(axis=1).tolist()

Testing: 100%|██████████| 4/4 [00:00<00:00, 12.00it/s]

[[-6.53604269e-02  1.46461636e-01 -2.98043787e-02 ... -3.50986511e-01
  -1.88698745e+00  4.34495598e-01]
 [ 1.56642199e-02 -1.59057188e+00  1.33126907e+01 ... -2.62189841e+00
  -6.84538782e-01 -2.39190435e+00]
 [ 9.39943254e-01  4.68988836e-01 -1.89230680e+00 ... -6.65193176e+00
  -6.22472668e+00 -6.11609173e+00]
 ...
 [-1.15386076e-01 -2.62573361e-04 -1.00252843e+00 ... -9.57773924e-01
  -2.14523363e+00  2.09113359e-01]
 [-1.16387403e+00  8.94880295e-03 -2.47058153e+00 ... -5.04134369e+00
  -5.43410635e+00 -5.10229588e+00]
 [ 1.05213976e+00 -1.40085506e+00 -8.30978751e-01 ... -4.22682667e+00
  -3.15243816e+00 -6.16434956e+00]] (100, 27)





In [246]:
# Write one tab-separated row per sampled triple, with the predicted relation
# substituted for the original one. The trailing "0" is the flag column that
# the accuracy cell reads back as an integer. Mode "w" rewrites the whole file.
with open(cskg_relation_prediction, "w") as f:
    for row, predicted_idx in zip(lines_100, result):
        fields = row[1:6]
        fields[1] = label_list[predicted_idx]
        f.write("\t".join(fields + ["0"]) + "\n")

## Node Classification

In [261]:
# basic parameters
# NOTE: this cell rebinds data_path, processor, label_list, tokenizer and
# model, replacing the relation-classification objects defined earlier.
data_path = "./data/wn"
data_saved_path = "./output_wn_result"
bert_model="bert-base-cased"
task_name="kg"
max_seq_length=50
eval_batch_size=1500

device = link_prediction.torch.device("cuda" if link_prediction.torch.cuda.is_available() else "cpu")

# load processor
processors = {"kg": link_prediction.KGProcessor,}
processor = processors[task_name]()

# binary labels for the triple classifier
label_list = ["0","1"]
num_labels = len(label_list)

# obtain entity list
entity_list = processor.get_entities(data_path)

# load tokenizer and fine-tuned model
tokenizer = link_prediction.BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model = link_prediction.BertForSequenceClassification.from_pretrained(data_saved_path, num_labels=num_labels)
location_detail=model.to(device)
# Inference only: put the model in evaluation mode so dropout is disabled and
# predictions are deterministic. eval() is idempotent, so this is safe even if
# from_pretrained already switched the mode.
model.eval()

In [262]:
def _build_examples(line):
    """Build one link-prediction example per WordNet sense of the subject label.

    Args:
        line: a CSKG row; columns 2 (relation id), 3 (object id) and
            4 (subject label) are read.

    Returns:
        ``(examples, text_a_candits)``. ``text_a_candits`` is the list of
        synsets WordNet returns for the subject label, and ``examples[i]``
        combines the definition of ``text_a_candits[i]`` with the relation
        text and the object text, which are the same for every candidate.
        The first candidate is labelled "1", all others "0".
    """
    text_b = relation2text[line[2]]
    text_c = ent2text[line[3]]

    # candidate heads: every synset of the subject label
    text_a_candits = wn.synsets(line[4].replace(" ","_"))

    examples = [
        link_prediction.InputExample(
            guid=None,
            text_a=candidate.definition(),
            text_b=text_b,
            text_c=text_c,
            # Only the first sense is marked positive; the prediction cell
            # uses the label id of this first example to pick the logit
            # column it ranks candidates by.
            label="1" if idx == 0 else "0")
        for idx, candidate in enumerate(text_a_candits)
    ]

    return examples, text_a_candits

In [263]:
res = []   # predicted synset for each sampled row
temp = []  # position of that synset among the row's candidates

# Evaluation mode disables dropout so the logits are deterministic
# (idempotent, harmless if the model is already in eval mode).
model.eval()

for line in link_prediction.tqdm(lines_100, desc="Testing"):
    examples, candits=_build_examples(line)
    
    # load data
    eval_features = link_prediction.convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)

    all_input_ids = link_prediction.torch.tensor([f.input_ids for f in eval_features], dtype=link_prediction.torch.long)
    all_input_mask = link_prediction.torch.tensor([f.input_mask for f in eval_features], dtype=link_prediction.torch.long)
    all_segment_ids = link_prediction.torch.tensor([f.segment_ids for f in eval_features], dtype=link_prediction.torch.long)
    all_label_ids = link_prediction.torch.tensor([f.label_id for f in eval_features], dtype=link_prediction.torch.long)

    eval_data = link_prediction.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    
    # do predict
    eval_sampler = link_prediction.SequentialSampler(eval_data)
    eval_dataloader = link_prediction.DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    batch_logits = []

    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        # label_ids is not needed: the model is called with labels=None.
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with link_prediction.torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)

        batch_logits.append(logits.detach().cpu().numpy())

    # Concatenate once instead of re-copying the growing array per batch.
    preds = link_prediction.np.concatenate(batch_logits, axis=0)

    all_label_ids = all_label_ids.numpy()
    # Rank candidates by the logit column selected by the first example's
    # label id (the first example is the one _build_examples labels "1").
    rel_values = preds[:, all_label_ids[0]]

    # Highest-scoring candidate wins.
    idx_ = int(rel_values.argmax())
    temp.append(idx_)
    predict_output = candits[idx_]
    res.append(predict_output)

Testing: 100%|██████████| 100/100 [00:01<00:00, 63.49it/s]


In [264]:
# Write one tab-separated row per sampled triple: the subject id is replaced
# by "wn:" + the predicted synset name, then the synset definition and a
# trailing "0" flag column are appended. Mode "w" rewrites the whole file.
with open(cskg_link_prediction, "w") as f:
    for row, synset in zip(lines_100, res):
        fields = row[1:6]
        fields[0] = "wn:" + synset.name()
        fields += [synset.definition(), "0"]
        f.write("\t".join(fields) + "\n")

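After the prediction, open the two output files and inspect each row: where the system was right, change the trailing `0` in the last column to `1`. The accuracy cells below read that column.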

## Check Accuracy

In [269]:
# Accuracy of relation prediction: the last tab-separated column of every row
# is read as an integer flag and the flags are averaged.
with open(cskg_relation_prediction, "r") as f:
    # Skip blank lines (e.g. a trailing empty line left by a text editor)
    # instead of failing on int("").
    flags = [int(row.rsplit("\t", 1)[-1]) for row in f if row.strip()]

accuracy_count = sum(flags)
total = len(flags)

if total == 0:
    # Fail with a clear message rather than a bare ZeroDivisionError.
    raise ValueError("no annotated rows found in {}".format(cskg_relation_prediction))

print("Accuracy: {}".format(accuracy_count/total))

Accuracy: 0.8


In [277]:
# Accuracy of link prediction: the last tab-separated column of every row is
# read as an integer flag and the flags are averaged.
with open(cskg_link_prediction, "r") as f:
    # Skip blank lines (e.g. a trailing empty line left by a text editor)
    # instead of failing on int("").
    flags = [int(row.rsplit("\t", 1)[-1]) for row in f if row.strip()]

accuracy_count = sum(flags)
total = len(flags)

if total == 0:
    # Fail with a clear message rather than a bare ZeroDivisionError.
    raise ValueError("no annotated rows found in {}".format(cskg_link_prediction))

print("Accuracy: {}".format(accuracy_count/total))

Accuracy: 0.58
