In [10]:
from run_bert_relation_prediction import *

## Setup directories

In [11]:
# Runtime configuration shared by every cell below.

# Expose only GPU index 2 to CUDA for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Dataset directory, and the directory the model weights are loaded from.
data_path = "./data/wc"
data_saved_path = "./output_wc_result"

# Tokenizer vocabulary name and processor key.
bert_model = "bert-base-cased"
task_name = "kg"

# Feature-conversion and batching settings.
max_seq_length = 25
eval_batch_size = 32

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
# load processor
processors = {"kg": KGProcessor,}
processor = processors[task_name]()

# obtain label
label_list = processor.get_relations(data_path)
num_labels = len(label_list)

# obtain entity list
entity_list = processor.get_entities(data_path)

# load model
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(data_saved_path, num_labels=num_labels)
# nn.Module.to() returns the module itself; the extra binding is kept so any
# code that still refers to `location_detail` keeps working.
location_detail=model.to(device)
# Put the network in inference mode so dropout is disabled while scoring;
# torch.no_grad() alone does not change dropout behaviour.
model.eval()

## Train File Relation Prediction

In [4]:
def rank_accuracy(ranks, max_thresholds=500):
    """Return the cumulative fraction of ranks at or below each threshold.

    For every integer threshold ``t`` from 0 up to ``max(1, max(ranks))``
    (inclusive), the result maps ``t`` to ``count(rank <= t) / len(ranks)``.
    When ``ranks`` holds 0-based positions, key ``t`` is therefore the
    Hits@(t+1) score and key 0 is plain top-1 accuracy.

    Args:
        ranks: Sequence of numeric ranks, one per evaluated example.
        max_thresholds: Upper bound on the number of thresholds reported;
            thresholds ``0 .. max_thresholds - 1`` at most. Defaults to 500.

    Returns:
        dict mapping each threshold (int) to a float in ``[0, 1]``. Thresholds
        that no rank satisfies map to ``0.0``. An empty ``ranks`` yields an
        empty dict.
    """
    total = len(ranks)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return {}

    ordered = sorted(ranks)
    # Always report at least thresholds 0 and 1, even if every rank is 0.
    max_dep = max(1, ordered[-1])

    accuracy_dict = {}
    covered = 0  # number of entries in `ordered` that are <= current threshold
    threshold = 0
    while threshold <= max_dep and threshold < max_thresholds:
        # Ranks are sorted, so the pointer only ever moves forward: one pass
        # over the data in total instead of one pass per threshold.
        while covered < total and ordered[covered] <= threshold:
            covered += 1
        accuracy_dict[threshold] = covered / total
        threshold += 1
    return accuracy_dict

In [5]:
# load train data
eval_examples = processor.get_train_examples(data_path)
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# do predict
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Make sure dropout is disabled while scoring; no_grad() does not do that.
model.eval()

# Collect per-batch logits and join them once after the loop. Growing a single
# array with np.append re-copies everything accumulated so far on every batch.
batch_logits = []

# Labels are not needed for the forward pass, so they stay on the CPU.
for input_ids, input_mask, segment_ids, _label_ids in tqdm(eval_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    batch_logits.append(logits.detach().cpu().numpy())

# One row of logits per example, in dataset order (sequential sampler).
preds = np.concatenate(batch_logits, axis=0)
print(preds, preds.shape)

all_label_ids = all_label_ids.numpy()

Testing: 100%|██████████| 12500/12500 [7:45:34<00:00,  2.23s/it]  

[[ -2.418129     0.37876976   2.9576035  ...  -6.878661    -6.9027905
   -8.332947  ]
 [ -1.7731899   -0.17856944   4.0300937  ...  -7.6012      -8.458498
   -9.099413  ]
 [  0.0993562   -3.0691712    4.315088   ...  -8.305269    -8.870956
   -9.867961  ]
 ...
 [ -2.8487058    0.8943816    5.0485106  ...  -4.2121305   -4.3122816
   -6.001689  ]
 [ -1.1938187    3.0598202    6.8402953  ...  -9.60652     -8.834217
  -10.4396925 ]
 [ -1.72309      1.6641227   13.399329   ...  -7.3354406   -7.329503
   -8.942232  ]] (400000, 6331)





In [8]:
# For every example, find the 0-based position of the correct label in the
# list of labels ordered by descending logit.
ranks = []
filter_ranks = []
hits = [[] for _ in range(10)]
hits_filter = [[] for _ in range(10)]

for idx in range(len(preds)):
    scores = torch.tensor(preds[idx])
    # Label indices from highest to lowest score.
    order = torch.sort(scores, descending=True)[1].cpu().numpy()

    # Where the gold label sits in that ordering (0 means it was ranked first).
    gold_position = np.where(order == all_label_ids[idx])[0][0]
    ranks.append(gold_position)

In [9]:
# Fraction of examples whose rank is <= t, for each threshold t.
accuracy_dict = rank_accuracy(ranks)
# show the accuracy for the highest-ranked candidate (threshold 0)
accuracy_dict[0]

0.6257225

## Dev File Relation Prediction

In [5]:
# load dev data
eval_examples = processor.get_dev_examples(data_path)
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# do predict
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Make sure dropout is disabled while scoring; no_grad() does not do that.
model.eval()

# Collect per-batch logits and join them once after the loop. Growing a single
# array with np.append re-copies everything accumulated so far on every batch.
batch_logits = []

# Labels are not needed for the forward pass, so they stay on the CPU.
for input_ids, input_mask, segment_ids, _label_ids in tqdm(eval_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    batch_logits.append(logits.detach().cpu().numpy())

# One row of logits per example, in dataset order (sequential sampler).
preds = np.concatenate(batch_logits, axis=0)
print(preds, preds.shape)

all_label_ids = all_label_ids.numpy()

Testing: 100%|██████████| 1563/1563 [09:08<00:00,  2.85it/s]

[[-1.3663081  -0.35011637  4.215994   ... -5.3430133  -5.9850407
  -7.349557  ]
 [ 2.7053308  11.519518    6.2654366  ... -6.949276   -7.282262
  -8.332974  ]
 [-4.7281375   0.35013202  2.667777   ... -6.9289436  -6.215674
  -8.118363  ]
 ...
 [-2.7581234   1.8021257   3.6017327  ... -8.592715   -8.177821
  -9.730576  ]
 [-1.4960191   1.7245771   4.6012874  ... -6.1359034  -6.5996304
  -7.611351  ]
 [-3.303895   -0.8380291   2.221367   ... -7.6081243  -7.2373414
  -8.242722  ]] (50000, 6331)





In [6]:
# For every example, find the 0-based position of the correct label in the
# list of labels ordered by descending logit.
ranks = []
filter_ranks = []
hits = [[] for _ in range(10)]
hits_filter = [[] for _ in range(10)]

for idx in range(len(preds)):
    scores = torch.tensor(preds[idx])
    # Label indices from highest to lowest score.
    order = torch.sort(scores, descending=True)[1].cpu().numpy()

    # Where the gold label sits in that ordering (0 means it was ranked first).
    gold_position = np.where(order == all_label_ids[idx])[0][0]
    ranks.append(gold_position)

In [7]:
# Fraction of examples whose rank is <= t, for each threshold t.
accuracy_dict = rank_accuracy(ranks)
# show the accuracy for the highest-ranked candidate (threshold 0)
accuracy_dict[0]

0.36992

## Test File Relation Prediction

In [15]:
# load test data
eval_examples = processor.get_test_examples(data_path)
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# do predict
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

# Make sure dropout is disabled while scoring; no_grad() does not do that.
model.eval()

# Collect per-batch logits and join them once after the loop. Growing a single
# array with np.append re-copies everything accumulated so far on every batch.
batch_logits = []

# Labels are not needed for the forward pass, so they stay on the CPU.
for input_ids, input_mask, segment_ids, _label_ids in tqdm(eval_dataloader, desc="Testing"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    batch_logits.append(logits.detach().cpu().numpy())

# One row of logits per example, in dataset order (sequential sampler).
preds = np.concatenate(batch_logits, axis=0)
print(preds, preds.shape)

all_label_ids = all_label_ids.numpy()

Testing: 100%|██████████| 1563/1563 [06:42<00:00,  3.88it/s]

[[ 1.323254    5.1532063   4.985848   ... -6.9448853  -7.2415037
  -8.597521  ]
 [-0.45913562  4.6801825  12.855275   ... -6.348547   -6.891628
  -8.719812  ]
 [ 0.41970402  2.4043767   9.534689   ... -8.552229   -8.642421
  -9.782871  ]
 ...
 [-2.9259968   0.89694214  6.954977   ... -7.8598304  -8.264762
  -9.413802  ]
 [-1.40529    -1.5783433   3.7033048  ... -7.8304453  -8.069033
  -9.473032  ]
 [-2.7316413   5.6561646   4.4248576  ... -8.029417   -7.7321568
  -9.467815  ]] (50000, 6331)





In [43]:
# For every example, find the 0-based position of the correct label in the
# list of labels ordered by descending logit.
ranks = []
filter_ranks = []
hits = [[] for _ in range(10)]
hits_filter = [[] for _ in range(10)]

for idx in range(len(preds)):
    scores = torch.tensor(preds[idx])
    # Label indices from highest to lowest score.
    order = torch.sort(scores, descending=True)[1].cpu().numpy()

    # Where the gold label sits in that ordering (0 means it was ranked first).
    gold_position = np.where(order == all_label_ids[idx])[0][0]
    ranks.append(gold_position)

In [47]:
# Fraction of examples whose rank is <= t, for each threshold t.
accuracy_dict = rank_accuracy(ranks)
# show the accuracy for the highest-ranked candidate (threshold 0)
accuracy_dict[0]

0.37148