In [1]:
from transformers import BertModel, BertTokenizer, PretrainedConfig, PreTrainedModel
import torch
import os
from datasets import load_dataset,load_from_disk
import tqdm
import pandas as pd
import re
import numpy as np
import csv
import json
import time
from zhconv import convert

SPECIAL_TOKENS = ['[unused0]', '[unused1]', '[unused2]']



  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
dataset_path =  '/home/yaozhiming/NLP/tasks/zero-shot.jsonl'

#读取测试数据
with open(dataset_path, 'r') as validation:
    val_datas = validation.readlines()

 

In [4]:
#at base
at_base = pd.read_csv('/home/yaozhiming/NLP/data/at-base.tsv',sep='\t',names= ['title', 'qid', 'freq'])
at_base['title'] = at_base['title'].fillna("")
at_base['title'] = at_base['title'].apply(to_simpified_chinese)
at_base

Unnamed: 0,title,qid,freq
0,台湾,Q22502,89974
1,台湾,Q865,398
2,台湾,Q137816,110
3,台湾,Q245107,90
4,台湾,Q32081,62
...,...,...,...
2360061,阿波罗登月飞行器,Q46611,1
2360062,阿波罗号,Q430728,1
2360063,高球场,Q1048525,1
2360064,阿波罗祀祝节,Q1813885,1


In [5]:
#knowledge data
kb_path = "/home/yaozhiming/NLP/data/kb.jsonl"
with open(kb_path, 'r') as f:
    l = f.readlines()


kn_data = []
for line in tqdm.tqdm(l):
    data = json.loads(line.strip("\n"))
    kn_data.append(data)

100%|██████████| 1106303/1106303 [03:19<00:00, 5556.20it/s]


In [6]:
#knowledge base
def to_simpified_chinese(x):
    x = re.sub(' +', '', x)
    return convert(x, 'zh-cn')


kb_df = pd.DataFrame(kn_data)
kb_df['title'] = kb_df['title'].fillna("")
kb_df['title'] = kb_df['title'].apply(to_simpified_chinese)
kb_df

Unnamed: 0,qid,title,text,vector
0,Q1,宇宙,宇宙是所有时间、空间与其包含的内容物所构成的统一体；它包含了行星、恆星、星系、星系际空间、次...,"[[0.3392995894, 0.1768409908, -0.0198271517, -..."
1,Q2,地球,地球是太阳系中由内及外的第三颗行星，距离太阳149 597 890.7公里/1天文单位，是宇...,"[[0.2487931252, 0.1893451214, -0.0190741345, -..."
2,Q3,生命,生命是一种特征，物质存在的一种活跃形式。目前对于生命的定义在学术界还无共识，较流行的定义是一...,"[[0.3636945784, 0.2380280644, 0.0107873324, -0..."
3,Q4,死亡,死亡（），是相对于生命体存在（存活）的生命现象，指维持一个生物存活的所有的永久终止。能够导致...,"[[0.153353855, 0.1682929695, -0.1472070962, 0...."
4,Q5,人,人在生物学上通常指智人（），偶尔也泛指人属的史前物种，为灵长目、人科的一部分，人属成员大致都...,"[[0.2332355231, 0.1824601293, 0.0065428372, 0...."
...,...,...,...,...
1106298,Q105978805,CHALLENGER,《CHALLENGER》是日本男子组合JO1的第3张单曲，将于2021年4月28日由发行。 ...,"[[0.1082611158, 0.5486280918, -0.1542419493, 0..."
1106299,Q105979347,The_Renaissance_(Super_Junior专辑),《The Renaissance》是韩国演唱团体Super Junior的第十张正规专辑，于...,"[[0.2598350942, 0.4277872443, -0.2423282266, 0..."
1106300,Q105981916,华亭县_(隋朝),华亭县，中国曾经设置的一个县，在今甘肃省华亭市。 Section::::沿革. Secti...,"[[-0.0635062754, 0.1128137559, -0.3678101003, ..."
1106301,Q105983449,过江新娘,《过江新娘》（）是新加坡新传媒私人有限公司制作的关于越南新娘的电视剧。此剧由及徐彬领衔主演，...,"[[0.1781165004, 0.2649869323, -0.3329152167, 0..."


In [7]:
class DualBertConfig(PretrainedConfig):
    model_type = 'dual_bert'
    def __init__(self, **kwargs):
        self.bert_model_name = kwargs.pop('bert_model_name', 'bert-base-chinese')
        self.tokenizer_len = kwargs.pop('tokenizer_len', 21128)
        super().__init__(**kwargs)


class DualBert(PreTrainedModel):
    config_class = DualBertConfig
    def __init__(self, config):
        super().__init__(config)
        self.bert1 = BertModel.from_pretrained(config.bert_model_name)
        self.bert2 = BertModel.from_pretrained(config.bert_model_name)
        self.bert1.resize_token_embeddings(config.tokenizer_len)
        self.bert2.resize_token_embeddings(config.tokenizer_len)

    def forward(self, input_text, candidate_text):
        batch_cls1 = self.bert1(**input_text).last_hidden_state[:, 0, :]
        batch_cls2 = self.bert2(**candidate_text).last_hidden_state[:, 0, :]
        similarity_scores = batch_cls1.mm(batch_cls2.T)
        return similarity_scores




In [31]:
#读取bm25的dataframe
with open("/home/yaozhiming/NLP/data/candidates.json", 'r') as file:
    l  = json.load(file)
bm25_df = pd.DataFrame(l)
bm25_df

Unnamed: 0,id,candidate
0,hansel-eval-zs-0,"[Q97300711, Q1579354, Q1579265, Q1579270, Q157..."
1,hansel-eval-zs-1,"[Q11090848, Q105985227, Q1579238, Q1579397, Q1..."
2,hansel-eval-zs-2,"[Q105985227, Q1579238, Q1579406, Q1579397, Q15..."
3,hansel-eval-zs-3,"[Q97958559, Q105985227, Q1579367, Q1579270, Q1..."
4,hansel-eval-zs-4,"[Q105985227, Q1579238, Q1579406, Q1579397, Q15..."
...,...,...
4710,hansel-eval-zs-4710,"[Q9063180, Q105985227, Q1579354, Q1579265, Q15..."
4711,hansel-eval-zs-4711,"[Q10552244, Q16502, Q47064, Q1579367, Q1579285..."
4712,hansel-eval-zs-4712,"[Q24885548, Q637776, Q105985227, Q1579354, Q15..."
4713,hansel-eval-zs-4713,"[Q105985227, Q1579238, Q1579406, Q1579397, Q15..."


In [32]:
#test BM25 recall rate

pretrained_path = "/home/yaozhiming/NLP/results/"
tokenizer = BertTokenizer.from_pretrained(pretrained_path)   
config = DualBertConfig(bert_model_name = "/home/yaozhiming/NLP/bert-base-chinese/",tokenizer_len=len(tokenizer))
model = DualBert.from_pretrained(pretrained_path,config=config)
cnt = zero_qid = correct = 0



def find_match_index(qid, data_base):
    start = 0
    end = len(data_base)-1
    while start <= end:
        id = int(qid[1:])
        mid = start + (end-start)//2
        search_id = int(data_base.iloc[mid]['qid'][1:])
        if search_id == id:
            return mid
        elif search_id < id:
            start = mid + 1
        else:
            end = mid - 1
    return -1

def get_score(QID):
    index = find_match_index(QID, kb_df)
    candidate_last_hidden_state = torch.tensor(kb_df.iloc[index]['vector'])
    similarity_scores = torch.mm(input_last_hidden_state, candidate_last_hidden_state.T)
    return similarity_scores

for i in tqdm.tqdm(val_datas):
    t0 = time.time()
    val_data = json.loads(i)
    
    #at_base 候选
    mention = to_simpified_chinese(val_data['mention'])
    contains_keyword_base = np.vectorize(lambda x: mention in x)(at_base['title'])
    QID_candidates_base = at_base[contains_keyword_base]['qid']
    
    #knowledge base候选
    contains_keyword_kb = np.vectorize(lambda x: mention in x)(kb_df['title'])
    QID_candidates_kb = kb_df[contains_keyword_kb]['qid']
    
    QID_candidates = list(QID_candidates_base) 
    
    
    
    
    #BM25候选
    id = val_data['id']
    contains_keyword_bm25 = np.vectorize(lambda x: id == x)(bm25_df['id'])
    QID_candidates_bm25 = bm25_df[contains_keyword_bm25]['candidate']
    if len(QID_candidates_bm25) != 0:
        QID_candidates_bm25 = list(QID_candidates_bm25)[0]
    else:
        QID_candidates_bm25 = []
    
    #候选QID为at_base候选，bm25候选，kb候选的并集
    QID_candidates = list(QID_candidates_base) + list(QID_candidates_bm25) + list(QID_candidates_kb)
    QID_candidates = list(set(QID_candidates))
    
    gt_QID = val_data['gold_id']
    
    #无候选QID
    if len(QID_candidates) == 0:
        zero_qid += 1
        # print("QID 0")
        continue
        # QID_candidates = list(kb_df['qid'])
    
    score_lists = []
    
     #计算mention text的embedding
    context_left = val_data['text'][:val_data['start']]
    context_right = val_data['text'][val_data['end']:]
    input_text = tokenizer.cls_token + context_left + SPECIAL_TOKENS[0] + val_data['mention'] + SPECIAL_TOKENS[1] + context_right + tokenizer.sep_token
    input_encodings = tokenizer(input_text, return_tensors='pt', add_special_tokens=False, padding=True, max_length=128, truncation=True)
    with torch.no_grad():
        input_outputs = model.bert1(**input_encodings) 
        input_last_hidden_state = input_outputs.last_hidden_state[:, 0, :]

    #计算相似度
    for j in range(len(QID_candidates)):
        QID = QID_candidates[j]
        similarity_scores = get_score(QID)
        score_lists.append(similarity_scores)
    
    #预测QID
    ans_QID = QID_candidates[score_lists.index(max(score_lists))]
    if ans_QID == gt_QID:
        correct += 1
    cnt += 1
print(cnt, correct, zero_qid)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 21129. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 21129. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
100%|██████████| 4715/4715 [40:26<00:00,  1.94it/s]  

4715 3077 0





In [34]:
acc = correct/cnt
acc

0.6525980911983033

In [None]:
#用于提前存储
import torch.nn as nn
import os
file = "/home/yaozhiming/NLP/data/kb.jsonl"

if os.path.exists(file):
    os.remove(file)
    
tokenizer = BertTokenizer.from_pretrained(pretrained_path)   

config = DualBertConfig(bert_model_name = "/home/yaozhiming/NLP/bert-base-chinese/",tokenizer_len=len(tokenizer))
model = DualBert.from_pretrained(pretrained_path,config=config)
model_b2 = model.bert2

for index in tqdm.tqdm(range(len(kb_df))):
    
    candidate_text = tokenizer.cls_token + str(kb_df.iloc[index]['title']) + SPECIAL_TOKENS[2] + str(kb_df.iloc[index]['text']) + tokenizer.sep_token
    candidate_encodings = tokenizer(candidate_text, return_tensors='pt', add_special_tokens=False, padding=True, max_length=128, truncation=True)
    
    model.eval()
    with torch.no_grad():
        candidate_outputs = model_b2(**candidate_encodings) 
        candidate_last_hidden_state = candidate_outputs.last_hidden_state[:, 0, :]
    knowledge = kb_df.iloc[index]
    knowledge['vector'] = candidate_last_hidden_state.numpy()
    knowledge_str = knowledge.to_json(orient = 'columns', force_ascii=False)
    with open(file, 'a') as fout:
        fout.write(knowledge_str)
        fout.write("\n")