In [1]:
import pandas as pd
from rank_bm25 import BM25Okapi
# Knowledge-base tables. Both files are tab-separated with no header row
# (header=None), so every column is addressed by position.
#   wp_title_desc: column 0 is read as the entity id, column 1 is the text the
#                  table-name BM25 index is built from, column 2 is read as
#                  the description.
#   at_base:       column 0 is the text the alias BM25 index is built from,
#                  column 1 is read as the entity id of that alias.
wp_title_desc = pd.read_csv("dataset/KB/wp_title_desc_wd0315.tsv", sep="\t", header=None)
at_base = pd.read_csv("dataset/KB/at-base.tsv", sep="\t", header=None)

## Create & Save BM25 Model

In [2]:
import jieba
import pickle
import os

def create_bm25(corpus, type="word"):
    """Build a ``BM25Okapi`` index over ``corpus``.

    Every document is converted with ``str()`` before tokenization, so
    non-string cells (for example NaN values coming out of pandas) are handled
    the same way in all modes.

    Args:
        corpus: Iterable of documents.
        type: Tokenization mode.
            ``"word"``     - split each document on single spaces.
            ``"sentence"`` - segment each document with ``jieba.cut``.
            ``"none"``     - keep each document as a single token.

    Returns:
        The ``BM25Okapi`` instance built from the tokenized corpus.

    Raises:
        ValueError: If ``type`` is not one of the modes listed above.
    """
    tokenizers = {
        "word": lambda text: text.split(" "),
        "sentence": lambda text: list(jieba.cut(text)),
        "none": lambda text: [text],
    }
    # Reject unknown modes up front; otherwise the tokenized corpus would never
    # be assigned and the failure would surface as an obscure UnboundLocalError.
    if type not in tokenizers:
        raise ValueError(
            "Unknown tokenization type %r; expected one of %s"
            % (type, sorted(tokenizers))
        )
    tokenize = tokenizers[type]
    tokenized_corpus = [tokenize(str(doc)) for doc in corpus]
    return BM25Okapi(tokenized_corpus)

def create_save_bm25(corpus, model_name, type="word", save_path="models/"):
    """Build a BM25 model with ``create_bm25`` and pickle it to disk.

    The model is written to ``save_path + model_name + '.pkl'``; ``save_path``
    is used as a plain string prefix, so it should end with a path separator.

    Args:
        corpus: Documents to index; forwarded to ``create_bm25``.
        model_name: File name of the pickle, without the ``.pkl`` extension.
        type: Tokenization mode; forwarded to ``create_bm25``.
        save_path: Prefix (normally a directory ending in ``/``) of the file.
    """
    bm25 = create_bm25(corpus, type)
    target = save_path + model_name + '.pkl'
    # Create the destination directory on first use so a fresh checkout does
    # not fail with FileNotFoundError after the model has already been built.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(bm25, f)
        
def load_bm25(model_name, load_path="models/"):
    """Read a pickled BM25 model back from disk.

    The file opened is ``load_path + model_name + '.pkl'`` (plain string
    concatenation, so ``load_path`` should end with a path separator).

    Unpickling can execute arbitrary code, so only load files that were
    produced locally (e.g. by ``create_save_bm25``).

    Args:
        model_name: File name of the pickle, without the ``.pkl`` extension.
        load_path: Prefix (normally a directory ending in ``/``) of the file.

    Returns:
        The object stored in the pickle file.
    """
    pickle_file = load_path + model_name + '.pkl'
    with open(pickle_file, 'rb') as handle:
        return pickle.load(handle)


# Load BM25 Model
### Entity name match

### Description match

### Load Dataset

In [3]:
import numpy as np
import jieba
import json
zero_shot_data = []
retrieval_number = 10

# Column 2 of wp_title_desc as a plain list (presumably the entity
# descriptions - TODO confirm against the KB file).
corpus = (wp_title_desc.iloc[:,2]).to_list()
# get query: the file is read as JSON Lines, one JSON object per line.
with open("dataset/zero-shot.jsonl", 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        # A blank line (for instance an extra newline at the end of the file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not stripped:
            continue
        json_line = json.loads(stripped)
        zero_shot_data.append(json_line)


### Main Table Search

In [4]:
# Load Model
# Table-name index: built from column 1 of wp_title_desc and pickled the first
# time this cell runs, then always read back from the pickle.
table_name_pickle = "models/bm25_table_name.pkl"
if not os.path.exists(table_name_pickle):
    corpus = wp_title_desc.iloc[:, 1].to_list()
    print("Begin to create bm25 model")
    create_save_bm25(corpus, model_name="bm25_table_name", type="word")
TA_bm25 = load_bm25(model_name="bm25_table_name")

# Alias index: same caching scheme, built from column 0 of at_base.
alias_table_pickle = "models/bm25_alias_table.pkl"
if not os.path.exists(alias_table_pickle):
    corpus = at_base.iloc[:, 0].to_list()
    print("Begin to create bm25 model")
    create_save_bm25(corpus, model_name="bm25_alias_table", type="word")
AT_bm25 = load_bm25(model_name="bm25_alias_table")



Begin to create bm25 model


In [25]:
def get_coarse_list(tokenized_mention, retrieval_number=50):
    """Return coarse candidate entity ids for a tokenized mention.

    Two module-level BM25 indexes are queried with the same tokens:

    * ``TA_bm25`` - its best-scoring positions are rows of ``wp_title_desc``;
      column 0 of those rows is collected.
    * ``AT_bm25`` - its best-scoring positions are rows of ``at_base``;
      column 1 of those rows is collected.

    Args:
        tokenized_mention: List of tokens of the mention to look up.
        retrieval_number: Maximum number of hits taken from each index.

    Returns:
        list: Ids from the table-name index followed by ids from the alias
        index, each group ordered by descending BM25 score. Values are native
        Python objects, so the list can be passed straight to ``json.dump``.

    NOTE(review): alias ids are returned as-is; they are not checked against
    ``wp_title_desc`` and the combined list may contain duplicates.
    """
    def top_rows(bm25):
        # Row positions of the best-scoring documents, highest score first.
        scores = bm25.get_scores(tokenized_mention)
        return np.argsort(scores)[::-1][:retrieval_number]

    # One positional selection per table instead of a Python loop of scalar
    # lookups; .tolist() also converts numpy scalars to plain Python values.
    TA_filtered_id = wp_title_desc.iloc[top_rows(TA_bm25), 0].tolist()
    alias_qid = at_base.iloc[top_rows(AT_bm25), 1].tolist()
    return TA_filtered_id + alias_qid

In [23]:
def get_fine_list(tokenized_mention, tokenized_descript, retrieval_number):
    """Re-rank the coarse candidates of a mention by description similarity.

    Candidate ids come from ``get_coarse_list(tokenized_mention)``, which
    returns a flat list of ids. Each id is looked up in column 0 of
    ``wp_title_desc``; the column-2 text of the matching row is indexed in a
    temporary BM25 model, which is then queried with ``tokenized_descript``.

    Args:
        tokenized_mention: Tokens of the mention; forwarded to
            ``get_coarse_list``.
        tokenized_descript: Tokens used as the BM25 query against the
            candidate descriptions (presumably the tokenized ``text`` field of
            the query record - verify against the caller).
        retrieval_number: Maximum number of candidates to return.

    Returns:
        tuple[list, list]: ``(descriptions, ids)``, aligned with each other and
        ordered by descending BM25 score. Both lists are empty when none of
        the coarse candidates has a row in ``wp_title_desc``.
    """
    # Drop repeated ids but keep the coarse ranking order.
    candidate_ids = list(dict.fromkeys(get_coarse_list(tokenized_mention)))

    # Single pass over the KB for all candidates; the first row wins when an
    # id occurs more than once.
    kb_rows = wp_title_desc[wp_title_desc.iloc[:, 0].isin(candidate_ids)]
    desc_by_id = {}
    for kb_id, kb_desc in zip(kb_rows.iloc[:, 0].tolist(), kb_rows.iloc[:, 2].tolist()):
        desc_by_id.setdefault(kb_id, kb_desc)

    q_id = [cid for cid in candidate_ids if cid in desc_by_id]
    if not q_id:
        # BM25 cannot be built over an empty corpus.
        return [], []
    # str() guards against missing descriptions (NaN) reaching the tokenizer.
    description = [str(desc_by_id[cid]) for cid in q_id]

    DPT_bm25 = create_bm25(description, "sentence")
    scores = DPT_bm25.get_scores(tokenized_descript)
    table_name_top_n = np.argsort(scores)[::-1][:retrieval_number]
    return [description[index] for index in table_name_top_n], [q_id[index] for index in table_name_top_n]


In [28]:
# Build the BM25 candidate list for every zero-shot query and save it as JSON.
# with open("BM25_Candidate.json", "r") as file:
#     json_data = json.load(file)
json_data = []
# Only consumed by the optional get_fine_list re-ranking shown in the loop;
# set once here because it never changes between rows.
retrieval_number = 100

for index, row in enumerate(zero_shot_data):
    # Named sample_id rather than id, which would shadow the builtin in every
    # cell executed afterwards.
    sample_id = row["id"]
    mention = row["mention"]
    tokenized_mention = list(jieba.cut(mention))

    print(index, ":    ", mention)

    # Optional re-ranking against the query text. The text is tokenized only
    # when this path is enabled, since the coarse-only path never reads it:
    # tokenized_descript = list(jieba.cut(row["text"]))
    # _, qid_list = get_fine_list(tokenized_mention, tokenized_descript, retrieval_number)
    qid_list = get_coarse_list(tokenized_mention)
    json_data.append({"id": sample_id, "candidate": qid_list})

with open("BM25_Candidate.json", "w") as file:
    json.dump(json_data, file, indent=4)

0 :     邓立光
1 :     左更增七
2 :     议会选举
3 :     王曰善
4 :     春逝/ marie
5 :     戴浩
6 :     伊特尔
7 :     苏健
8 :     机界战队全界者
9 :     裴家营站
10 :     费城2020-21赛季
11 :     江苏卫视跨年演唱会
12 :     2019大阪国际挑战赛
13 :     宁西街道
14 :     方弘
15 :     鱼骨沙洲
16 :     商朝晖
17 :     何德兰
18 :     She Got Me
19 :     跨过鸭绿江
20 :     钟欣
21 :     小欢喜
22 :     储泽祥
23 :     2019年世界羽联巡回赛印度公开赛
24 :     万圣节杀戮
25 :     南安郡
26 :     潘心城
27 :     精神错乱
28 :     戴树和
29 :     追杀夏娃
30 :     格林维尔市
31 :     河静教区
32 :     斯坦普·菲尔泰斯
33 :     何耀
34 :     PRODUCE X 101
35 :     國立東華大學原住民民族學院
36 :     中国科学院空天信息创新研究院
37 :     YOOK O'clock
38 :     黄旖旎
39 :     海藏
40 :     骄傲游行
41 :     俄罗斯修宪公投
42 :     Oh my god
43 :     天之骄子
44 :     美国犯罪的最后日子
45 :     杜汶泽喱骚
46 :     新入史官丘海昤
47 :     阿城北站
48 :     马雄成
49 :     哈哈农夫
50 :     俘获芳心频道
51 :     景雪变
52 :     SBL明星賽
53 :     光之美少女 Miracle Universe
54 :     刘语熙rachel
55 :     龟兹研究院
56 :     李察朱威尔事件
57 :     王雙玉
58 :     梅朵
59 :     蠡墅站
60 :     云湖桥站
61 :     Secret Boutique
62 :     唐人街探案3
63 :   