In [2]:
from abc import ABCMeta, abstractmethod
class CandidateGenerator(metaclass=ABCMeta):
    """Abstract interface for objects that propose candidate entities for mentions.

    Concrete subclasses must implement both ``train`` and
    ``generate_candidates``; the class cannot be instantiated otherwise.
    """

    def __init__(self, entities):
        """Keep ``entities`` on the instance, unchanged, as ``self.entities``."""
        self.entities = entities

    @abstractmethod
    def train(self, mentions, entities):
        """Fit the generator on ``mentions`` and ``entities``.

        The base implementation does nothing and returns ``None``.
        """

    @abstractmethod
    def generate_candidates(self, mentions, k):
        """Produce candidates for each item of ``mentions``.

        How ``k`` is used is left to the subclass. The base implementation
        does nothing and returns ``None``.
        """

class Tokenizer(metaclass=ABCMeta):
    """Abstract interface for tokenizers; subclasses must implement ``tokenize``."""

    @abstractmethod
    def tokenize(self, text):
        """Tokenize ``text``.

        The base implementation does nothing and returns ``None``.
        """

In [5]:
import MeCab

class MeCabTokenizer(Tokenizer):
    """Tokenizer that delegates segmentation to a tagger object."""

    def __init__(self, mecab):
        """Keep the tagger; its ``parse`` method is called by ``tokenize``."""
        self.mecab = mecab

    def tokenize(self, text):
        """Return the whitespace-separated pieces of ``self.mecab.parse(text)``."""
        parsed = self.mecab.parse(text)
        return parsed.split()
# Create the MeCab tagger with the '-Owakati' option and wrap it in MeCabTokenizer;
# `tokenizer` is reused by later cells.
mecab = MeCab.Tagger('-Owakati')
tokenizer = MeCabTokenizer(mecab)
# Sanity check on a sample string; as the cell's last expression, its value is displayed.
tokenizer.tokenize("こんにちは宇宙")

['こんにちは', '宇宙']

In [30]:
from rank_bm25 import BM25Okapi

class BM25Generator(CandidateGenerator):
    """Candidate generator that ranks entities with a BM25Okapi index over their tokenized mentions."""

    def __init__(self, entities, tokenizer):
        """
        Tokenize each entity's mention and build the BM25 index from the tokens.

        arg:
            entities: List[[entity, mention]]
            tokenizer: object whose ``tokenize(text)`` is applied to each mention
        """
        super().__init__(entities)
        self.tokenizer = tokenizer

        # Swap the raw pairs for [entity, tokenized mention] pairs.
        tokenized_pairs = []
        for pair in self.entities:
            tokenized_pairs.append([pair[0], self.tokenizer.tokenize(pair[1])])
        self.entities = tokenized_pairs
        print(self.entities[:20])

        # The index corpus is the token list of every pair, in the same order.
        corpus = [tokens for _, tokens in self.entities]
        self.bm25 = BM25Okapi(corpus)

    def train(self, mentions, entities):
        """Do nothing; the index is already built in ``__init__``."""
        return None

    def generate_candidates(self, mentions, k):
        """
        For each mention, tokenize it, ask the BM25 index for ``n=k`` entries of
        ``self.entities``, and keep only the first element (the entity) of each.

        Returns one list of entities per mention, in input order.
        """
        # Phase 1: tokenize every mention before any scoring happens.
        queries = []
        for mention in mentions:
            queries.append(self.tokenizer.tokenize(mention))

        # Phase 2: retrieve the ranked [entity, tokens] pairs for every query.
        ranked_per_query = []
        for query in queries:
            ranked_per_query.append(self.bm25.get_top_n(query, self.entities, n=k))

        # Phase 3: project each ranked pair down to its entity.
        return [[pair[0] for pair in ranked] for ranked in ranked_per_query]

In [33]:
import csv
# Load the alias table into `entities`, a list of rows as returned by csv.reader.
ALIAS_TABLE_PATH = "/data1/ujiie/wiki_resource/alias_table.tsv"

entities = []
# newline="" is what the csv module requires of the file object. The encoding is
# pinned so decoding does not depend on the locale (assumes UTF-8 -- TODO confirm).
with open(ALIAS_TABLE_PATH, "r", encoding="utf-8", newline="") as f:
    # NOTE(review): the file is named .tsv but is parsed with csv's default comma
    # delimiter; the recorded run produced two-field rows this way -- confirm the format.
    reader = csv.reader(f)
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; line[0] would raise IndexError.
            continue
        if line[0] == "id":
            # Header row.
            continue
        if len(line) != 2:
            # Report every row that is not exactly [id, mention].
            print(line)
            if len(line) < 2:
                # No mention field: BM25Generator indexes e[1] and would crash on it.
                continue
        entities.append(line)

In [34]:
# Build the BM25 generator over the loaded rows; the constructor tokenizes every
# mention and prints the first 20 tokenized pairs.
generator = BM25Generator(entities, tokenizer)

[['17432', ['記号']], ['6932', ['ラテン語']], ['90949', ['合', '字']], ['1848349', ['Trebuchet', 'MS']], ['16771', ['Z']], ['13663', ['ラテン', '文字']], ['18665', ['アルファベット']], ['2860395', ['ゲール', '文字']], ['67756', ['欧米']], ['371198', ['無声', '歯茎', '側面', '摩擦音']], ['558367', ['発音', '記号']], ['69', ['プログラミング', '言語']], ['1022', ['C']], ['34108', ['演算', '子']], ['3558', ['PHP']], ['613697', ['参照', '渡し']], ['1487', ['BASIC']], ['225994', ['文字', '列']], ['2454', ['マイクロソフト']], ['12099', ['十', '六', '進', '表記']]]


In [36]:
# Query three sample mentions with k=10; the result is one list of entity ids per mention.
generator.generate_candidates(["ラテン", "オカピ", "Okapi"], 10)

[['122253',
  '87504',
  '155355',
  '13663',
  '549564',
  '206006',
  '9299',
  '92256',
  '6932',
  '1011069'],
 ['61543',
  '982166',
  '1955411',
  '647688',
  '446530',
  '3792760',
  '432680',
  '3813554',
  '470790',
  '1877939'],
 ['1860426',
  '1955411',
  '2114104',
  '647688',
  '2354056',
  '446530',
  '3792760',
  '432680',
  '3813554',
  '470790']]