In [16]:
# Integer id for every NER label: "O" plus B-/I- variants for PER, LOC and ORG.
tag2index = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-ORG": 5,
    "I-ORG": 6,
}

In [22]:
# Special placeholder tokens used by the vocabulary helpers in this notebook.
PAD = "$PAD$"  # padding token; load_char2index reserves index 0 for it
UNK = "$UNK$"  # fallback for tokens that are missing from a vocabulary
NUM = "$NUM$"  # stand-in for tokens made of digits
ENG = "$ENG$"  # stand-in for English-letter tokens

In [23]:
def load_char2index(file_name):
    """Load a vocabulary file (one token per line) into a token -> index dict.

    Index 0 is reserved for ``PAD``; the token on the i-th line (1-based)
    receives index ``i``. Surrounding whitespace is stripped from each line.
    If a token occurs on several lines, the last occurrence wins, and a line
    equal to ``PAD`` overwrites the reserved index 0 with its line number.

    Args:
        file_name: path of the UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each token string to its integer index.
    """
    token2idx = {PAD: 0}
    # Read explicitly as UTF-8 (as Dataset does) so that a non-ASCII
    # vocabulary does not depend on the platform's default encoding.
    with open(file_name, encoding='utf-8') as f:
        for idx, line in enumerate(f, start=1):
            token2idx[line.strip()] = idx
    return token2idx

In [24]:
# Build the token -> index vocabulary from the vocabulary file on disk
# (path is relative to the notebook's working directory).
word2index = load_char2index('data/ch_word_vocab.txt')

In [26]:
# Sanity check: show the indices of the special placeholder tokens.
# Each lookup raises KeyError if the placeholder is missing from word2index.
print(word2index[UNK])
print(word2index[NUM])
print(word2index[ENG])
print(word2index[PAD])

957
4324
1504
0


In [27]:
# Report the number of entries in the word vocabulary and in the tag set.
print(len(word2index))
print(len(tag2index))

4812
7


In [28]:
def get_processing_token(token2index, lowercase=False):
    """Build a function that converts a raw token into its integer index.

    The returned function normalises the token and then looks it up:

    1. If ``lowercase`` is true, the token is lower-cased.
    2. A token made only of digits is replaced by ``NUM`` and a token made
       only of ASCII letters is replaced by ``ENG``, but only when that
       placeholder is itself a key of ``token2index``. This lets the same
       factory serve vocabularies without placeholders (e.g. a tag map whose
       labels such as "O" are plain letters) without rewriting their keys.
    3. The resulting token is looked up in ``token2index``; unknown tokens
       fall back to the index of ``UNK``.

    Args:
        token2index: dict mapping token strings to integer indices.
        lowercase: whether to lower-case tokens before the lookup.

    Returns:
        A function ``f(token) -> int``.

    Raises:
        KeyError: raised by the returned function when the token is unknown
            and ``token2index`` has no ``UNK`` entry.
    """

    def _is_ascii_letters(text):
        # Check every character; comparing the whole string against 'A'..'Z'
        # is lexicographic and misclassifies multi-character tokens.
        return bool(text) and all(
            ('\u0041' <= ch <= '\u005a') or ('\u0061' <= ch <= '\u007a')
            for ch in text
        )

    def f(token):
        if lowercase:
            token = token.lower()

        if token.isdigit():
            placeholder = NUM
        elif _is_ascii_letters(token):
            placeholder = ENG
        else:
            placeholder = None

        # Only substitute a placeholder that the vocabulary actually knows.
        if placeholder is not None and placeholder in token2index:
            token = placeholder

        if token in token2index:
            return token2index[token]
        return token2index[UNK]

    return f

In [29]:
# Token -> index encoders for words and for NER tags (no lower-casing).
# NOTE(review): tag2index has no "$UNK$" entry, so a tag that is not in
# tag2index makes process_tag_f raise KeyError instead of falling back.
process_word_f = get_processing_token(word2index, False)
process_tag_f = get_processing_token(tag2index, False)

In [30]:
class Dataset(object):
    """Iterable over the sentences of a column-formatted, UTF-8 text file.

    Every non-blank line holds whitespace-separated columns: the first column
    is the token and the last column is its tag. Blank (or whitespace-only)
    lines separate sentences. Iterating yields one ``(words, ner_tags)`` tuple
    of two equally long lists per sentence. The file is re-opened on every
    iteration, so the object can be iterated more than once.

    Args:
        file_name: path of the data file.
        processing_word: optional callable applied to every token; ``None``
            keeps the raw string.
        processing_tag: optional callable applied to every tag; ``None``
            keeps the raw string.
        max_iter: optional maximum number of sentences to yield; ``None``
            means no limit.
    """

    def __init__(self, file_name, processing_word, processing_tag, max_iter=None):
        self.file_name = file_name
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.max_iter = max_iter

    def __iter__(self):
        """Yield ``(words, ner_tags)`` for each sentence, up to ``max_iter``."""
        words, ner_tags = [], []
        niter = 0
        with open(self.file_name, encoding='utf-8') as f:
            for line in f:
                columns = line.split()
                if not columns:
                    # Sentence boundary; consecutive blank lines are ignored.
                    if words:
                        niter += 1
                        if self.max_iter is not None and niter > self.max_iter:
                            return
                        yield (words, ner_tags)
                        # Start fresh lists so yielded sentences are not mutated.
                        words, ner_tags = [], []
                    continue

                word, ner_tag = columns[0], columns[-1]
                if self.processing_word is not None:
                    word = self.processing_word(word)
                if self.processing_tag is not None:
                    ner_tag = self.processing_tag(ner_tag)
                words.append(word)
                ner_tags.append(ner_tag)

        # Flush the last sentence when the file does not end with a blank line.
        if words:
            niter += 1
            if self.max_iter is None or niter <= self.max_iter:
                yield (words, ner_tags)

In [31]:
def minibatch(dataset, batch_size):
    """Group the ``(word, tag)`` pairs produced by ``dataset`` into batches.

    Args:
        dataset: iterable yielding two-element ``(word, tag)`` items.
        batch_size: number of items collected before a batch is emitted.

    Yields:
        ``(xbatch, ybatch)`` tuples of two parallel lists. Every batch except
        possibly the last one holds ``batch_size`` items; leftover items are
        emitted as a final, shorter batch.
    """
    batch_words, batch_tags = [], []
    for sentence, labels in dataset:
        # Emit the full batch before adding the next item to a new one.
        if len(batch_words) == batch_size:
            yield batch_words, batch_tags
            batch_words, batch_tags = [], []

        batch_words.append(sentence)
        batch_tags.append(labels)

    # Whatever is left over forms the final batch.
    if batch_words:
        yield batch_words, batch_tags