## English

In [52]:
# English corpus splits; the individual names are kept alongside the list.
files = ['data/train.txt', 'data/valid.txt', 'data/test.txt']
train_file, valid_file, test_file = files
files

['data/train.txt', 'data/valid.txt', 'data/test.txt']

In [59]:
def get_data(file_name):
    """Read a space-separated tagged corpus file into three parallel lists.

    Blank lines and lines starting with ``-DOCSTART-`` are skipped. Every
    other line is split on single spaces: the first field is taken as the
    word (lower-cased), the second as the POS tag and the last as the NER tag.

    Args:
        file_name: Path of the corpus file to read.

    Returns:
        A ``(words, pos_tags, ner_tags)`` tuple of equally long lists, in
        file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    words = []
    pos_tags = []
    ner_tags = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (matches get_chinese_char).
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('-DOCSTART-'):
                continue
            ls = line.split(' ')
            words.append(ls[0].lower())
            pos_tags.append(ls[1])
            ner_tags.append(ls[-1])
    return words, pos_tags, ner_tags

In [60]:
# Accumulate the distinct words, POS tags and NER tags over every split.
words_set, pos_tags_set, ner_tags_set = set(), set(), set()

for file_name in files:
    words, pos_tags, ner_tags = get_data(file_name)
    words_set |= set(words)
    pos_tags_set |= set(pos_tags)
    ner_tags_set |= set(ner_tags)
    # Running number of distinct words after each file.
    print(len(words_set))

21009
23865
26869


In [64]:
# Inspect the distinct POS tags collected from the corpus files.
pos_tags_set

{'"',
 '$',
 "''",
 '(',
 ')',
 ',',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'NN|SYM',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB'}

In [65]:
# Inspect the distinct NER tags collected from the corpus files.
ner_tags_set

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [66]:
# Width of the word vectors; it selects which GloVe file is used.
dim_word = 100

# Raw GloVe vectors.
filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
# Trimmed embeddings (created from the GloVe file with build_data.py).
filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(dim_word)

print(filename_glove, filename_trimmed, sep="\n")

data/glove.6B/glove.6B.100d.txt
data/glove.6B.100d.trimmed.npz


In [67]:
# Collect the token at the start of every line of the GloVe file.
glove_vocab = set()
# Decode explicitly as UTF-8 instead of the platform default
# (assumes the GloVe dump is UTF-8 encoded -- TODO confirm).
with open(filename_glove, encoding='utf-8') as f:
    # Only the first field is needed, so stop splitting after the first
    # space rather than splitting the whole vector on every line.
    glove_vocab.update(line.strip().split(' ', 1)[0] for line in f)


In [68]:
# Number of distinct tokens read from the GloVe file.
print(len(glove_vocab))

400000


In [69]:
# Special symbols added to the word vocabulary below
# (presumably placeholders for unknown words and numbers -- confirm with the consumer).
UNK, NUM = "$UNK$", "$NUM$"
# Same string as the 'O' label that appears in the NER tag set.
NONE = "O"

In [70]:
# glove_vocab
# words_set

In [71]:
# Keep only the corpus words that also occur in the GloVe vocabulary, then
# add the special symbols so they are written out with the rest.
vocab = words_set.intersection(glove_vocab)
vocab.update((UNK, NUM))
print(len(vocab))

22949


In [32]:
def write_vocab(file_name, vocab):
    """Write one token per line to ``file_name``, without a trailing newline.

    Tokens are written in the iteration order of ``vocab``. A summary line
    with the number of tokens is printed when the file has been written.

    Args:
        file_name: Path of the file to create or overwrite.
        vocab: Sized iterable of tokens (it must support ``len``).
    """
    # Encode explicitly as UTF-8: the platform default may be unable to
    # represent non-ASCII tokens such as the Chinese characters written
    # later in this notebook.
    with open(file_name, 'w', encoding='utf-8') as f:
        # Joining puts the separator only between tokens, so no special
        # case is needed for the last line.
        f.write("\n".join("{}".format(token) for token in vocab))
    print("- done. {} tokens".format(len(vocab)))

In [73]:
# Sets of strings can iterate in a different order in every interpreter
# session (string hash randomisation), so sort before writing to keep the
# line order of each vocabulary file stable across re-runs.
write_vocab('data/word_vocab.txt', sorted(vocab))
write_vocab('data/pos_tag_vocab.txt', sorted(pos_tags_set))
write_vocab('data/ner_tag_vocab.txt', sorted(ner_tags_set))

- done. 22949 tokens
- done. 45 tokens
- done. 9 tokens


## Chinese

In [35]:
# Chinese corpus splits; only train and test files are listed here.
files = ['data/ch_train.txt', 'data/ch_test.txt']
train_file, test_file = files
files

['data/ch_train.txt', 'data/ch_test.txt']

In [36]:
def get_chinese_char(file_name):
    """Read a whitespace-separated tagged file into two parallel lists.

    Every line that contains at least one field contributes its first field
    (lower-cased) as the character and its last field as the NER tag. Lines
    that are empty or contain only whitespace are skipped.

    Args:
        file_name: Path of the UTF-8 encoded corpus file to read.

    Returns:
        A ``(words, ner_tags)`` tuple of equally long lists, in file order.
    """
    words = []
    ner_tags = []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            ls = line.split()
            # A whitespace-only separator line yields no fields; indexing
            # it would raise IndexError, so skip it like an empty line.
            if not ls:
                continue
            words.append(ls[0].lower())
            ner_tags.append(ls[-1])
    return words, ner_tags

In [58]:
# Accumulate the distinct characters and NER tags over every listed file.
words_set, ner_tags_set = set(), set()

for file_name in files:
    words, ner_tags = get_chinese_char(file_name)
    words_set |= set(words)
    ner_tags_set |= set(ner_tags)
    # Running number of distinct characters after each file.
    print('# of characters', len(words_set))

# of characters 4743
# of characters 4808


In [59]:
# Sizes of the character set and of the NER tag set, one per line.
print(len(words_set), len(ner_tags_set), sep="\n")

4808
7


In [60]:
# Inspect the distinct NER tags collected from the Chinese files.
ner_tags_set

{'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}

In [61]:
# Special symbols added to the character vocabulary
# (presumably placeholders for unknown, numeric and English tokens -- confirm with the consumer).
UNK, NUM, ENG = "$UNK$", "$NUM$", "$ENG$"
# PAD = "$PAD$"
words_set.update((UNK, NUM, ENG))

In [62]:
# Sets of strings can iterate in a different order in every interpreter
# session (string hash randomisation), so sort before writing to keep the
# line order of each vocabulary file stable across re-runs.
write_vocab('data/ch_word_vocab.txt', sorted(words_set))
write_vocab('data/ch_ner_tag_vocab.txt', sorted(ner_tags_set))

- done. 4811 tokens
- done. 7 tokens
