# MetaCAT - Tokenizer & Embeddings

In [1]:
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from gensim.models import Word2Vec
from pathlib import Path
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE

In [2]:
# Input: plain-text corpus that the cells below use to train the tokenizer and Word2Vec
data_dir = Path.cwd().parents[0] / 'data'
text_file = data_dir / 'data.txt'

# Output: directory that receives the tokenizer files and the embeddings matrix
model_dir = Path.cwd().parents[0] / 'models' / 'bilstm'
model_dir.mkdir(parents=True, exist_ok=True)
embeddings_file = model_dir / 'embeddings.npy'

# Name should contain 'bbpe' for ByteLevelBPETokenizer or 'bert' for BertTokenizerFast
# This name is saved in the model_config dict and subsequently in vars.dat on disk.
tokenizer_name = 'bbpe_dutch-wikipedia'

## Create tokenizer

In [3]:
# Build a byte-level BPE tokenizer and fit it on the text corpus
# (saving to disk happens in the next cell)
corpus_path = str(text_file)
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(corpus_path)

In [4]:
# Persist the trained tokenizer (vocab + merges files) into the model directory,
# using tokenizer_name as the file-name prefix
tokenizer.save_model(str(model_dir), tokenizer_name)

['D:\\Repositories\\negation-detection\\output\\bilstm\\bbpe_dutch-wikipedia-vocab.json',
 'D:\\Repositories\\negation-detection\\output\\bilstm\\bbpe_dutch-wikipedia-merges.txt']

## Create embeddings matrix

In [5]:
# Encode the corpus line by line into BPE token strings, then fit Word2Vec on them
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(row).tokens for row in corpus]
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — confirm the installed version
w2v = Word2Vec(text_data, size=300, min_count=1)

In [6]:
# Sanity-check the trained Word2Vec model by listing the nearest neighbours of a known token
# BPE tokens that begin a word carry a leading 'Ġ' (it stands for the preceding space)
query_token = 'Ġhoesten'
w2v.wv.most_similar(query_token)

[('Ġkortademigheid', 0.9220506548881531),
 ('Ġniezen', 0.9031012654304504),
 ('Ġbraken', 0.887566089630127),
 ('Ġjeuk', 0.876832127571106),
 ('Ġconstipatie', 0.8761879205703735),
 ('Ġdiarree', 0.8760838508605957),
 ('Ġspierpijn', 0.8698310852050781),
 ('Ġovergeven', 0.8678290843963623),
 ('Ġbenauwdheid', 0.8653152585029602),
 ('Ġmisselijkheid', 0.8651226758956909)]

In [7]:
# Create embeddings matrix: one row per tokenizer id, in id order
# Seeded generator so the random fallback vectors are reproducible across runs
rng = np.random.default_rng(42)
# Read the dimensionality from the trained model instead of repeating a literal
embedding_dim = w2v.wv.vector_size
embeddings = []
for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # Assign a random vector (uniform in [0, 1)) if the token did not receive an embedding
        embeddings.append(rng.random(embedding_dim))

In [8]:
# Save the embeddings
# The `with` block closes the file handle once the array has been written
with open(embeddings_file, 'wb') as embeddings_out:
    np.save(embeddings_out, np.array(embeddings))