# MetaCAT - Tokenizer & Embeddings

In [1]:
import numpy as np
from medcat.tokenizers.meta_cat_tokenizers import ByteLevelBPETokenizer, TokenizerWrapperBPE
from gensim.models import Word2Vec
from pathlib import Path

In [2]:
# All locations are resolved relative to the parent of the current working directory
project_root = Path.cwd().parents[0]

# Input: plain-text corpus used to train both the tokenizer and the embeddings
data_dir = project_root.joinpath('data')
text_file = data_dir.joinpath('nlwiki_20200901_geneeskunde_depth4.txt')

# Output: directory receiving the tokenizer files and the embeddings matrix
# (created on the fly, including missing parents, if it does not exist yet)
model_dir = project_root.joinpath('models', 'bilstm')
model_dir.mkdir(parents=True, exist_ok=True)
embeddings_file = model_dir.joinpath('embeddings.npy')

## Create tokenizer

In [3]:
# Build a byte-level BPE tokenizer and train it on the text corpus.
# train() is handed the corpus location as a plain string rather than a Path object.
corpus_path = str(text_file)
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(corpus_path)

In [4]:
# Wrap the trained tokenizer in the MetaCAT tokenizer wrapper
wrapped_tokenizer = TokenizerWrapperBPE(hf_tokenizers=tokenizer)

# Persist the wrapped tokenizer into the model directory (passed as a plain string)
tokenizer_dir = str(model_dir)
wrapped_tokenizer.save(tokenizer_dir)

## Create embeddings matrix

In [5]:
# Encode the corpus line by line with the trained tokenizer; the token list of
# each line becomes one training "sentence" for Word2Vec.
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(raw_line).tokens for raw_line in corpus]

# Train 300-dimensional vectors; min_count=1 keeps every token that occurs at least once.
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed to `vector_size` in 4.0) —
# confirm against the installed gensim version.
w2v = Word2Vec(text_data, size=300, min_count=1)

In [6]:
# Sanity check of the trained Word2Vec model: list the nearest neighbours of a known token.
# The leading 'Ġ' marks the start of a word (it stands for the preceding space).
query_token = 'Ġhoesten'
w2v.wv.most_similar(query_token)

[('Ġkortademigheid', 0.9043293595314026),
 ('Ġzweten', 0.8842253684997559),
 ('Ġniezen', 0.8775047063827515),
 ('Ġjeuk', 0.8688153624534607),
 ('Ġbraken', 0.8676013946533203),
 ('Ġdiarree', 0.8642471432685852),
 ('Ġspierpijn', 0.8637697100639343),
 ('Ġkoude', 0.8457282781600952),
 ('Ġmisselijkheid', 0.8452661037445068),
 ('Ġschokken', 0.8435850143432617)]

In [7]:
# Create embeddings matrix: row i holds the vector of the token with tokenizer id i.
# The dimensionality is read from the trained model instead of repeating a literal,
# so it cannot drift from the value used when training Word2Vec.
embedding_dim = w2v.wv.vector_size
# Seeded generator so the fallback vectors (and thus the saved matrix) are
# reproducible across Restart & Run All.
fallback_rng = np.random.RandomState(42)

embeddings = []
for token_id in range(tokenizer.get_vocab_size()):
    token = tokenizer.id_to_token(token_id)
    if token in w2v.wv:
        embeddings.append(w2v.wv[token])
    else:
        # The token is in the tokenizer vocabulary but has no Word2Vec vector
        # (presumably it never occurred in the encoded corpus): assign a random
        # vector drawn uniformly from [0, 1).
        embeddings.append(fallback_rng.rand(embedding_dim))

embeddings_matrix = np.array(embeddings)
# Exactly one row per tokenizer id, each of the model's dimensionality
assert embeddings_matrix.shape == (tokenizer.get_vocab_size(), embedding_dim)

# Save the embeddings; the context manager guarantees the file handle is
# flushed and closed even if np.save raises.
with open(embeddings_file, 'wb') as embeddings_out:
    np.save(embeddings_out, embeddings_matrix)