# Create tokenizer & embedding matrix
This notebook generates a tokenizer and an embedding matrix based on medical articles from the Dutch Wikipedia. Everything in this notebook is based on the MedCAT tutorials; see https://github.com/CogStack/MedCAT/tree/master/tutorial.

In [1]:
import numpy as np
from medcat.tokenizers.meta_cat_tokenizers import ByteLevelBPETokenizer, TokenizerWrapperBPE
from gensim.models import Word2Vec
from pathlib import Path

In [2]:
# All paths are built from the parent of the current working directory.

# Input: text corpus used for both tokenizer and Word2Vec training
data_dir = Path.cwd().parents[0].joinpath('data')
text_file = data_dir.joinpath('nlwiki_20200901_geneeskunde_depth4.txt')

# Output: directory that receives the tokenizer files and the embedding matrix
model_dir = Path.cwd().parents[0].joinpath('models', 'bilstm')
# Create the output directory (and any missing parents); no error if it already exists
model_dir.mkdir(parents=True, exist_ok=True)
embeddings_file = model_dir.joinpath('embeddings.npy')

In [4]:
# Display the resolved data directory to verify where the input is read from
data_dir

WindowsPath('//ds/data/LAB/laupodteam/AIOS/Bram/notebooks/code_dev/negation-detection/data')

## Create tokenizer

In [5]:
# Create the byte-level BPE tokenizer and train it on the text corpus.
# Nothing is written to disk in this cell; the path is handed over as a plain string.
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(str(text_file))

In [6]:
# Wrap the trained tokenizer in the BPE wrapper used by MetaCAT
wrapped_tokenizer = TokenizerWrapperBPE(hf_tokenizers=tokenizer)
# Write the wrapped tokenizer into the model directory (passed as a plain string)
wrapped_tokenizer.save(str(model_dir))

## Create embedding matrix

In [7]:
# Encode the corpus line by line: every line of the text file becomes one
# list of BPE token strings, i.e. one training sentence for Word2Vec
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(sentence).tokens for sentence in corpus]

# Train Word2Vec on the token lists: 300-dimensional vectors, skip-gram (sg=1),
# and min_count=1 so that tokens occurring only once still get a vector
w2v = Word2Vec(sentences=text_data, vector_size=300, min_count=1, workers=8, sg=1)

In [8]:
# Sanity check of the trained Word2Vec model: list the tokens whose vectors
# are most similar to the token for 'hoesten' (Dutch for 'to cough')
# Ġ denotes start of word (a space)
w2v.wv.most_similar('Ġhoesten')

[('Ġniezen', 0.8427163362503052),
 ('Ġloopneus', 0.8114466667175293),
 ('Ġkortademigheid', 0.8107344508171082),
 ('Ġbenauwdheid', 0.7975018620491028),
 ('rampen', 0.7906360626220703),
 ('Ġhartkloppingen', 0.7903270125389099),
 ('Ġconstipatie', 0.789264440536499),
 ('Ġspierpijn', 0.7881028652191162),
 ('Ġrillingen', 0.7871307730674744),
 ('Ġkeelpijn', 0.7849506139755249)]

In [9]:
# Create embedding matrix: one row per tokenizer id, in id order, so that
# row i holds the vector for the token with id i
# Read the vector width from the trained model instead of repeating the literal 300
vector_size = w2v.wv.vector_size
embeddings = []
for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # The token is missing from the Word2Vec vocabulary, so it has no
        # trained vector; fall back to a random vector (uniform in [0, 1))
        embeddings.append(np.random.rand(vector_size))
# Save the embeddings; the context manager guarantees the file handle is
# flushed and closed once the array has been written
with open(embeddings_file, 'wb') as embeddings_out:
    np.save(embeddings_out, np.array(embeddings))