# MetaCAT - Tokenizer & Embeddings

In [2]:
import numpy as np
from medcat.tokenizers.meta_cat_tokenizers import ByteLevelBPETokenizer
from gensim.models import Word2Vec
from pathlib import Path

In [3]:
# All paths are resolved relative to the parent of the current working directory.
project_root = Path.cwd().parents[0]

# Input: the plain-text corpus that the tokenizer and Word2Vec are trained on.
data_dir = project_root.joinpath('data')
text_file = data_dir.joinpath('data.txt')

# Output: directory for the model artefacts (created if it does not exist yet)
# and the path reserved for the embeddings matrix.
model_dir = project_root.joinpath('models', 'bilstm')
model_dir.mkdir(parents=True, exist_ok=True)
embeddings_file = model_dir.joinpath('embeddings.npy')

# The name should contain 'bbpe' for ByteLevelBPETokenizer or 'bert' for BertTokenizerFast.
# It is saved in the model_config dict and subsequently in vars.dat on disk.
tokenizer_name = 'bbpe_dutch-wikipedia'

## Create tokenizer

In [4]:
# Create the byte-level BPE tokenizer and train it on the text corpus.
# (Saving to disk is done as a separate step afterwards.)
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
# The path is handed over as a plain string; str() is the idiomatic conversion
# from a Path, rather than calling the __str__ dunder directly.
tokenizer.train(str(text_file))

In [5]:
# Save the tokenizer files (vocab.json and merges.txt) into model_dir,
# with tokenizer_name as the filename prefix. The call returns the list of
# written file paths, which is displayed as the cell output.
tokenizer.save_model(str(model_dir), tokenizer_name)

['D:\\Repositories\\negation-detection\\models\\bilstm\\bbpe_dutch-wikipedia-vocab.json',
 'D:\\Repositories\\negation-detection\\models\\bilstm\\bbpe_dutch-wikipedia-merges.txt']

## Create embeddings matrix

In [6]:
# Encode the corpus line by line with the trained tokenizer; the token strings
# of each line form one training sentence for Word2Vec.
with open(text_file, encoding='utf-8') as corpus:
    text_data = [tokenizer.encode(sentence).tokens for sentence in corpus]

# Train 300-dimensional embeddings; min_count=1 keeps every token, however rare.
# NOTE(review): `size` is the gensim 3.x keyword (renamed to `vector_size` in
# gensim 4) - confirm against the installed gensim version before upgrading.
w2v = Word2Vec(sentences=text_data, size=300, min_count=1)

In [7]:
# Sanity-check the trained Word2Vec model by listing the nearest neighbours
# of a known token. 'Ġ' marks the start of a word (a leading space).
probe_token = 'Ġhoesten'
w2v.wv.most_similar(probe_token)

[('Ġkortademigheid', 0.9120935201644897),
 ('Ġzweten', 0.9087247848510742),
 ('Ġdiarree', 0.8979878425598145),
 ('Ġjeuk', 0.8853147625923157),
 ('Ġniezen', 0.8850206136703491),
 ('Ġbraken', 0.8805117607116699),
 ('Ġovergeven', 0.8755332827568054),
 ('Ġkrampen', 0.8732550144195557),
 ('Ġbenauwdheid', 0.8729687929153442),
 ('Ġconstipatie', 0.8685443997383118)]