## Tokenizers

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase

In [5]:
sample1 = "We are learning AI"
sample2 = "AI is a CS topic"
corpus = [sample1, sample2]

# Initialize the tokenizer and define a trainer
tokenizer = Tokenizer(WordLevel())
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_id=1, pad_token="<pad>", length=5)
tokenizer.enable_truncation(max_length=5)

# Train the tokenizer on your corpus
trainer = WordLevelTrainer(vocab_size=8, special_tokens=["<unk>", "<pad>"])
tokenizer.train_from_iterator(corpus, trainer)
vocab = tokenizer.get_vocab()
print(vocab)

{'are': 4, '<unk>': 0, '<pad>': 1, 'is': 6, 'ai': 2, 'learning': 7, 'a': 3, 'cs': 5}


In [8]:
output = tokenizer.encode(sample1)
print(output.tokens)
print(output.ids)

['<unk>', 'are', 'learning', 'ai', '<pad>']
[0, 4, 7, 2, 1]


In [9]:
output = tokenizer.encode(sample2)
print(output.tokens)
print(output.ids)

['ai', 'is', 'a', 'cs', '<unk>']
[2, 6, 3, 5, 0]


## Embedding

In [13]:
import torch.nn as nn
import torch

vocab_size = 8
embed_dim = 4
embedding = nn.Embedding(vocab_size, embed_dim)

print(embedding.weight)

Parameter containing:
tensor([[-0.0761,  0.1746, -0.1855, -0.7363],
        [-0.7457,  0.6099,  0.0480, -0.8983],
        [-0.1202, -1.4226,  0.7527, -0.1794],
        [-1.5723, -0.7351,  1.7490, -0.5941],
        [-1.3017, -0.4285, -0.4323,  0.8948],
        [-2.1426,  0.1073,  1.9736,  0.0829],
        [ 0.5543,  2.0615,  1.3596,  0.6395],
        [-0.1014,  0.3957,  1.6053, -1.4782]], requires_grad=True)


In [None]:
input1 = torch.tensor([0, 3, 4, 5, 6], dtype=torch.long)
output1 = embedding(input1)
print(output1)

tensor([[-0.0761,  0.1746, -0.1855, -0.7363],
        [-1.5723, -0.7351,  1.7490, -0.5941],
        [-1.3017, -0.4285, -0.4323,  0.8948],
        [-2.1426,  0.1073,  1.9736,  0.0829],
        [ 0.5543,  2.0615,  1.3596,  0.6395]], grad_fn=<EmbeddingBackward0>)


## Step-by-Step Example

In [None]:
corpus = ["gậy ông đập lưng ông", "có làm mới có ăn"]
data_size = len(corpus)

# 0: negative - 1: positive
labels = [0, 1]

# Define the max vocabulary size and sequence length
vocab_size = 8
sequence_length = 5

In [None]:
# Initialize the tokenizer and define a trainer
tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_id=1, pad_token="<pad>", length=sequence_length)
tokenizer.enable_truncation(max_length=sequence_length)

# Train the tokenizer on your corpus
trainer = WordLevelTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
tokenizer.train_from_iterator(corpus, trainer)


# Vectorize the samples
corpus_ids = []
for sentence in corpus:
    output = tokenizer.encode(sentence)
    output = torch.tensor(output.ids, dtype=torch.long)
    corpus_ids.append(output)

for v in corpus_ids:
    print(v)

tensor([4, 3, 0, 6, 3])
tensor([2, 5, 7, 2, 0])


In [None]:
embedding = nn.Embedding(vocab_size, 2)  # (8, 2)
flatten = nn.Flatten()
classifier = nn.Linear(5 * 2, 2)  # (2, )
model = nn.Sequential(embedding, flatten, classifier)

In [None]:
from torchinfo import summary

input_data = torch.randint(0, vocab_size, (2, 5))

print(summary(model, input_data=input_data))

print(summary(model))

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [2, 2]                    --
├─Embedding: 1-1                         [2, 5, 2]                 16
├─Flatten: 1-2                           [2, 10]                   --
├─Linear: 1-3                            [2, 2]                    22
Total params: 38
Trainable params: 38
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
Layer (type:depth-idx)                   Param #
Sequential                               --
├─Embedding: 1-1                         16
├─Flatten: 1-2                           --
├─Linear: 1-3                            22
Total params: 38
Trainable params: 38
Non-trainable params: 0


In [49]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),
                             lr=0.1)

inputs = torch.tensor([[4, 3, 0, 6, 3],
                       [2, 5, 7, 2, 0]], dtype=torch.long)
labels = torch.tensor([0, 1], dtype=torch.long)

for _ in range(50):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    print(loss.item())

    loss.backward()
    optimizer.step()

0.9377927780151367
0.6289594769477844
0.4476128816604614
0.33828476071357727
0.2679654061794281
0.21985246241092682
0.18524283170700073
0.15933893620967865
0.13932527601718903
0.12345785647630692
0.11060637980699539
0.10000963509082794
0.09113849699497223
0.08361443877220154
0.07716044038534164
0.07156932353973389
0.0666833221912384
0.06238032132387161
0.05856452137231827
0.05515972524881363
0.05210445076227188
0.04934896156191826
0.04685213416814804
0.044580183923244476
0.04250473529100418
0.04060200974345207
0.03885173052549362
0.037236787378787994
0.035742491483688354
0.034356020390987396
0.03306646645069122
0.0318642258644104
0.03074093908071518
0.029689297080039978
0.028702694922685623
0.027775581926107407
0.026902731508016586
0.026079759001731873
0.025302482768893242
0.02456735074520111
0.023871077224612236
0.02321077696979046
0.022583723068237305
0.021987583488225937
0.021420206874608994
0.020879531279206276
0.020363852381706238
0.019871456548571587
0.01940079778432846
0.0189505

In [50]:
outputs = model(inputs)
print(outputs)
print(torch.softmax(outputs, axis=-1))

tensor([[ 1.6610, -2.2893],
        [-2.1030,  1.9068]], grad_fn=<AddmmBackward0>)
tensor([[0.9811, 0.0189],
        [0.0178, 0.9822]], grad_fn=<SoftmaxBackward0>)
