<a href="https://colab.research.google.com/github/tonystark11/transformer-from-scratch/blob/main/src/translation_transformer_from_scratch_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
# Install/upgrade spaCy with its `cuda-autodetect` extra, then fetch spaCy pipelines.
# NOTE(review): the recorded output of this cell shows the pip step ending in
# `metadata-generation-failed` — confirm this extra installs on the runtime in use.
!pip install -U 'spacy[cuda-autodetect]' -q
# English pipeline; the tokenizer cell loads it for the source language.
!python -m spacy download en_core_web_sm
# NOTE(review): the tokenizer cell loads `xx_ent_wiki_sm` for the target language, not
# this Spanish pipeline — confirm whether this download is still needed.
!python -m spacy download es_core_news_sm

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load 

In [2]:
pip install -U torchtext==0.15.2

Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.1 (from torchtext==0.15.2)
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.1 (from torchtext==0.15.2)
  Downloading torchdata-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1->torchtex

# Imports

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
from functools import partial

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
# Prefer the GPU whenever PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [5]:
# Single seed for the whole notebook. It is passed to the data split further down and
# is also handed to PyTorch here, so weight initialisation, dropout and shuffling draw
# from a seeded generator instead of an arbitrary one.
random_seed = 42
torch.manual_seed(random_seed);  # trailing ';' keeps the returned Generator out of the cell output

# MultiHeadAttention

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with their own linear layer, split
    into ``num_heads`` heads of width ``d_k = d_model // num_heads``, attended
    per head, merged back to ``d_model`` features and passed through an output
    projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model, self.num_heads = d_model, num_heads
        self.d_k = d_model // num_heads

        # Projections are created in q, k, v, o order so that seeded initialisation
        # produces the same weights as before.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V.

        Positions where ``mask == 0`` get a score of -1e9 before the softmax, so
        they receive (numerically) no attention weight.
        """
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        batch_size, seq_len, _ = x.size()
        per_head = x.reshape(batch_size, seq_len, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: (batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_len, _ = x.size()
        return x.permute(0, 2, 1, 3).reshape(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and return the merged, projected result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))


# Position-wise Feed-Forward Network

In [7]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward block applied over the last (feature) dimension.

    Expands ``d_model`` features to ``d_ff``, applies ReLU, and projects back to
    ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# Positional Encoding

In [8]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_seq_length, d_model)`` is precomputed once and stored
    as the buffer ``pe`` (so it is part of ``state_dict`` and follows the module
    when it is moved with ``.to(...)``). Even feature columns hold sines, odd
    feature columns hold cosines.

    Args:
        d_model: Number of embedding features; may be even or odd.
        max_seq_length: Largest sequence length the table covers.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Built on the default device rather than a notebook-level global: buffers are
        # moved together with the module, so the class no longer depends on outside state.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.pow(10_000, (-torch.arange(0, d_model, 2).float() / d_model))
        angles = position * div_term  # one column per sine feature

        pe = torch.zeros(max_seq_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # __init__ must not return a value; just register the table.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

# Encoder Layer

In [9]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each wrapped in
    dropout, a residual connection and layer normalisation (post-norm)."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run ``x`` through the attention and feed-forward sub-layers.

        ``mask`` is forwarded unchanged to the self-attention module.
        """
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.self_attn(x, x, x, mask)
        after_attn = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward with residual + norm.
        transformed = self.feed_forward(after_attn)
        return self.norm2(after_attn + self.dropout(transformed))


# Decoder Layer

In [10]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the encoder
    output, then feed-forward. Each sub-layer is followed by dropout, a residual
    connection and layer normalisation (post-norm)."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are both ``enc_output``.
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        hidden = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: attend from the target states to the encoder output.
        cross_attended = self.cross_attn(hidden, enc_output, enc_output, src_mask)
        hidden = self.norm2(hidden + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.feed_forward(hidden)
        return self.norm3(hidden + self.dropout(transformed))

# Transformer Model

In [11]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size of the
            output projection.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Longest sequence covered by the positional encoding.
        dropout: Dropout probability used on the embeddings and inside every layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt, pad_idx=0):
        """Build the boolean attention masks for a batch of token-id tensors.

        Args:
            src: Source ids, indexed as (batch, src_len).
            tgt: Target ids, indexed as (batch, tgt_len).
            pad_idx: Id that marks padding; defaults to 0 as before.

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            (batch, 1, 1, src_len) and is False at padded source positions, and
            ``tgt_mask`` has shape (batch, 1, tgt_len, tgt_len) and is True only
            where the query position is not padding and the key position is not
            later than the query position.
        """
        src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular "no peeking ahead" mask, created on the same device as the
        # input so the model does not depend on a notebook-level `device` variable.
        causal_mask = torch.tril(torch.ones(1, seq_length, seq_length, device=tgt.device)).bool()
        tgt_mask = tgt_mask & causal_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size) for ``tgt`` given ``src``."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# Load Data

In [12]:
!wget https://www.manythings.org/anki/hin-eng.zip

--2024-12-22 19:22:10--  https://www.manythings.org/anki/hin-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 139578 (136K) [application/zip]
Saving to: ‘hin-eng.zip’


2024-12-22 19:22:10 (1.51 MB/s) - ‘hin-eng.zip’ saved [139578/139578]



In [13]:
!unzip hin-eng.zip

Archive:  hin-eng.zip
  inflating: hin.txt                 
  inflating: _about.txt              


In [14]:
# The corpus contains Devanagari text, so decode it explicitly as UTF-8 instead of
# relying on whatever default encoding the platform happens to use.
with open('hin.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [15]:
len(lines)

3061

In [17]:
lines[3000]

"It isn't like you to not listen to other people's opinions.\tतुम तो किसी की बात नहीं सुनने वालों में से नहीं थे।\tCC-BY 2.0 (France) Attribution: tatoeba.org #3123929 (CK) & #11370682 (Sorcien)\n"

In [18]:
# Keep only the first two tab-separated columns of every line (the sentence pair).
# The sample printed above shows a third column holding attribution text, which is
# dropped here.
lines = ['\t'.join(line.split('\t')[:2]) for line in lines]

In [19]:
lines[3000]

"It isn't like you to not listen to other people's opinions.\tतुम तो किसी की बात नहीं सुनने वालों में से नहीं थे।"

In [20]:
# Create train, val, test split.
# First hold out 20% of the sentence pairs, then cut that hold-out in half to get the
# validation and test sets. Both calls shuffle with the same `random_seed`.
train_lines, val_test_lines = train_test_split(lines, test_size=0.2, random_state=random_seed, shuffle=True)
val_lines, test_lines = train_test_split(val_test_lines, test_size=0.5, random_state=random_seed, shuffle=True)

In [21]:
print(len(train_lines))
print(len(val_lines))
print(len(test_lines))

2448
306
307


In [22]:
train_lines[0]

'She is as beautiful as her mother.\tवह अपनी माँ जैसी सुंदर है।'

In [23]:
val_lines[0]

"We've got to talk to Tom.\tहमें टॉम से बात करनी होगी।"

In [24]:
test_lines[0]

"I didn't expect Tom to be so good-looking.\tमुझे उम्मीद नहीं थी कि टॉम इतना अच्छा दिखने वाला होगा।"

# Preprocess Data

In [26]:
SRC_LANGUAGE = "en"
TGT_LANGUAGE = "hi"

In [28]:
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [29]:
# One tokenizer callable per language code: the `en_core_web_sm` spaCy pipeline for the
# source side and the `xx_ent_wiki_sm` spaCy pipeline for the target side.
tokenizer = {
    SRC_LANGUAGE: get_tokenizer("spacy", language="en_core_web_sm"),
    TGT_LANGUAGE: get_tokenizer("spacy", language="xx_ent_wiki_sm"),
}

## Create Dataset

In [30]:
class SentencePairDataset(Dataset):
    """Dataset over tab-separated ``"source<TAB>target"`` strings.

    Each item is tokenised lazily on access and returned as the pair
    ``(source_tokens, target_tokens)``.
    """

    def __init__(self, lines, src_tokenizer, tgt_tokenizer):
        super().__init__()

        self.lines = lines
        self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Tokenise pair ``idx``; the line must contain exactly one tab."""
        source_text, target_text = self.lines[idx].split('\t')
        return self.src_tokenizer(source_text), self.tgt_tokenizer(target_text)

In [31]:
train_ds = SentencePairDataset(train_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
val_ds = SentencePairDataset(val_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
test_ds = SentencePairDataset(test_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])

In [32]:
# Length of longest src sequence
print(max(len(x[0]) for x in train_ds))
print(max(len(x[0]) for x in val_ds))
print(max(len(x[0]) for x in test_ds))

26
17
19


In [33]:
# Length of longest tgt sequence
print(max(len(x[1]) for x in train_ds))
print(max(len(x[1]) for x in val_ds))
print(max(len(x[1]) for x in test_ds))

28
17
17


In [34]:
next(iter(train_ds))

(['She', 'is', 'as', 'beautiful', 'as', 'her', 'mother', '.'],
 ['वह', 'अपनी', 'माँ', 'जैसी', 'सुंदर', 'है', '।'])

## Create Vocabulary

In [35]:
vocab = {}

In [36]:
src_vocab_size = 10_000
tgt_vocab_size = 10_000
max_seq_len = 100

PAD_IDX = 0
UNK_IDX = 1
BOS_IDX = 2
EOS_IDX = 3

special_symbols = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

In [37]:
def yield_tokens(dataset, lang_idx=0):
    """Yield, for every item of ``dataset`` in index order, its ``lang_idx``-th element.

    With the sentence-pair dataset, ``lang_idx=0`` gives the source token lists and
    ``lang_idx=1`` the target token lists.
    """
    for position in range(len(dataset)):
        yield dataset[position][lang_idx]

In [38]:
src_iterator = yield_tokens(train_ds, lang_idx=0)
tgt_iterator = yield_tokens(train_ds, lang_idx=1)

In [39]:
vocab[SRC_LANGUAGE] = build_vocab_from_iterator(
    src_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=src_vocab_size,
)

In [40]:
vocab[TGT_LANGUAGE] = build_vocab_from_iterator(
    tgt_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=tgt_vocab_size,
)

In [41]:
vocab[SRC_LANGUAGE].set_default_index(UNK_IDX)
vocab[TGT_LANGUAGE].set_default_index(UNK_IDX)

In [42]:
vocab[SRC_LANGUAGE]['hello']

1

In [43]:
vocab[TGT_LANGUAGE]['Hola']

1

In [44]:
def collate_fn(batch, vocab):
    """Turn a batch of token-list pairs into two padded id tensors.

    Args:
        batch: Sequence of ``(src_tokens, tgt_tokens)`` pairs.
        vocab: Mapping from language code to a vocabulary that converts a list of
            tokens into a list of ids when called.

    Returns:
        ``(src_vectors, tgt_vectors)``, two ``torch.long`` tensors of shape
        (batch_size, max_seq_len) on ``device``. Every row is
        ``<BOS> ids... <EOS>`` followed by ``PAD_IDX`` up to ``max_seq_len``.
    """
    def _encode(tokens, lang):
        # Leave room for <BOS>/<EOS>: an over-long sentence is truncated *before* the
        # end marker is added, so <EOS> is never cut off (previously it was).
        ids = [BOS_IDX] + vocab[lang](list(tokens)[:max_seq_len - 2]) + [EOS_IDX]
        return ids + [PAD_IDX] * (max_seq_len - len(ids))

    srcs, tgts = zip(*batch)
    src_vectors = torch.tensor([_encode(tokens, SRC_LANGUAGE) for tokens in srcs], dtype=torch.long, device=device)
    tgt_vectors = torch.tensor([_encode(tokens, TGT_LANGUAGE) for tokens in tgts], dtype=torch.long, device=device)

    return src_vectors, tgt_vectors

In [46]:
# Only the training loader shuffles. Validation and test batches keep a fixed order so
# the per-batch losses printed during evaluation are comparable from run to run.
train_dataloader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))
val_dataloader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=partial(collate_fn, vocab=vocab))
test_dataloader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=partial(collate_fn, vocab=vocab))

In [47]:
# Model and training hyperparameters.
src_vocab_size = 10_000
tgt_vocab_size = 10_000
d_model = 512
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1
num_epochs = 3

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

# Padding positions must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}\n------------------------------")
    transformer.train()
    # Batches are unpacked directly in the loop header; the previous loop variable was
    # called `data` and overwrote the `torch.utils.data as data` import.
    for src_data, tgt_data in train_dataloader:
        optimizer.zero_grad()
        # Teacher forcing: feed the target without its last position and score the
        # predictions against the target shifted left by one.
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.reshape(-1, tgt_vocab_size), tgt_data[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch+1}, Training Loss: {loss.item()}")

    transformer.eval()
    with torch.no_grad():
        for src_data, tgt_data in val_dataloader:
            output = transformer(src_data, tgt_data[:, :-1])
            loss = criterion(output.reshape(-1, tgt_vocab_size), tgt_data[:, 1:].reshape(-1))
            print(f"Epoch: {epoch+1}, Validation Loss: {loss.item()}")

    # One checkpoint per epoch, written next to the notebook.
    torch.save(transformer.state_dict(), f'./transformer_state_dict_epoch_{epoch+1}')

Epoch: 1
------------------------------
Epoch: 1, Training Loss: 9.469741821289062
Epoch: 1, Training Loss: 8.224419593811035


KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test set: dropout off, no gradient tracking.
transformer.eval()
with torch.no_grad():
    # Unpack each batch in the loop header so the `torch.utils.data as data` import is
    # not overwritten by a loop variable named `data`.
    for src_data, tgt_data in test_dataloader:
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.reshape(-1, tgt_vocab_size), tgt_data[:, 1:].reshape(-1))
        print(f"Test Loss: {loss.item()}")

Test Loss: 1.5609937906265259
Test Loss: 1.5811073780059814
Test Loss: 1.359971046447754
Test Loss: 1.352913737297058
Test Loss: 1.5743625164031982
Test Loss: 1.2384793758392334
Test Loss: 1.5436125993728638
Test Loss: 1.3538049459457397
Test Loss: 1.5148919820785522
Test Loss: 1.5467768907546997
Test Loss: 1.3038854598999023
Test Loss: 1.6451777219772339
Test Loss: 1.2898495197296143
Test Loss: 1.4407848119735718
Test Loss: 1.408454179763794
Test Loss: 1.4048285484313965
Test Loss: 1.359633207321167
Test Loss: 1.3183794021606445
Test Loss: 1.4878709316253662
Test Loss: 1.5396616458892822
Test Loss: 1.5498554706573486
Test Loss: 1.5271481275558472
Test Loss: 1.4464315176010132
Test Loss: 1.4931429624557495
Test Loss: 1.4360525608062744
Test Loss: 1.3095979690551758
Test Loss: 1.4987579584121704
Test Loss: 1.551461935043335
Test Loss: 1.426331877708435
Test Loss: 1.5104517936706543
Test Loss: 1.5458340644836426
Test Loss: 1.4078935384750366
Test Loss: 1.1358911991119385
Test Loss: 1.421

# Inference

In [None]:
# Final-epoch checkpoint, addressed relative to the working directory — the same place
# the training loop saves it — instead of an absolute Colab-only path.
model_path = "./transformer_state_dict_epoch_3"
# map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
state_dict = torch.load(model_path, map_location=device)

# Must match the hyperparameters the checkpoint was trained with.
src_vocab_size = 10_000
tgt_vocab_size = 10_000
d_model = 512
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1
num_epochs = 3

transformer_loaded = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
# The restored model is meant for inference, so switch dropout off.
transformer_loaded.eval()
transformer_loaded.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
def translate(src, model=None):
    """Greedily translate one source-language sentence.

    The sentence is tokenised, wrapped in ``<BOS>``/``<EOS>``, padded to
    ``max_seq_len`` and decoded one token at a time: at step ``i`` the model is run
    on the tokens generated so far and the highest-scoring token at position ``i``
    is appended. Decoding stops at ``<EOS>`` or after ``max_seq_len`` steps.

    Args:
        src: Source sentence as a plain string.
        model: Model to decode with. Defaults to the notebook-level ``transformer``,
            which is what this function always used before.

    Returns:
        The generated tokens joined by spaces, with ``<BOS>``, ``<EOS>`` and
        ``<PAD>`` markers removed.
    """
    if model is None:
        model = transformer

    src_tokens = tokenizer[SRC_LANGUAGE](src)
    tgt_tokens = ["<BOS>"]

    src_ids = [BOS_IDX] + vocab[SRC_LANGUAGE](src_tokens) + [EOS_IDX]
    src_ids = (src_ids + [PAD_IDX] * max_seq_len)[:max_seq_len]
    src_vectors = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Decode deterministically regardless of the mode the model was left in, then put
    # that mode back so a surrounding training run is not disturbed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # inference only: no autograd graph needed
            for i in range(max_seq_len):
                tgt_ids = (vocab[TGT_LANGUAGE](tgt_tokens) + [PAD_IDX] * max_seq_len)[:max_seq_len]
                tgt_vectors = torch.tensor(tgt_ids, dtype=torch.long, device=device).unsqueeze(0)
                output = model(src_vectors, tgt_vectors)
                # argmax over the logits picks the same token as argmax over softmax.
                idx = torch.argmax(output[0, i]).item()
                tgt_tokens.append(vocab[TGT_LANGUAGE].lookup_token(idx))

                if idx == EOS_IDX:
                    break
    finally:
        model.train(was_training)

    return " ".join(tgt_tokens).replace("<BOS>", "").replace("<EOS>", "").replace("<PAD>", "").strip()

In [None]:
translate("Hello, how are you nice to meet you my friend.")

'Hola , como estás bien con mi amigo .'

In [None]:
translate("My name is John.")

'Mi nombre es John .'

In [None]:
translate("I am learning Spanish.")

'Estoy aprendiendo español .'

In [None]:
translate("I eat apples.")

'Como manzanas .'

In [None]:
translate("I have three books and two pens.")

'Tengo tres libros y dos <UNK> .'

In [None]:
translate("Do you work in an office?")

'¿ Sabes trabajar en una oficina ?'

In [None]:
translate("How are you?")

'¿ Cómo estás ?'

In [None]:
eng, spa = test_lines[0].split('\t')
print(eng)
print(spa)
translate(eng)

Tom's flight was delayed.
El vuelo de Tom fue retrasado.


'El vuelo de Tom fue <UNK> .'

In [None]:
# The test split printed above holds 307 pairs, so index 500 is out of range; clamp it
# to the last available pair.
eng, spa = test_lines[min(500, len(test_lines) - 1)].split('\t')
print(eng)
print(spa)
translate(eng)

I can't believe Tom really said no to me.
No puedo creer que Tom verdaderamente me haya dicho no.


'No puedo creer que Tom no me dijo mucho .'

In [None]:
eng, spa = train_lines[1000].split('\t')
print(eng)
print(spa)
translate(eng)

Tom eats breakfast every morning at six o'clock.
Tom desayuna a las seis todas las mañanas.


'Tom come todos los días a las seis .'

In [None]:
# The training split printed above holds 2448 pairs, so index 10000 is out of range;
# clamp it to the last available pair.
eng, spa = train_lines[min(10000, len(train_lines) - 1)].split('\t')
print(eng)
print(spa)
translate(eng)

He can speak five languages.
Él habla cinco lenguas.


'Él habla cinco idiomas .'

# Export model and vocabulary

In [None]:
# Persist both vocabulary objects next to the notebook.
torch.save(vocab[SRC_LANGUAGE], "./vocab-english")
# NOTE(review): TGT_LANGUAGE is "hi" in this notebook, yet the file name says
# "spanish" — confirm the intended name with whatever loads this file.
torch.save(vocab[TGT_LANGUAGE], "./vocab-spanish")

In [None]:
# Persist both tokenizer callables next to the notebook.
torch.save(tokenizer[SRC_LANGUAGE], "./tokenizer-english")
# NOTE(review): TGT_LANGUAGE is "hi" in this notebook, yet the file name says
# "spanish" — confirm the intended name with whatever loads this file.
torch.save(tokenizer[TGT_LANGUAGE], "./tokenizer-spanish")

In [None]:
# Saves the whole `transformer` module object, not just its state_dict (the training
# loop already writes a state_dict checkpoint per epoch).
torch.save(transformer, "./transformer_model")