# Language translator: French to English

Vivek Viswam R. V. <br>
Rachel Messenger

### Imports

In [None]:
import os
import warnings

import torch
import torch.nn as nn
import torch.optim as optim
import spacy

from torch.utils.data import Dataset, DataLoader
from collections import Counter

!python -m pip install torchmetrics
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

from torchmetrics.text import BLEUScore
from itertools import product

warnings.filterwarnings('ignore')

Collecting fr-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencie

In [None]:
# Mount Google Drive inside the Colab runtime; ROOT_PATH (defined under
# "Constants") lives below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Constants

In [None]:
# --- Locations on the mounted drive --------------------------------------
ROOT_PATH = "/content/drive/MyDrive/language-translator"
EN_REL_PATH = "dataset/europarl-v7.fr-en.en"
FR_REL_PATH = "dataset/europarl-v7.fr-en.fr"
TRAIN_STATS_FILENAME = "training_stats.pth"

# Absolute paths are derived from ROOT_PATH in one pass.
EN_PATH, FR_PATH, TRAIN_STATS_PATH = (
  os.path.join(ROOT_PATH, rel)
  for rel in (EN_REL_PATH, FR_REL_PATH, TRAIN_STATS_FILENAME)
)

# --- Special vocabulary entries ------------------------------------------
START_TOKEN, END_TOKEN = "<START>", "<END>"
PAD_TOKEN, UNKN_TOKEN = "<PAD>", "<UNKN>"

# --- Size limits -----------------------------------------------------------
MAX_SENT_LEN = 100               # longest sentence kept, in tokens
MAX_SEQ_LEN = MAX_SENT_LEN + 2   # sentence plus START and END markers
MAX_LINE_COUNT = 200_000

### Custom dataset class definitions

In [None]:
class LangDataset(Dataset):
  """Parallel-text dataset that serves padded (source, target) index tensors.

  Construction runs the whole pipeline eagerly: read both files, lowercase
  and length-filter the sentence pairs, build one vocabulary per language,
  then encode every kept sentence into a fixed-width row of token indices.

  Args:
    src_path: Path of the source-language text file.
    tgt_path: Path of the target-language text file; line i is paired with
      line i of the source file.
    src_tokenizer: Callable mapping a string to a list of source tokens.
    tgt_tokenizer: Callable mapping a string to a list of target tokens.
    min_tok_freq: Tokens counted fewer than this many times are encoded as
      UNKN_TOKEN.
    max_lines: If truthy, only the first max_lines lines of each file are kept.
    max_sent_len: Pairs where either side has more tokens than this are dropped.
  """

  START_TOKEN = START_TOKEN
  END_TOKEN = END_TOKEN
  PAD_TOKEN = PAD_TOKEN
  UNKN_TOKEN = UNKN_TOKEN
  SPECIAL_TOKENS = ( START_TOKEN, END_TOKEN, PAD_TOKEN, UNKN_TOKEN )

  def __init__(self, src_path, tgt_path, src_tokenizer, tgt_tokenizer, min_tok_freq=2, max_lines=None, max_sent_len=MAX_SENT_LEN):
    self.src = None
    self.tgt = None

    self.src_path = src_path
    self.tgt_path = tgt_path
    self.max_sent_len = max_sent_len
    # Every encoded row also carries a START and an END marker.
    self.max_seq_len = max_sent_len + 2
    self.min_tok_freq = min_tok_freq

    self.src_lines = []
    self.tgt_lines = []
    self.src_idx2tok = {}
    self.src_tok2idx = {}
    self.tgt_idx2tok = {}
    self.tgt_tok2idx = {}
    self.src_vocab_size = 0
    self.tgt_vocab_size = 0

    self._src_tokenizer = src_tokenizer
    self._tgt_tokenizer = tgt_tokenizer
    self._max_lines = max_lines

    self._load_dataset()
    self._preprocess_dataset()
    self._create_vocab()
    self._tensorify_dataset()

  def tokenize_sentence(self, sentence, lang_type="src"):
    """Encode one sentence as a list of vocabulary indices.

    The sentence is tokenized, rare tokens (count below min_tok_freq) become
    UNKN_TOKEN, START/END markers are added and the result is right-padded
    with PAD_TOKEN up to max_seq_len. A sentence that is already longer than
    max_seq_len is returned unpadded and untruncated.

    Args:
      sentence: Text to encode; it is NOT lowercased here.
      lang_type: "src" selects the source tokenizer/vocabulary, anything else
        selects the target side.

    Returns:
      List of integer token indices.
    """
    is_src = lang_type == "src"
    tokenize = self._src_tokenizer if is_src else self._tgt_tokenizer
    tok2idx = self.src_tok2idx if is_src else self.tgt_tok2idx
    counter = self.src_tok_ctr if is_src else self.tgt_tok_ctr
    unkn_idx = tok2idx[self.UNKN_TOKEN]

    tokens = [
      self.UNKN_TOKEN if counter[tok] < self.min_tok_freq else tok
      for tok in tokenize(sentence)
    ]
    tokens = [ self.START_TOKEN ] + tokens + [ self.END_TOKEN ]
    # A negative pad count multiplies to an empty list, i.e. no padding.
    tokens += [ self.PAD_TOKEN ] * (self.max_seq_len - len(tokens))

    # .get() keeps unseen tokens from raising KeyError when min_tok_freq <= 0
    # lets them through the rare-token filter above.
    return [ tok2idx.get(tok, unkn_idx) for tok in tokens ]

  def detokenize_tensor(self, tensor, lang_type="src"):
    """Map a 1-D tensor of indices back to its list of token strings."""
    idx2tok = self.src_idx2tok if lang_type == "src" else self.tgt_idx2tok
    return [ idx2tok[idx.item()] for idx in tensor ]

  def _load_dataset(self):
    """Read both text files and keep the first max_lines lines of each."""
    print("Reading source and target files...")

    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # UTF-8 and would corrupt or reject accented characters.
    with open(self.src_path, encoding="utf-8") as src_f, open(self.tgt_path, encoding="utf-8") as tgt_f:
      src_lines = src_f.read().splitlines()
      tgt_lines = tgt_f.read().splitlines()

    self.src_lines = src_lines[:self._max_lines] if self._max_lines else src_lines
    self.tgt_lines = tgt_lines[:self._max_lines] if self._max_lines else tgt_lines

    print(f"Reading source files! Read {len(src_lines)} lines. Keeping {len(self.src_lines)} lines.")

  def _preprocess_dataset(self):
    """Lowercase every pair, drop over-long pairs and count token frequencies."""
    print("Preprocessing dataset...")

    final_src_lines = []
    final_tgt_lines = []
    total_lines = len(self.src_lines)

    self.src_tok_ctr = Counter()
    self.tgt_tok_ctr = Counter()

    self._src_tok_set = set()
    self._tgt_tok_set = set()

    for raw_src, raw_tgt in zip(self.src_lines, self.tgt_lines):
      src_line = raw_src.lower()
      tgt_line = raw_tgt.lower()

      src_line_toks = self._src_tokenizer(src_line)
      tgt_line_toks = self._tgt_tokenizer(tgt_line)

      # A pair is kept only when BOTH sides fit within max_sent_len tokens.
      if len(src_line_toks) > self.max_sent_len or len(tgt_line_toks) > self.max_sent_len:
        continue

      final_src_lines.append(src_line)
      final_tgt_lines.append(tgt_line)

      self._src_tok_set.update(src_line_toks)
      self._tgt_tok_set.update(tgt_line_toks)

      self.src_tok_ctr.update(src_line_toks)
      self.tgt_tok_ctr.update(tgt_line_toks)

    self.src_lines, self.tgt_lines = final_src_lines, final_tgt_lines

    print(f"Preprocessing completed! Dropped {total_lines - len(self.src_lines)}/{total_lines} lines!")

  def _create_vocab(self):
    """Build the token<->index tables for both languages.

    Special tokens take the first indices. The remaining tokens are sorted so
    that index assignment is identical on every run; iterating a set of
    strings directly changes order between interpreter sessions, which would
    silently invalidate any model checkpoint trained in an earlier session.
    """
    print("Creating vocabularies...")

    src_tokens = list(self.SPECIAL_TOKENS) + sorted(self._src_tok_set)
    tgt_tokens = list(self.SPECIAL_TOKENS) + sorted(self._tgt_tok_set)

    # The temporary sets are only needed to build the vocabularies.
    del self._src_tok_set
    del self._tgt_tok_set

    for idx, tok in enumerate(src_tokens):
      self.src_tok2idx[tok] = idx
      self.src_idx2tok[idx] = tok

    for idx, tok in enumerate(tgt_tokens):
      self.tgt_tok2idx[tok] = idx
      self.tgt_idx2tok[idx] = tok

    self.src_vocab_size = len(self.src_tok2idx)
    self.tgt_vocab_size = len(self.tgt_tok2idx)

    print(f"Created vocabularies! Source vocab size: {self.src_vocab_size} and target vocab size: {self.tgt_vocab_size}")

  def _tensorify_dataset(self):
    """Encode every kept sentence and stack the rows into two tensors."""
    print("Tensorifying dataset...")

    final_src_lines = [ self.tokenize_sentence(line, "src") for line in self.src_lines ]
    final_tgt_lines = [ self.tokenize_sentence(line, "tgt") for line in self.tgt_lines ]

    self.src = torch.tensor(final_src_lines)
    self.tgt = torch.tensor(final_tgt_lines)

    print("Dataset tensorified!")

  def __len__(self):
    """Number of sentence pairs kept after preprocessing."""
    return len(self.src)

  def __getitem__(self, idx):
    """Return the (source, target) index tensors stored at position idx."""
    return self.src[idx], self.tgt[idx]

### Class definitions

In [None]:
class Transformer(nn.Module):
  """Sequence-to-sequence model: learned embeddings around nn.Transformer.

  Token and learned position embeddings are summed for each side, passed
  through nn.Transformer, and projected to target-vocabulary logits.
  Inputs to forward() are laid out as (seq_len, batch).

  Args:
    embedding_size: Width of the embeddings and of the transformer model.
    num_heads: Number of attention heads.
    num_encoder_layers: Number of encoder layers.
    num_decoder_layers: Number of decoder layers.
    dim_feedforward: Hidden width of the feed-forward sublayers.
    dropout: Dropout probability, used on the embeddings and inside
      nn.Transformer.
    src_voc_size: Number of entries in the source vocabulary.
    tgt_voc_size: Number of entries in the target vocabulary.
    src_pad_idx: Index of the padding token in the source vocabulary.
    tgt_pad_idx: Index of the padding token in the target vocabulary.
    device: Stored on the instance as self.device.
    max_sent_len: Longest sentence supported; the position tables hold
      max_sent_len + 2 entries.
  """

  def __init__(self,
               embedding_size,
               num_heads,
               num_encoder_layers,
               num_decoder_layers,
               dim_feedforward,
               dropout,
               src_voc_size,
               tgt_voc_size,
               src_pad_idx,
               tgt_pad_idx,
               device,
               max_sent_len=MAX_SENT_LEN,
    ):

    super(Transformer, self).__init__()

    self.device = device
    self.src_pad_idx = src_pad_idx
    self.tgt_pad_idx = tgt_pad_idx
    self.max_seq_len = max_sent_len + 2

    self.dropout = nn.Dropout(dropout)
    self.src_word_embedding = nn.Embedding(src_voc_size, embedding_size)
    self.src_position_embedding = nn.Embedding(self.max_seq_len, embedding_size)
    self.tgt_word_embedding = nn.Embedding(tgt_voc_size, embedding_size)
    self.tgt_position_embedding = nn.Embedding(self.max_seq_len, embedding_size)
    self.transformer = nn.Transformer(
        embedding_size,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        dropout
    )

    self.fc_out = nn.Linear(embedding_size, tgt_voc_size)

  def create_padding_mask(self, seq, pad_idx):
    """Return a (batch, seq_len) boolean mask that is True at padding."""
    return seq.transpose(0, 1) == pad_idx

  def forward(self, src, tgt):
    """Compute target-vocabulary logits for every target position.

    Args:
      src: Source token indices, shape (src_seq_len, batch).
      tgt: Target token indices fed to the decoder, shape (tgt_seq_len, batch).

    Returns:
      The fc_out projection of the nn.Transformer output.
    """
    src_seq_len, batch_size = src.shape
    tgt_seq_len, _ = tgt.shape

    # Helper tensors are created directly on the inputs' device, so the
    # module keeps working even if it is moved after construction.
    src_positions = (
        torch.arange(src_seq_len, device=src.device)
        .unsqueeze(1)
        .expand(src_seq_len, batch_size)
    )
    tgt_positions = (
        torch.arange(tgt_seq_len, device=tgt.device)
        .unsqueeze(1)
        .expand(tgt_seq_len, batch_size)
    )

    emb_src = self.dropout(
        self.src_word_embedding(src) + self.src_position_embedding(src_positions)
    )
    emb_tgt = self.dropout(
        self.tgt_word_embedding(tgt) + self.tgt_position_embedding(tgt_positions)
    )

    src_padding_mask = self.create_padding_mask(src, self.src_pad_idx)
    tgt_padding_mask = self.create_padding_mask(tgt, self.tgt_pad_idx)
    # Causal mask: position i may only attend to positions <= i.
    tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_seq_len).to(tgt.device)

    out = self.transformer(
        emb_src,
        emb_tgt,
        tgt_mask=tgt_mask,
        src_key_padding_mask=src_padding_mask,
        tgt_key_padding_mask=tgt_padding_mask,
        # Without this the decoder's cross-attention also attends to the
        # encoder outputs produced at padded source positions.
        memory_key_padding_mask=src_padding_mask
    )

    return self.fc_out(out)

In [None]:
def batch_translate_tensor(model, batch_data, max_length=MAX_SEQ_LEN, detokenize=True):
  """Greedy-decode a batch of encoded source sentences.

  Decoding starts from START_TOKEN for every row and appends the arg-max
  token one step at a time. Rows that already produced END_TOKEN keep
  receiving END_TOKEN; the loop stops early once every row has finished.
  The model is switched to eval mode as a side effect.

  Args:
    model: Callable as model(src, tgt) with (seq_len, batch) inputs,
      returning logits indexed as [position, batch, vocab].
    batch_data: Source indices, shape (batch, seq_len).
    max_length: Upper bound on the decoded length, START included.
    detokenize: When False, return the raw index tensor instead of strings.

  Returns:
    A (batch, decoded_len) index tensor if detokenize is False; otherwise a
    list with one space-joined string per row, cut at the first END_TOKEN.
  """
  data_len = batch_data.shape[0]

  # Both markers must come from the TARGET vocabulary: the decoder emits
  # target indices, and the two vocabularies are built independently.
  start_idx = dataset.tgt_tok2idx[START_TOKEN]
  end_idx = dataset.tgt_tok2idx[END_TOKEN]

  model.eval()

  src = batch_data.T.to(device)
  tgt = torch.full(
      (1, data_len),
      start_idx,
      dtype=torch.long,
      device=device
  )

  finished = torch.zeros(data_len, dtype=torch.bool, device=device)

  with torch.no_grad():
    for _ in range(max_length - 1):
      output = model(src, tgt)

      # Only the logits of the newest position decide the next token.
      next_tokens = output[-1, :, :].argmax(dim=1)
      next_tokens = next_tokens.masked_fill(finished, end_idx)
      tgt = torch.cat([tgt, next_tokens.unsqueeze(0)], dim=0)

      finished |= (next_tokens == end_idx)
      if finished.all():
        break

  if not detokenize:
    return tgt.T

  translations = []
  for seq in tgt.T.tolist():
    words = []
    for idx in seq[1:]:  # skip the leading START marker
      if idx == end_idx:
        break
      words.append(dataset.tgt_idx2tok[idx])
    translations.append(" ".join(words))

  return translations

def en_token_rejoin(tokens):
  """Join English tokens into a sentence, attaching punctuation to the left.

  A token that starts with one of ' . , ! ? : ; is glued onto the previous
  word; all other tokens are separated by a single space. Empty tokens are
  skipped, and the result has no leading or trailing whitespace.

  Args:
    tokens: Iterable of token strings.

  Returns:
    The reassembled sentence ("" for no tokens).
  """
  attach_left = ("'", ".", ",", "!", "?", ":", ";")
  pieces = []

  for token in tokens:
    if not token:
      # Nothing to add; indexing token[0] would raise IndexError.
      continue

    if pieces and token.startswith(attach_left):
      pieces[-1] += token
    else:
      pieces.append(token)

  # Joining whole pieces puts a space after attached punctuation, so the
  # following word is no longer glued to it.
  return " ".join(pieces)

def translate_sentence(model, sentence, max_length=MAX_SENT_LEN):
  """Greedy-translate one raw source sentence into a target string.

  The sentence goes through the same preprocessing the dataset applies when
  it is built: lowercase first, tokenize with tokenize_fr, and replace
  tokens seen fewer than dataset.min_tok_freq times with UNKN_TOKEN (such
  tokens have a vocabulary slot but were always encoded as UNKN_TOKEN during
  training). The model is switched to eval mode as a side effect.

  Args:
    model: Callable as model(src, tgt) with (seq_len, 1) index tensors.
    sentence: Raw source-language text.
    max_length: Maximum number of tokens to generate.

  Returns:
    The generated tokens rejoined by en_token_rejoin.
  """
  unkn_idx = dataset.src_tok2idx[UNKN_TOKEN]
  end_idx = dataset.tgt_tok2idx[END_TOKEN]

  tokens = tokenize_fr(sentence.lower())
  tokens = [
      UNKN_TOKEN if dataset.src_tok_ctr[token] < dataset.min_tok_freq else token
      for token in tokens
  ]
  tokens = [START_TOKEN] + tokens + [END_TOKEN]

  text_to_indices = [dataset.src_tok2idx.get(token, unkn_idx) for token in tokens]
  sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

  # Dropout must be disabled for deterministic inference.
  model.eval()

  outputs = [dataset.tgt_tok2idx[START_TOKEN]]
  with torch.no_grad():
    for _ in range(max_length):
      tgt_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
      output = model(sentence_tensor, tgt_tensor)

      # Arg-max over the vocabulary at the newest target position.
      best_guess = output.argmax(2)[-1, :].item()
      if best_guess == end_idx:
        break

      outputs.append(best_guess)

  translated_sentence = [dataset.tgt_idx2tok[idx] for idx in outputs]

  # Drop the leading START marker before rebuilding the text.
  return en_token_rejoin(translated_sentence[1:])

def bleu_score(model, test_iter, max_count=None, verbose=True):
  """Compute the BLEU score of greedy translations over test_iter.

  Args:
    model: Model handed to batch_translate_tensor.
    test_iter: Iterable of (source, target) index-tensor batches.
    max_count: If truthy, stop after at least this many sentences.
    verbose: Print the sentence count and the score when True.

  Returns:
    The BLEU score as a Python float.
  """
  markers = [START_TOKEN, END_TOKEN, PAD_TOKEN]
  references = []
  hypotheses = []
  seen = 0

  for x, y in test_iter:
    src = x.to(device)
    hypotheses.extend(batch_translate_tensor(model, src))

    for row in y[:, :-1]:
      words = []
      for tok in row:
        word = dataset.tgt_idx2tok[tok.item()]
        if word not in markers:
          words.append(word)
      # Each hypothesis is scored against a single-reference list.
      references.append([" ".join(words)])

    seen += src.shape[0]
    if max_count and seen >= max_count:
      break

  metric = BLEUScore().to(device)
  score = metric(hypotheses, references)

  if verbose:
    print(f"Total tested datapoints: {seen}")
    print(f"BLEU score: {score * 100:.2f}%")

  return score.item()

### Misc definitions

In [None]:
# Prefer the GPU when one is visible to torch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

spacy_fr = spacy.load("fr_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")


def tokenize_fr(text):
  """Split French text into stripped, non-empty token strings."""
  stripped = (tok.text.strip() for tok in spacy_fr.tokenizer(text))
  return [ piece for piece in stripped if piece ]


def tokenize_en(text):
  """Split English text into stripped, non-empty token strings."""
  stripped = (tok.text.strip() for tok in spacy_en.tokenizer(text))
  return [ piece for piece in stripped if piece ]

### Loading dataset and vocabularies

In [None]:
batch_size = 1024
train_test_split = [0.9, 0.1]

# French is the source language and English the target.
# NOTE(review): max_lines is hard-coded to 20000 while MAX_LINE_COUNT is
# 200000 and is what the training cell records as "datapoints" - confirm
# which value is intended.
dataset = LangDataset(FR_PATH, EN_PATH, tokenize_fr, tokenize_en, max_lines=20000)

# A fixed seed keeps the train/test partition identical across sessions;
# otherwise a checkpoint reloaded later is evaluated on a "test" set that
# contains sentences it was trained on.
split_generator = torch.Generator().manual_seed(42)
train_data, test_data = torch.utils.data.random_split(
  dataset, train_test_split, generator=split_generator
)
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False)

Reading source and target files...
Reading source files! Read 2007723 lines. Keeping 20000 lines.
Preprocessing dataset...
Preprocessing completed! Dropped 156/20000 lines!
Creating vocabularies...
Created vocabularies! Source vocab size: 20151 and target vocab size: 14531
Tensorifying dataset...
Dataset tensorified!


### Hyperparameter tuning

In [None]:
def train(train_iter, val_iter, dataset, device, config):
  """Train a fresh Transformer for one configuration and score it with BLEU.

  Args:
    train_iter: Iterable of (source, target) batches shaped (batch, seq_len).
    val_iter: Iterable of batches used for the final BLEU evaluation.
    dataset: Provides the vocabularies and their sizes.
    device: Device the model and the batches are moved to.
    config: Dict with keys "embedding_size", "num_heads",
      "num_encoder_layers", "num_decoder_layers", "dropout",
      "learning_rate" and "num_epochs".

  Returns:
    Tuple (bleu, model_state_dict, loss_history, acc_history); the two
    histories hold one averaged value per epoch.
  """
  embedding_size = config["embedding_size"]
  num_heads = config["num_heads"]
  num_encoder_layers = config["num_encoder_layers"]
  num_decoder_layers = config["num_decoder_layers"]
  dropout = config["dropout"]
  learning_rate = config["learning_rate"]
  num_epochs = config["num_epochs"]

  log_per = 5
  dim_feedforward = 4 * embedding_size

  src_pad_idx = dataset.src_tok2idx[PAD_TOKEN]
  tgt_pad_idx = dataset.tgt_tok2idx[PAD_TOKEN]
  tgt_end_idx = dataset.tgt_tok2idx[END_TOKEN]

  model = Transformer(
    embedding_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dim_feedforward,
    dropout,
    dataset.src_vocab_size,
    dataset.tgt_vocab_size,
    src_pad_idx,
    tgt_pad_idx,
    device
  ).to(device)

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  # Padding positions contribute nothing to the loss.
  criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)

  loss_history = []
  acc_history = []

  for epoch in range(num_epochs):
    batch_loss = []
    batch_acc = []

    model.train()
    for src, target in train_iter:
      # The model expects (seq_len, batch); the loader yields (batch, seq_len).
      src = src.to(device).T
      target = target.to(device).T

      # Teacher forcing: feed tokens [0, n-1), predict tokens [1, n).
      target_input  = target[:-1, :]
      target_output = target[1:, :]

      output = model(src, target_input)
      pred = output.argmax(2)

      # Accuracy is measured on real words only (END and PAD excluded).
      mask = (
        (target_output != tgt_end_idx) &
        (target_output != tgt_pad_idx)
      )

      correct  = (pred == target_output) & mask
      accuracy = correct.sum() / mask.sum()
      batch_acc.append(accuracy.item())

      output_flat = output.reshape(-1, output.shape[2])
      target_flat = target_output.reshape(-1)

      optimizer.zero_grad()
      loss = criterion(output_flat, target_flat)
      loss.backward()
      batch_loss.append(loss.item())

      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
      optimizer.step()

    avg_acc  = sum(batch_acc) / len(batch_acc)
    avg_loss = sum(batch_loss) / len(batch_loss)
    acc_history.append(avg_acc)
    loss_history.append(avg_loss)

    if epoch % log_per == 0:
      print(f"Epoch {epoch + 1}/{num_epochs} - loss: {avg_loss:.4f} - train acc: {avg_acc * 100:.2f}%")

  # Evaluate on the iterator the caller passed in, not on a global.
  test_bleu = bleu_score(model, val_iter, verbose=False)

  # With num_epochs == 0 there is no epoch summary to report.
  if loss_history:
    print(f"Epoch {num_epochs}/{num_epochs} - loss: {loss_history[-1]:.4f} - train acc: {acc_history[-1] * 100:.2f}%")
  print(f"Test BLEU: {test_bleu * 100:.2f}%")

  return test_bleu, model.state_dict(), loss_history, acc_history

In [None]:
# Search space for the grid search: every key maps to its candidate values.
param_grid = dict(
  embedding_size=[256, 512],
  num_heads=[4, 8],
  num_encoder_layers=[2, 3],
  num_decoder_layers=[2, 3],
  dropout=[0.1, 0.2],
  learning_rate=[3e-4],
  num_epochs=[20],
)

def param_combinations(grid):
  """Yield one dict per point of the Cartesian product of grid's values.

  Keys keep the order they have in grid; the last key varies fastest.
  """
  names = tuple(grid)
  yield from (dict(zip(names, combo)) for combo in product(*grid.values()))

# Exhaustive grid search: train every configuration, remember the best BLEU.
best_bleu = -1.0
best_config = None
best_state_dict = None

for cfg in param_combinations(param_grid):
  print("\n==============================")
  print("Config:", cfg)
  print("==============================")

  test_bleu, state_dict, loss_hist, acc_hist = train(
    train_iter, test_iter, dataset, device, cfg
  )

  # Strictly-greater comparison: on a tie the earlier configuration wins.
  if not test_bleu > best_bleu:
    continue

  best_bleu, best_config, best_state_dict = test_bleu, cfg, state_dict

print("\nBest config found:")
print(best_config)
print(f"Best test BLEU: {best_bleu * 100:.2f}%")


Config: {'embedding_size': 256, 'num_heads': 4, 'num_encoder_layers': 2, 'num_decoder_layers': 2, 'dropout': 0.1, 'learning_rate': 0.0003, 'num_epochs': 20}
Epoch 1/20 - loss: 8.3006 - train acc: 6.20%
Epoch 6/20 - loss: 5.4595 - train acc: 16.15%
Epoch 11/20 - loss: 4.9353 - train acc: 20.70%
Epoch 16/20 - loss: 4.5932 - train acc: 24.12%
Epoch 20/20 - loss: 4.3725 - train acc: 26.21%
Test BLEU: 4.59%

Config: {'embedding_size': 256, 'num_heads': 4, 'num_encoder_layers': 2, 'num_decoder_layers': 2, 'dropout': 0.2, 'learning_rate': 0.0003, 'num_epochs': 20}
Epoch 1/20 - loss: 8.3917 - train acc: 5.89%
Epoch 6/20 - loss: 5.6242 - train acc: 13.95%
Epoch 11/20 - loss: 5.1235 - train acc: 18.26%
Epoch 16/20 - loss: 4.8190 - train acc: 21.01%
Epoch 20/20 - loss: 4.6310 - train acc: 22.72%
Test BLEU: 3.68%

Config: {'embedding_size': 256, 'num_heads': 4, 'num_encoder_layers': 2, 'num_decoder_layers': 3, 'dropout': 0.1, 'learning_rate': 0.0003, 'num_epochs': 20}
Epoch 1/20 - loss: 8.1889 - 

In [None]:
# Restore the model saved by the training cell.
# map_location lets a checkpoint written on a GPU runtime load on a
# CPU-only runtime (and vice versa).
saved_data = torch.load(TRAIN_STATS_PATH, map_location=device)
model_state_dict = saved_data["model_state_dict"]

# These hyperparameters must match the ones the checkpoint was trained
# with, otherwise load_state_dict fails on mismatched tensor shapes.
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
dim_feedforward = 4 * embedding_size
src_pad_idx = dataset.src_tok2idx[PAD_TOKEN]
tgt_pad_idx = dataset.tgt_tok2idx[PAD_TOKEN]

model = Transformer(
    embedding_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dim_feedforward,
    dropout,
    dataset.src_vocab_size,
    dataset.tgt_vocab_size,
    src_pad_idx,
    tgt_pad_idx,
    device
).to(device)

model.load_state_dict(model_state_dict)
model.eval()

In [None]:
# Full training run with a fixed configuration; a checkpoint is written to
# TRAIN_STATS_PATH after every epoch.
num_epochs = 100
learning_rate = 3e-4

embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
dim_feedforward = 4 * embedding_size
src_pad_idx = dataset.src_tok2idx[PAD_TOKEN]
tgt_pad_idx = dataset.tgt_tok2idx[PAD_TOKEN]
tgt_start_idx = dataset.tgt_tok2idx[START_TOKEN]
tgt_end_idx = dataset.tgt_tok2idx[END_TOKEN]

model = Transformer(
    embedding_size,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dim_feedforward,
    dropout,
    dataset.src_vocab_size,
    dataset.tgt_vocab_size,
    src_pad_idx,
    tgt_pad_idx,
    device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)

loss_history = []
acc_history = []
val_history = []

for epoch in range(num_epochs):
  batch_loss = []
  batch_acc = []

  model.train()
  # Train on the training split; the original iterated `val_iter`, which is
  # not defined anywhere in this notebook.
  for src, target in train_iter:
    # The model expects (seq_len, batch); the loader yields (batch, seq_len).
    src = src.to(device).T
    target = target.to(device).T
    # Teacher forcing: feed tokens [0, n-1), predict tokens [1, n).
    target_input = target[:-1, :]
    target_output = target[1:, :]

    output = model(src, target_input)

    pred = output.argmax(2)
    # Accuracy is measured on real words only (END and PAD excluded).
    mask = (target_output != tgt_end_idx) & (target_output != tgt_pad_idx)
    correct = (pred == target_output) & mask
    accuracy = correct.sum() / mask.sum()
    # Store a Python float rather than a device tensor.
    batch_acc.append(accuracy.item())

    output = output.reshape(-1, output.shape[2])
    target = target_output.reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)
    loss.backward()
    batch_loss.append(loss.item())

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()

  avg_batch_acc = float(sum(batch_acc) / len(batch_acc))
  avg_batch_loss = sum(batch_loss) / len(batch_loss)
  acc_history.append(avg_batch_acc)
  loss_history.append(avg_batch_loss)

  # Overwrite the checkpoint each epoch so an interrupted run can resume.
  torch.save({
      "loss_history": loss_history,
      "acc_history": acc_history,
      "epoch": epoch,
      "datapoints": MAX_LINE_COUNT,
      "model_state_dict": model.state_dict(),
      "optimizer_state_dict": optimizer.state_dict(),
      }, TRAIN_STATS_PATH)

  print(f"Epoch {epoch + 1} / {num_epochs} - loss: {avg_batch_loss:.4f} - train acc: {avg_batch_acc * 100:.2f}%")

In [None]:
# NOTE(review): this cell looks like a leftover slice of a per-epoch loop
# body: everything after the first line is indented, and it reads names
# (validation_epoch, val_iter, batch_acc, batch_loss) that it does not
# define. It cannot run on its own - TODO confirm and either fold it back
# into the training loop or delete it.
should_validate = epoch % validation_epoch == 0
  if should_validate:
    val_cnt = 0
    avg_val_acc = 0
    for src, target in val_iter:
      src = src.to(device)
      target = target.to(device)

      # Greedy-decode to raw indices, then right-pad with tgt_pad_idx up to
      # MAX_SEQ_LEN columns so pred can be compared to target elementwise.
      pred = batch_translate_tensor(model, src, detokenize=False)
      pad_len = MAX_SEQ_LEN - pred.shape[1]
      padding = torch.full(
          (target.shape[0], pad_len),
          tgt_pad_idx,
          device=device
          )
      pred = torch.hstack([pred, padding])
      # Only positions holding real words count (START, END and PAD excluded).
      mask = (target != tgt_start_idx) & (target != tgt_end_idx) & (target != tgt_pad_idx)
      correct = (pred == target) & mask
      accuracy = correct.sum()
      avg_val_acc += accuracy.item()
      val_cnt += mask.sum()

    # Matched positions divided by counted positions over all batches.
    avg_val_acc /= val_cnt
    val_history.append(avg_val_acc)

  avg_batch_acc = float(sum(batch_acc) / len(batch_acc))
  avg_batch_loss = sum(batch_loss) / len(batch_loss)
  acc_history.append(avg_batch_acc)
  loss_history.append(avg_batch_loss)

  # Same checkpoint layout as the training cell, plus "val_history".
  torch.save({
      "loss_history": loss_history,
      "acc_history": acc_history,
      "val_history": val_history,
      "epoch": epoch,
      "datapoints": MAX_LINE_COUNT,
      "model_state_dict": model.state_dict(),
      "optimizer_state_dict": optimizer.state_dict(),
      }, TRAIN_STATS_PATH)

  print_statement = f"Epoch {epoch + 1} / {num_epochs} - loss: {avg_batch_loss:.4f} - train acc: {avg_batch_acc * 100:.2f}%"

  # Validation accuracy is appended only on epochs where it was computed.
  print(print_statement, f"val acc: {avg_val_acc * 100:.2f}%") if should_validate else print(print_statement)

In [None]:
# Translate one hand-written French sentence and show it next to the
# expected English text.
input_sent = "je suis Vivek"
# The reference must be the real translation of input_sent; the previous
# text ("I am in the process of") did not correspond to this sentence.
actual_translation = "I am Vivek"
model.eval()
prediction = translate_sentence(model, input_sent)
print(f"Input sentence: {input_sent}")
print(f"Translated sentence: {prediction}")
print(f"Actual translation: {actual_translation}")

In [None]:
# Score the model on the held-out split; nn.Module.eval() returns the
# module itself, so it can be switched to eval mode inline.
bleu_score(model.eval(), test_iter)