# Language translator: English to French

Vivek Viswam R. V. <br>
Rachel Messenger

### Imports

In [None]:
import os
import warnings

import torch
import torch.nn as nn
import torch.optim as optim
import spacy

from torch.utils.data import Dataset, DataLoader
from collections import Counter

!python -m pip install torchmetrics
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

from torchmetrics.text import BLEUScore


warnings.filterwarnings('ignore')

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.3/16.3 MB 127.2 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 139.5 MB/s eta 0:00:00
✔ Download and inst

### Constants

In [None]:
# Locations of the parallel corpus and of the training checkpoint, all
# resolved relative to ROOT_PATH.
ROOT_PATH = "/content/drive/MyDrive/language-translator"
EN_REL_PATH = "dataset/europarl-v7.fr-en.en"
FR_REL_PATH = "dataset/europarl-v7.fr-en.fr"
TRAIN_STATS_FILENAME = "training_stats.pth"

EN_PATH = os.path.join(ROOT_PATH, EN_REL_PATH)
FR_PATH = os.path.join(ROOT_PATH, FR_REL_PATH)
TRAIN_STATS_PATH = os.path.join(ROOT_PATH, TRAIN_STATS_FILENAME)


# Special vocabulary tokens.
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PAD_TOKEN = "<PAD>"
# NOTE(review): UNK_TOKEN is defined but not referenced by the visible cells.
UNK_TOKEN = "<UNK>"

# Longest sentence (in tokens) kept by preprocessing.
MAX_SENT_LEN = 100
# An encoded sequence is START + sentence tokens + END, hence the extra two
# positions; derived so the two limits cannot drift apart.
MAX_SEQ_LEN = MAX_SENT_LEN + 2
# Number of corpus lines read from each language file.
MAX_LINE_COUNT = 200000

### Misc definitions

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# spaCy pipelines for each language; the tokenize_* helpers use their tokenizers.
spacy_fr = spacy.load("fr_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

### Function definitions

In [None]:
def tokenize_fr(text):
  """Split French ``text`` into tokens with the spaCy French tokenizer.

  Each token's text is stripped of surrounding whitespace and tokens that
  become empty are dropped.

  Returns:
    A list of non-empty token strings, in order of appearance.
  """
  stripped = (tok.text.strip() for tok in spacy_fr.tokenizer(text))
  return [piece for piece in stripped if piece]


def tokenize_en(text):
  """Split English ``text`` into tokens with the spaCy English tokenizer.

  Each token's text is stripped of surrounding whitespace and tokens that
  become empty are dropped.

  Returns:
    A list of non-empty token strings, in order of appearance.
  """
  stripped = (tok.text.strip() for tok in spacy_en.tokenizer(text))
  return [piece for piece in stripped if piece]

def load_datasets(en_path=EN_PATH, fr_path=FR_PATH):
  """Read both halves of the parallel corpus into memory.

  Args:
    en_path: Path of the English corpus file.
    fr_path: Path of the French corpus file.

  Returns:
    A ``(french_text, english_text)`` tuple with each file's full contents.
    Note the order: French first, although the English path is the first
    parameter.

  Raises:
    OSError: If either file cannot be opened.
  """
  # Honour the caller-supplied paths, and pin the encoding so the result does
  # not depend on the platform's default locale.
  with open(en_path, encoding="utf-8") as en_f, open(fr_path, encoding="utf-8") as fr_f:
    return fr_f.read(), en_f.read()

def preprocess(fr_text, en_text, max_line_count=MAX_LINE_COUNT, max_sent_len=MAX_SENT_LEN):
  """Select aligned sentence pairs that are short enough and lowercase them.

  Line ``i`` of ``fr_text`` is paired with line ``i`` of ``en_text``. A pair
  is kept only when both sentences have at most ``max_sent_len`` tokens.

  Args:
    fr_text: Raw French corpus, one sentence per line.
    en_text: Raw English corpus, one sentence per line.
    max_line_count: Only the first ``max_line_count`` lines of each text are
      considered.
    max_sent_len: Maximum number of tokens allowed on either side of a pair.

  Returns:
    A ``(processed_fr_text, processed_en_text)`` tuple of newline-joined,
    stripped, lowercased text.
  """
  fr_lines = fr_text.splitlines()[:max_line_count]
  en_lines = en_text.splitlines()[:max_line_count]

  kept_fr = []
  kept_en = []

  # zip() stops at the shorter side, so corpora whose line counts differ are
  # paired up to the common length instead of raising IndexError.
  for fr_line, en_line in zip(fr_lines, en_lines):
    if len(tokenize_fr(fr_line)) > max_sent_len:
      continue
    if len(tokenize_en(en_line)) > max_sent_len:
      continue
    kept_fr.append(fr_line)
    kept_en.append(en_line)

  # NOTE(review): strip() trims leading/trailing blank lines independently for
  # each language, which could shift the alignment if only one side begins or
  # ends with an empty line — confirm against the corpus.
  processed_fr_text = "\n".join(kept_fr).strip().lower()
  processed_en_text = "\n".join(kept_en).strip().lower()

  return processed_fr_text, processed_en_text

def create_vocabulary(text, tokenizer, min_freq=2):
  """Build token<->index lookup tables for ``text``.

  The three special tokens (START, END, PAD) take the first indices, followed
  by every distinct token produced by ``tokenizer(text)`` in order of first
  appearance.

  Args:
    text: Text to derive the vocabulary from.
    tokenizer: Callable mapping a string to a list of token strings.
    min_freq: Accepted for interface compatibility; no frequency filtering is
      applied, so every token is kept regardless of this value.

  Returns:
    A ``(tok2idx, idx2tok)`` tuple of dictionaries.
  """
  # dict.fromkeys keeps the first occurrence of each token, in order.
  distinct_tokens = list(dict.fromkeys(tokenizer(text)))
  vocabulary = [START_TOKEN, END_TOKEN, PAD_TOKEN, *distinct_tokens]

  idx2tok = dict(enumerate(vocabulary))
  tok2idx = {token: index for index, token in idx2tok.items()}

  return tok2idx, idx2tok

def tensorify_dataset(processed_text, lang_tok2idx, tokenizer):
  """Encode each line of ``processed_text`` as a fixed-length row of token ids.

  Every line is tokenized, wrapped in START/END tokens and right-padded with
  PAD to exactly ``MAX_SEQ_LEN`` entries.

  Args:
    processed_text: Newline-separated sentences.
    lang_tok2idx: Token-to-index mapping for the language of the text.
    tokenizer: Callable mapping a string to a list of token strings.

  Returns:
    An integer tensor with one row of ``MAX_SEQ_LEN`` ids per line.

  Raises:
    KeyError: If a token is missing from ``lang_tok2idx``.
  """
  # Leave room for the START and END markers.
  max_content_len = MAX_SEQ_LEN - 2

  rows = []
  for line in processed_text.splitlines():
    # Truncate over-long sentences so all rows have the same length;
    # ragged rows cannot be converted into a single tensor.
    content = tokenizer(line)[:max_content_len]
    tokens = [START_TOKEN, *content, END_TOKEN]
    tokens += [PAD_TOKEN] * (MAX_SEQ_LEN - len(tokens))
    rows.append([lang_tok2idx[tok] for tok in tokens])

  if not rows:
    # Keep the (rows, MAX_SEQ_LEN) integer layout even without any sentence.
    return torch.empty((0, MAX_SEQ_LEN), dtype=torch.long)

  return torch.tensor(rows, dtype=torch.long)

### Class definitions

In [None]:
class LangDataset(Dataset):
  """Dataset pairing source and target sequences by position."""

  def __init__(self, src, target):
    # Both containers are stored as given; nothing is copied or validated.
    self.src, self.target = src, target

  def __len__(self):
    """Return the number of source items (the target is not consulted)."""
    return len(self.src)

  def __getitem__(self, idx):
    """Return the ``(source, target)`` pair stored at position ``idx``."""
    pair = (self.src[idx], self.target[idx])
    return pair

class Transformer(nn.Module):
  """Sequence-to-sequence model: learned token and position embeddings
  feeding ``nn.Transformer``, followed by a linear projection onto the target
  vocabulary.

  ``forward`` takes index tensors laid out as (sequence length, batch size).
  """

  def __init__(self,
               emb_size,
               src_voc_size,
               target_voc_size,
               src_pad_idx,
               num_heads,
               num_encoder_layers,
               num_decoder_layers,
               forward_expansion,
               dropout,
               device,
               trg_pad_idx=None
    ):
    """Create the embeddings, the transformer core and the output layer.

    Args:
      emb_size: Embedding (model) dimension.
      src_voc_size: Number of entries in the source vocabulary.
      target_voc_size: Number of entries in the target vocabulary.
      src_pad_idx: Index of the padding token in the source vocabulary.
      num_heads: Number of attention heads.
      num_encoder_layers: Number of encoder layers.
      num_decoder_layers: Number of decoder layers.
      forward_expansion: Width of the feed-forward sub-layers.
      dropout: Dropout probability, used both inside ``nn.Transformer`` and
        on the summed embeddings.
      device: Device on which position indices and masks are created.
      trg_pad_idx: Index of the padding token in the target vocabulary.
        Defaults to ``src_pad_idx`` when omitted.
    """
    super().__init__()
    self.device = device
    self.src_word_embedding = nn.Embedding(src_voc_size, emb_size)
    self.src_position_embedding = nn.Embedding(MAX_SEQ_LEN, emb_size)
    self.target_word_embedding = nn.Embedding(target_voc_size, emb_size)
    self.target_position_embedding = nn.Embedding(MAX_SEQ_LEN, emb_size)
    self.transformer = nn.Transformer(
        d_model=emb_size,
        nhead=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=forward_expansion,
        dropout=dropout
    )

    self.fc_out = nn.Linear(emb_size, target_voc_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx
    # The target side has its own vocabulary, so its padding index is kept
    # separately; it falls back to the source index when not supplied.
    self.trg_pad_idx = src_pad_idx if trg_pad_idx is None else trg_pad_idx

  def make_src_mask(self, src):
    """Return a (batch, src_len) boolean mask that is True at source padding."""
    src_mask = src.transpose(0, 1) == self.src_pad_idx
    return src_mask.to(self.device)

  def make_trg_mask(self, trg):
    """Return a (batch, trg_len) boolean mask that is True at target padding."""
    trg_pad_mask = trg.transpose(0, 1) == self.trg_pad_idx
    return trg_pad_mask.to(self.device)

  def forward(self, src, trg):
    """Compute target-vocabulary logits for every target position.

    Args:
      src: Source token ids, shape (src_len, batch).
      trg: Target token ids fed to the decoder, shape (trg_len, batch).

    Returns:
      The output of ``fc_out`` applied to the transformer output; its last
      dimension has ``target_voc_size`` entries.
    """
    src_seq_len, N = src.shape
    trg_seq_len, N = trg.shape

    # Position indices 0..len-1, repeated for every batch column and created
    # directly on the model device.
    src_positions = (
        torch.arange(src_seq_len, device=self.device)
        .unsqueeze(1)
        .expand(src_seq_len, N)
    )
    trg_positions = (
        torch.arange(trg_seq_len, device=self.device)
        .unsqueeze(1)
        .expand(trg_seq_len, N)
    )

    emb_src = self.dropout(
        self.src_word_embedding(src) + self.src_position_embedding(src_positions)
    )
    emb_trg = self.dropout(
        self.target_word_embedding(trg) + self.target_position_embedding(trg_positions)
    )

    src_padding_mask = self.make_src_mask(src)
    # Causal mask: a target position may not attend to later positions.
    trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_len).to(self.device)
    trg_padding_mask = self.make_trg_mask(trg)

    out = self.transformer(
        emb_src,
        emb_trg,
        src_key_padding_mask=src_padding_mask,
        tgt_mask=trg_mask,
        tgt_key_padding_mask=trg_padding_mask
    )

    return self.fc_out(out)

### Loading dataset and vocabularies

In [None]:
# Read the raw corpora and keep only the sentence pairs accepted by preprocess().
raw_fr_text, raw_en_text = load_datasets()
processed_fr, processed_en = preprocess(raw_fr_text, raw_en_text)

# One vocabulary per language, built from the preprocessed text.
fr_tok2idx, fr_idx2tok = create_vocabulary(processed_fr, tokenize_fr)
en_tok2idx, en_idx2tok = create_vocabulary(processed_en, tokenize_en)

# Encode every sentence as a row of token ids and move the full dataset to `device`.
tensorified_fr = tensorify_dataset(processed_fr, fr_tok2idx, tokenize_fr).to(device)
tensorified_en = tensorify_dataset(processed_en, en_tok2idx, tokenize_en).to(device)

### Loading saved model

In [None]:
# map_location lets a checkpoint written on a GPU runtime be restored on a
# CPU-only one (and vice versa) instead of failing while deserializing.
saved_data = torch.load(TRAIN_STATS_PATH, map_location=device)
model_state_dict = saved_data["model_state_dict"]

num_epochs = 100
learning_rate = 3e-4
batch_size = 512

# The hyper-parameters must match those used when the checkpoint was written,
# otherwise load_state_dict() rejects the saved weights.
src_vocab_size = len(fr_tok2idx)
trg_vocab_size = len(en_tok2idx)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = MAX_SEQ_LEN
forward_expansion = 4 * embedding_size
src_pad_idx = fr_tok2idx[PAD_TOKEN]

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    device
).to(device)

model.load_state_dict(model_state_dict)
# Inference mode: disables dropout.
model.eval()

# NOTE(review): this draws a fresh random 90/10 split, while the training cell
# draws its own. `test_data` here is therefore not guaranteed to be disjoint
# from the sentences the checkpoint was trained on, so scores measured on it
# may be optimistic.
full_dataset = LangDataset(tensorified_fr, tensorified_en)
train_data, test_data = torch.utils.data.random_split(full_dataset, [0.9, 0.1])
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=True)

### Training

In [None]:
# Hyper-parameters.
num_epochs = 100
learning_rate = 3e-4
batch_size = 512

src_vocab_size = len(fr_tok2idx)
trg_vocab_size = len(en_tok2idx)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = MAX_SEQ_LEN
forward_expansion = 4 * embedding_size
src_pad_idx = fr_tok2idx[PAD_TOKEN]


# Random 90/10 train/test split of the encoded sentence pairs.
full_dataset = LangDataset(tensorified_fr, tensorified_en)
train_data, test_data = torch.utils.data.random_split(full_dataset, [0.9, 0.1])
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=True)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loss and the accuracy are computed on English targets, so padding must
# be identified with the English vocabulary's PAD index.
pad_idx = en_tok2idx[PAD_TOKEN]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

loss_history = []
acc_history = []

# Make sure dropout is active even if the model was put in eval mode earlier.
model.train()

for epoch in range(num_epochs):
  batch_loss = []
  batch_acc = []

  for batch_idx, (inp_data, target) in enumerate(train_iter):
    # Batches arrive as (batch, seq); the model expects (seq, batch).
    src = inp_data.to(device).T
    target = target.to(device).T
    # Teacher forcing: the decoder sees tokens 0..n-2 and predicts 1..n-1.
    target_input = target[:-1, :]
    target_output = target[1:, :]

    output = model(src, target_input)

    # Token-level accuracy over non-padding positions only.
    pred = output.argmax(2)
    mask = (target_output != pad_idx)
    correct = (pred == target_output) & mask
    accuracy = correct.sum() / mask.sum()
    # Store a plain float so no device tensor is kept alive per batch.
    batch_acc.append(accuracy.item())

    flat_output = output.reshape(-1, output.shape[2])
    flat_target = target_output.reshape(-1)

    optimizer.zero_grad()

    loss = criterion(flat_output, flat_target)
    loss.backward()
    batch_loss.append(loss.item())

    # Clip the gradient norm to keep updates bounded.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()

  avg_batch_acc = float(sum(batch_acc) / len(batch_acc))
  avg_batch_loss = sum(batch_loss) / len(batch_loss)
  acc_history.append(avg_batch_acc)
  loss_history.append(avg_batch_loss)

  # Checkpoint after every epoch so an interrupted run loses at most one epoch.
  torch.save({
      "loss_history": loss_history,
      "acc_history": acc_history,
      "epoch": epoch,
      "datapoints": MAX_LINE_COUNT,
      "model_state_dict": model.state_dict(),
      "optimizer_state_dict": optimizer.state_dict(),
      }, TRAIN_STATS_PATH)

  print(f"Epoch {epoch + 1} / {num_epochs} - loss: {avg_batch_loss:.4f} - accuracy: {avg_batch_acc * 100:.2f}%")




Epoch 1 / 100 - loss: 5.1525 - accuracy: 23.84%
Epoch 2 / 100 - loss: 3.9611 - accuracy: 35.30%
Epoch 3 / 100 - loss: 3.3871 - accuracy: 41.71%
Epoch 4 / 100 - loss: 2.9753 - accuracy: 46.60%
Epoch 5 / 100 - loss: 2.6873 - accuracy: 49.94%
Epoch 6 / 100 - loss: 2.4804 - accuracy: 52.35%
Epoch 7 / 100 - loss: 2.3216 - accuracy: 54.19%
Epoch 8 / 100 - loss: 2.1964 - accuracy: 55.65%
Epoch 9 / 100 - loss: 2.0909 - accuracy: 56.92%
Epoch 10 / 100 - loss: 2.0012 - accuracy: 58.08%
Epoch 11 / 100 - loss: 1.9243 - accuracy: 59.06%
Epoch 12 / 100 - loss: 1.8562 - accuracy: 59.99%
Epoch 13 / 100 - loss: 1.7966 - accuracy: 60.80%
Epoch 14 / 100 - loss: 1.7432 - accuracy: 61.52%
Epoch 15 / 100 - loss: 1.6950 - accuracy: 62.18%
Epoch 16 / 100 - loss: 1.6502 - accuracy: 62.81%
Epoch 17 / 100 - loss: 1.6099 - accuracy: 63.41%
Epoch 18 / 100 - loss: 1.5732 - accuracy: 63.91%
Epoch 19 / 100 - loss: 1.5378 - accuracy: 64.43%
Epoch 20 / 100 - loss: 1.5055 - accuracy: 64.90%
Epoch 21 / 100 - loss: 1.4748

In [None]:
def en_token_rejoin(tokens):
  """Join English tokens into a sentence.

  Tokens are separated by single spaces, except that a token starting with an
  apostrophe or a punctuation mark (``' . , ! ? : ;``) is attached directly to
  the token before it. Empty tokens are ignored and the result has no
  trailing space.

  Args:
    tokens: Iterable of token strings.

  Returns:
    The joined sentence; an empty string when there are no tokens.
  """
  attach_to_previous = ("'", ".", ",", "!", "?", ":", ";")

  pieces = []
  for token in tokens:
    if not token:
      # Nothing to emit, and an empty string has no first character to test.
      continue

    if pieces and token.startswith(attach_to_previous):
      pieces[-1] += token
    else:
      pieces.append(token)

  # Joining the pieces puts a space after punctuation as well as between
  # words, and avoids repeated string concatenation.
  return " ".join(pieces)

def translate_sentence(model, sentence, max_length=MAX_SENT_LEN):
  """Greedily translate one French sentence into English.

  Args:
    model: Model called as ``model(src, trg)`` with (seq, batch) index tensors.
    sentence: Either a French string or an already tokenized sequence of
      strings.
    max_length: Maximum number of tokens to generate.

  Returns:
    The translation as a single string. Source tokens missing from the French
    vocabulary are not fed to the model; they are copied verbatim into the
    output at their source position.
  """
  if isinstance(sentence, str):
    # Lowercase first, then tokenize: the same order used when the training
    # text and vocabulary were prepared.
    tokens = tokenize_fr(sentence.lower())
  else:
    tokens = [token.lower() for token in sentence]

  tokens = [START_TOKEN, *tokens, END_TOKEN]

  # Separate known from unknown tokens in one pass. Building new containers
  # (rather than removing from the list being iterated) ensures that
  # consecutive unknown tokens are all caught.
  known_tokens = []
  unknown_tokens = {}
  for idx, token in enumerate(tokens):
    if token in fr_tok2idx:
      known_tokens.append(token)
    else:
      unknown_tokens[idx] = token

  text_to_indices = [fr_tok2idx[token] for token in known_tokens]
  sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

  end_idx = en_tok2idx[END_TOKEN]
  outputs = [en_tok2idx[START_TOKEN]]

  # The target position embedding only covers MAX_SEQ_LEN positions.
  steps = min(max_length, MAX_SEQ_LEN)

  with torch.no_grad():
    for _ in range(steps):
      trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
      output = model(sentence_tensor, trg_tensor)

      # Greedy decoding: keep the most likely token for the last position.
      best_guess = output.argmax(2)[-1, :].item()
      if best_guess == end_idx:
        break

      outputs.append(best_guess)

  translated_sentence = [en_idx2tok[idx] for idx in outputs]
  # Re-insert unknown source tokens in ascending position order.
  for idx in sorted(unknown_tokens):
    translated_sentence.insert(idx, unknown_tokens[idx])

  # Drop the leading START token before joining.
  return en_token_rejoin(translated_sentence[1:])

def batch_translate_tensor(model, batch_data, max_length=MAX_SEQ_LEN):
  """Greedily translate a batch of encoded French sentences.

  Args:
    model: Model called as ``model(src, trg)`` with (seq, batch) index tensors.
    batch_data: 2-D tensor of French token ids, one sentence per row.
    max_length: Maximum number of decoding steps.

  Returns:
    A list with one space-joined English string per input row, cut at the
    first END token.
  """
  data_len, _ = batch_data.shape

  # Generated tokens belong to the English vocabulary, so END is looked up there.
  end_idx = en_tok2idx[END_TOKEN]

  src = batch_data.T.to(device)
  trg = torch.full(
      (1, data_len),
      en_tok2idx[START_TOKEN],
      dtype=torch.long,
      device=device
  )

  # Tracks which sentences have already produced END.
  finished = torch.zeros(data_len, dtype=torch.bool, device=device)

  with torch.no_grad():
    for _ in range(max_length):
      output = model(src, trg)

      next_tokens = output[-1, :, :].argmax(dim=1)

      # Sentences that are done keep emitting END so the batch stays rectangular.
      next_tokens = next_tokens.masked_fill(finished, end_idx)
      trg = torch.cat([trg, next_tokens.unsqueeze(0)], dim=0)

      finished |= (next_tokens == end_idx)
      if finished.all():
        break

  translations = []
  # One row of plain ints per sentence.
  for seq in trg.T.tolist():
    words = []
    for idx in seq[1:]:  # skip the START token
      if idx == end_idx:
        break
      words.append(en_idx2tok[idx])
    translations.append(" ".join(words))

  return translations

def bleu_score(model, test_iter, max_count=None):
  """Translate the batches of ``test_iter`` and print the corpus BLEU score.

  Args:
    model: Model passed on to ``batch_translate_tensor``.
    test_iter: Iterable yielding ``(french_ids, english_ids)`` tensor batches
      with one sentence per row.
    max_count: When set, evaluation stops after the first batch that brings
      the number of evaluated sentences to at least this value.

  Returns:
    None. The number of tested sentences and the score are printed.
  """
  special_tokens = {START_TOKEN, END_TOKEN, PAD_TOKEN}

  trgs = []
  pred_trgs = []
  cnt = 0

  for x, y in test_iter:
    src = x.to(device)

    pred_trgs.extend(batch_translate_tensor(model, src))

    # Convert the whole batch to Python ints at once rather than one .item()
    # call per token. Special tokens are filtered out below, so the reference
    # rows need no slicing.
    for row in y.tolist():
      words = (en_idx2tok[tok_idx] for tok_idx in row)
      reference = " ".join(word for word in words if word not in special_tokens)
      # BLEUScore expects a list of references for each prediction.
      trgs.append([reference])

    cnt += src.shape[0]
    if max_count and cnt >= max_count:
      break

  bleu = BLEUScore().to(device)
  score = bleu(pred_trgs, trgs)

  print(f"Total tested datapoints: {cnt}")
  print(f"BLEU score: {score * 100:.2f}%")

In [None]:
# Quick check: translate one French phrase and print it next to a reference.
input_sent = "je suis en train de"
actual_translation = "I am in the process of"
# Switch to evaluation mode before running inference.
model.eval()
prediction = translate_sentence(model, input_sent)
print(f"Input sentence: {input_sent}")
print(f"Translated sentence: {prediction}")
print(f"Actual translation: {actual_translation}")

Input sentence: je suis en train de
Translated sentence: i am in the process of it.
Actual translation: I am in the process of


In [None]:
# Evaluate on every batch of test_iter with the model in evaluation mode.
model.eval()
bleu_score(model, test_iter)

Total tested datapoints: 19868
BLEU score: 55.68%
