<a href="https://colab.research.google.com/github/wenjie-hoo/nn_uwr_22/blob/main/en2pl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EN2PL
### Sequence-to-Sequence Translation Model with Attention

In [None]:
import collections
import itertools
import math
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from functools import partial
from pathlib import Path
from google_drive_downloader import GoogleDriveDownloader as gdd
from nltk import wordpunct_tokenize
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from tqdm import tqdm_notebook, tqdm

In [None]:
!gdown https://opus.nlpl.eu/download.php?f=CCAligned/v1/moses/en-pl.txt.zip

Downloading...
From: https://opus.nlpl.eu/download.php?f=CCAligned/v1/moses/en-pl.txt.zip
To: /content/en-pl.txt.zip
100% 1.20G/1.20G [01:09<00:00, 17.2MB/s]


In [None]:
! unzip en-pl.txt.zip

Archive:  en-pl.txt.zip
  inflating: README                  
  inflating: LICENSE                 
  inflating: CCAligned.en-pl.en      
  inflating: CCAligned.en-pl.pl      
  inflating: CCAligned.en-pl.xml     


In [None]:
# ! gdown https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-en-pl-v1.0.tar.gz

Downloading...
From: https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-en-pl-v1.0.tar.gz
To: /content/opus-100-corpus-en-pl-v1.0.tar.gz
100% 39.3M/39.3M [00:02<00:00, 13.8MB/s]


In [None]:
# ! tar -xf /content/opus-100-corpus-en-pl-v1.0.tar.gz

In [None]:
def parallel_lines(file1_path, file2_path, output_file_path):
    """Merge two line-aligned text files into one tab-separated file.

    Line *i* of ``file1_path`` and line *i* of ``file2_path`` are stripped of
    surrounding whitespace, joined with a single tab and written as one line
    of ``output_file_path``.  Iteration stops at the end of the shorter file
    (``zip`` semantics), so surplus lines in the longer file are ignored.

    All files are opened as UTF-8 explicitly: the corpus contains Polish
    diacritics and must not depend on the platform's default encoding.

    Args:
        file1_path: Path of the first (left column) text file.
        file2_path: Path of the second (right column) text file.
        output_file_path: Path of the file to create/overwrite.
    """
    with open(file1_path, 'r', encoding='utf-8') as file1, \
            open(file2_path, 'r', encoding='utf-8') as file2, \
            open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line1, line2 in zip(file1, file2):
            output_file.write(f"{line1.strip()}\t{line2.strip()}\n")
# Alternative corpus (OPUS-100), currently disabled:
# file1_path = './opus-100-corpus/v1.0/supervised/en-pl/opus.en-pl-train.en'
# file2_path = './opus-100-corpus/v1.0/supervised/en-pl/opus.en-pl-train.pl'
# English and Polish sides of the CCAligned archive extracted above.
file1_path = 'CCAligned.en-pl.en'
file2_path = 'CCAligned.en-pl.pl'
output_file_path = 'en_pl.txt'

# Write one "<english><TAB><polish>" line per aligned sentence pair.
parallel_lines(file1_path, file2_path, output_file_path)


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by the cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
def tokenize(text):
    """Lower-case *text* and split it into purely alphabetic tokens.

    The text is split with ``wordpunct_tokenize``; tokens containing any
    non-alphabetic character (digits, punctuation, mixed tokens) are dropped.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty field) yield an empty list instead of being stringified into the
    bogus word ``"nan"`` / ``"none"``.

    Args:
        text: The sentence to tokenize; non-string values are passed
            through ``str`` first.

    Returns:
        list[str]: The alphabetic tokens, in order of appearance.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    tokens = wordpunct_tokenize(str(text).lower())
    return [token for token in tokens if all(char.isalpha() for char in token)]

class EnglishPolishTranslations(Dataset):
    """English/Polish sentence pairs encoded as lists of vocabulary indices.

    The constructor reads a two-column, tab-separated file (English, Polish),
    tokenizes both columns, replaces every token outside the
    ``max_vocab - 4`` most frequent ones (per language) with ``<UNK>``, builds
    one vocabulary per language, drops pairs that are empty or contain too
    many ``<UNK>`` tokens, wraps each sentence in ``<SOS>`` ... ``<EOS>`` and
    finally converts the tokens to indices.

    Attributes set by the constructor:
        token2idx_inputs, idx2token_inputs: English vocabulary maps.
        token2idx_targets, idx2token_targets: Polish vocabulary maps.
        indices_pairs: list of ``(english_indices, polish_indices)`` tuples.

    In both vocabularies the special tokens come first, in sorted order:
    ``<EOS>`` = 0, ``<PAD>`` = 1, ``<SOS>`` = 2, ``<UNK>`` = 3.
    """

    def __init__(self, path, max_vocab):
        """Load and encode the corpus stored at *path*.

        Args:
            path: Tab-separated file with one English/Polish pair per line.
            max_vocab: Vocabulary size per language, including the four
                special tokens.
        """
        self.max_vocab = max_vocab
        self.padding_token = '<PAD>'
        self.start_token = '<SOS>'
        self.end_token = '<EOS>'
        self.unknown_word_token = '<UNK>'

        df = pd.read_csv(path, names=['english', 'polish'], sep='\t')
        self.tokenize_df(df)
        self.rare_tokens(df)
        # The vocabulary is built before filtering, so it may contain tokens
        # that only occur in pairs dropped by remove_unk().
        self.create_token2idx(df)

        df = self.remove_unk(df)
        self.add_start_and_end_to_tokens(df)
        self.tokens_to_indices(df)

    @staticmethod
    def flatten(nested):
        """Concatenate an iterable of iterables into one flat list.

        Defined as a static method rather than a lambda stored on the
        instance so that the dataset stays picklable.
        """
        return [item for inner in nested for item in inner]

    def __getitem__(self, idx):
        """Return ``(english_indices, polish_indices)`` for pair *idx*."""
        # Position 0 holds the English side, position 1 the Polish side.
        pair = self.indices_pairs[idx]
        return pair[0], pair[1]

    def __len__(self):
        """Return the number of sentence pairs kept after filtering."""
        return len(self.indices_pairs)

    def tokenize_df(self, df):
        """Add ``tokens_inputs`` / ``tokens_targets`` columns to *df* in place."""
        df['tokens_inputs'] = df.english.apply(tokenize)
        df['tokens_targets'] = df.polish.apply(tokenize)

    def rare_tokens(self, df):
        """Replace tokens outside each language's common set with ``<UNK>``.

        Both token columns of *df* are rewritten in place.
        """
        for column in ('tokens_inputs', 'tokens_targets'):
            common = self.get_most_common_tokens(df[column].tolist())
            df[column] = df[column].apply(
                lambda tokens, common=common: [
                    token if token in common else self.unknown_word_token
                    for token in tokens
                ]
            )

    def get_most_common_tokens(self, tokens_series):
        """Return the set of the ``max_vocab - 4`` most frequent tokens.

        Four vocabulary slots are reserved for the special tokens.
        """
        token_counts = Counter(self.flatten(tokens_series))
        return {token for token, _ in token_counts.most_common(self.max_vocab - 4)}

    def remove_unk(self, df, threshold=0.99):
        """Drop pairs that are empty or contain too many unknown tokens.

        A sentence is kept only if it is non-empty and the fraction of its
        tokens that are not ``<UNK>`` is strictly greater than *threshold*.
        A pair is kept only if both of its sentences are kept.

        Returns:
            A new, independent DataFrame (a copy, so that later column
            assignments do not write into a slice of the original frame).
        """
        unknown = self.unknown_word_token

        def is_known_enough(tokens):
            if not tokens:
                return False
            known = sum(1 for token in tokens if token != unknown)
            return known / len(tokens) > threshold

        df = df[df['tokens_inputs'].apply(is_known_enough)]
        df = df[df['tokens_targets'].apply(is_known_enough)]
        return df.copy()

    def create_token2idx(self, df):
        """Build the token<->index maps for both languages from *df*.

        The four special tokens always occupy the lowest indices (in sorted
        order), followed by the remaining tokens in sorted order.
        """
        special_tokens = [
            self.padding_token,
            self.start_token,
            self.end_token,
            self.unknown_word_token,
        ]

        def build_vocab(token_lists):
            regular = set(self.flatten(token_lists)) - set(special_tokens)
            ordered = sorted(special_tokens) + sorted(regular)
            token2idx = {token: idx for idx, token in enumerate(ordered)}
            idx2token = {idx: token for token, idx in token2idx.items()}
            return token2idx, idx2token

        self.token2idx_inputs, self.idx2token_inputs = build_vocab(df.tokens_inputs)
        self.token2idx_targets, self.idx2token_targets = build_vocab(df.tokens_targets)

    def add_start_and_end_to_tokens(self, df):
        """Wrap every token list of *df* in ``<SOS>`` ... ``<EOS>`` in place."""
        for column in ('tokens_inputs', 'tokens_targets'):
            df[column] = df[column].apply(
                lambda tokens: [self.start_token] + tokens + [self.end_token]
            )

    def tokens_to_indices(self, df):
        """Convert tokens to indices and store the pairs in ``indices_pairs``."""
        df['indices_inputs'] = df.tokens_inputs.apply(
            lambda tokens: [self.token2idx_inputs[token] for token in tokens])
        df['indices_targets'] = df.tokens_targets.apply(
            lambda tokens: [self.token2idx_targets[token] for token in tokens])
        self.indices_pairs = list(zip(df.indices_inputs, df.indices_targets))

In [None]:
def clean(file_path, output_file_path):
    """Copy a text file while removing punctuation and useless lines.

    Each line is stripped of surrounding whitespace, then every character
    that is neither a word character nor whitespace is deleted (tabs are
    whitespace and therefore survive).  Lines that end up empty or consist
    only of digits are skipped.

    Both files are opened as UTF-8 explicitly so the result does not depend
    on the platform's default encoding.

    Args:
        file_path: Path of the file to read.
        output_file_path: Path of the file to create/overwrite.
    """
    special_chars_pattern = re.compile(r'[^\w\s]')
    with open(file_path, 'r', encoding='utf-8') as input_file, \
            open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            line = special_chars_pattern.sub('', line.strip())
            if line and not line.isdigit():
                output_file.write(line + '\n')

# Clean the merged tab-separated corpus and write the result to en2pl.txt.
file_path = 'en_pl.txt'
output_file_path = 'en2pl.txt'
clean(file_path, output_file_path)


In [None]:
# DATA_PATH = './en2pl.txt'

In [None]:
# Not enough RAM to work with 5,000,000-line shards, so the file is split into 500,000-line shards instead.
# ! split -l 5000000 '--additional-suffix=.txt' en2pl.txt en2pl_
! split -l 500000 '--additional-suffix=.txt' en2pl.txt en2pl_

In [None]:
# First shard written by the `split` command above (its suffixes start at "aa").
DATA_PATH = './en2pl_aa.txt'

In [None]:
# Build the dataset; max_vocab counts the four special tokens, so each language
# keeps at most 10000 - 4 distinct word tokens.
dataset = EnglishPolishTranslations(DATA_PATH, max_vocab=10000)

272000

In [None]:
# Hold out 0.1% of the pairs for evaluation and train on the remaining 99.9%.
train_size = int(0.999 * len(dataset))
test_size = len(dataset) - train_size
# NOTE(review): no generator is passed, so the split changes between runs
# unless the global torch seed is fixed elsewhere.
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [None]:
# Number of held-out sentence pairs.
print(test_size)


272


### Collation function and dataloaders

In [None]:
def collate(batch):
    """Turn a list of ``(input_indices, target_indices)`` pairs into tensors.

    Inputs and targets are right-padded to the longest sequence of the batch,
    each with the ``<PAD>`` index of its *own* vocabulary, and the batch is
    reordered by decreasing input length (the order expected by
    ``pack_padded_sequence``).

    Relies on the module-level ``dataset`` and ``device``.

    Args:
        batch: List of pairs as returned by the dataset's ``__getitem__``.

    Returns:
        tuple: ``(padded_inputs, padded_targets, lengths)``, all moved to
        ``device``; ``lengths`` holds the unpadded input lengths in the same
        (sorted) order as the rows.
    """
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]

    # Look up each side's padding index in its own vocabulary instead of
    # reusing the target vocabulary for the source sequences.
    input_pad = dataset.token2idx_inputs[dataset.padding_token]
    target_pad = dataset.token2idx_targets[dataset.padding_token]
    padded_inputs = pad_sequence(inputs, padding_value=input_pad, batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=target_pad, batch_first=True)

    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)


In [None]:
batch_size = 512
# NOTE(review): shuffle is left at its default (off), so the training batches
# are visited in the same order every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate)
# One pair per batch: the evaluation code below decodes a single sentence at a time.
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate)

## Seq2Seq with Attention

[seq2seq.ipynb](https://github.com/ZeweiChu/PyTorch-Course/blob/master/notebooks/7.seq2seq.ipynb)

[Visualizing a Neural Machine Translation Model (Mechanics of Seq2seq Models with Attention) – Jay Alammar](https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/)

https://zhuanlan.zhihu.com/p/135970560

https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_1.pdf

### Define the Encoder

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a single-layer GRU over the source tokens."""

    def __init__(self, vocab_size, embedding_dim, hidden_size, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)

    def forward(self, inputs, lengths):
        """Encode a padded batch.

        Args:
            inputs: Batch-first tensor of token indices.
            lengths: Unpadded length of every row, passed straight to
                ``pack_padded_sequence`` (which, with its default
                ``enforce_sorted=True``, wants them in decreasing order).

        Returns:
            tuple: ``(output, hidden)`` where ``output`` is the re-padded GRU
            output and ``hidden`` is the final GRU state.
        """
        packed_embeddings = pack_padded_sequence(
            self.embedding(inputs), lengths, batch_first=True
        )
        packed_output, hidden = self.gru(packed_embeddings)
        # batch_first is not passed here, so the padded output comes back
        # time-major: (seq_len, batch, hidden_size).
        padded_output = pad_packed_sequence(packed_output)[0]
        return padded_output, hidden

    def init_hidden(self):
        """Return a random state of shape ``(1, batch_size, hidden_size)`` on ``device``."""
        noise = torch.randn(1, self.batch_size, self.hidden_size)
        return noise.to(device)


### Define the Decoder

In [None]:
class Decoder(nn.Module):
    """One-step GRU decoder with additive (Bahdanau-style) attention.

    Every call to ``forward`` consumes one target token per batch row,
    attends over the encoder outputs using the current hidden state and
    returns the vocabulary logits together with the new hidden state.

    NOTE: the hidden state handed to ``forward`` is used both as the
    attention query (through ``W2``) and as the initial GRU state, so the
    encoder and decoder hidden sizes are expected to be equal.
    """

    def __init__(self, vocab_size, embedding_dim, decoder_hidden_size, encoder_hidden_size, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_hidden_size = encoder_hidden_size
        self.decoder_hidden_size = decoder_hidden_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        # The GRU input is the attention context concatenated with the token embedding.
        self.gru = nn.GRU(self.embedding_dim + self.encoder_hidden_size, self.decoder_hidden_size, batch_first=True)
        self.fc = nn.Linear(self.decoder_hidden_size, self.vocab_size)
        # Attention weights: score = V(tanh(W1(encoder_output) + W2(hidden))).
        self.W1 = nn.Linear(self.encoder_hidden_size, self.decoder_hidden_size)
        self.W2 = nn.Linear(self.encoder_hidden_size, self.decoder_hidden_size)
        self.V = nn.Linear(self.decoder_hidden_size, 1)

    def forward(self, targets, hidden, encoder_output):
        """Run one decoding step.

        Args:
            targets: ``(batch, 1)`` tensor with the current input token indices.
            hidden: ``(1, batch, hidden)`` state from the encoder (first step)
                or from the previous decoding step.
            encoder_output: Time-major encoder outputs,
                ``(src_len, batch, encoder_hidden_size)``, as returned by
                ``Encoder.forward``.

        Returns:
            tuple: ``(logits, state, attention_weights)`` with shapes
            ``(batch, vocab_size)``, ``(1, batch, decoder_hidden_size)`` and
            ``(batch, src_len, 1)``.
        """
        encoder_output = encoder_output.permute(1, 0, 2)  # -> (batch, src_len, hidden)
        hidden_with_time_axis = hidden.permute(1, 0, 2)   # -> (batch, 1, hidden)

        # Additive attention; the hidden state broadcasts over src_len.
        score = torch.tanh(self.W1(encoder_output) + self.W2(hidden_with_time_axis))
        attention_weights = torch.softmax(self.V(score), dim=1)

        # Weighted sum of the encoder outputs: (batch, 1, hidden).
        context_vector = torch.bmm(attention_weights.permute(0, 2, 1), encoder_output)
        x = self.embedding(targets)
        x = torch.cat((context_vector, x), dim=-1)

        # Continue from the hidden state we were given.  The previous code
        # started every step from fresh random noise sized by the constructor
        # batch size, which discarded the recurrent state, made the output
        # non-deterministic and failed for any other batch size.
        output, state = self.gru(x, hidden)
        output = output.view(-1, output.size(2))
        x = self.fc(output)

        return x, state, attention_weights

    def init_hidden(self):
        """Return a random ``(1, batch_size, decoder_hidden_size)`` state.

        No longer used by ``forward``; kept for backward compatibility.  The
        tensor is created on the same device as the module's parameters.
        """
        return torch.randn(1, self.batch_size, self.decoder_hidden_size).to(self.fc.weight.device)

### Define a model that has both an Encoder and Decoder

In [None]:
# CrossEntropyLoss with its default reduction ('mean'): calling it returns a
# single scalar averaged over all elements, not one loss value per element.
criterion = nn.CrossEntropyLoss()

def loss_function(real, pred, pad_idx=None):
    """Cross-entropy over one decoding step with padding positions zeroed out.

    The loss is computed per element (``reduction='none'``), multiplied by a
    0/1 mask that is 0 wherever the target is the padding token, and then
    averaged over the whole batch (masked rows contribute 0 to the mean).

    Previously the already-averaged scalar loss was multiplied by the mask,
    so padding was not excluded at all, and the mask treated index 0 as
    padding although the dataset assigns index 0 to ``<EOS>``.

    Args:
        real: ``(batch,)`` tensor of target token indices.
        pred: ``(batch, vocab_size)`` tensor of unnormalised logits.
        pad_idx: Index of the padding token in the target vocabulary.  When
            omitted it is looked up in the module-level ``dataset``.

    Returns:
        A 0-dim tensor holding the masked mean loss.
    """
    if pad_idx is None:
        pad_idx = dataset.token2idx_targets[dataset.padding_token]
    mask = real.ne(pad_idx).float()
    per_token_loss = F.cross_entropy(pred, real, reduction='none')
    return torch.mean(per_token_loss * mask)


class EncoderDecoder(nn.Module):
    """Sequence-to-sequence model that pairs an ``Encoder`` with a ``Decoder``.

    ``forward`` returns the teacher-forced training loss; ``predict`` samples
    a translation for a single source sentence.  Both rely on the
    module-level ``device``.
    """

    def __init__(self, inputs_vocab_size, targets_vocab_size, hidden_size,
                 embedding_dim, batch_size, targets_start_idx, targets_stop_idx):
        """Build the encoder and decoder.

        Args:
            inputs_vocab_size: Size of the source vocabulary.
            targets_vocab_size: Size of the target vocabulary.
            hidden_size: Hidden size shared by the encoder and decoder GRUs.
            embedding_dim: Embedding size used on both sides.
            batch_size: Nominal batch size; it is replaced by the actual
                batch size on every ``forward`` / ``predict`` call.
            targets_start_idx: Index of ``<SOS>`` in the target vocabulary.
            targets_stop_idx: Index of ``<EOS>`` in the target vocabulary.
        """
        super(EncoderDecoder, self).__init__()
        self.batch_size = batch_size
        self.targets_start_idx = targets_start_idx
        self.targets_stop_idx = targets_stop_idx

        self.encoder = Encoder(inputs_vocab_size, embedding_dim,
                               hidden_size, batch_size).to(device)

        self.decoder = Decoder(targets_vocab_size, embedding_dim,
                               hidden_size, hidden_size, batch_size).to(device)

    def _begin_decoding(self, inputs, lengths):
        """Encode *inputs* and return ``(encoder_output, hidden, first_input)``.

        Also records the actual batch size on this module and on both
        sub-modules, so that anything sized by their ``batch_size`` attribute
        matches the current batch (the last training batch and the
        evaluation batches are smaller than the nominal batch size).
        """
        self.batch_size = inputs.size(0)
        self.encoder.batch_size = self.batch_size
        self.decoder.batch_size = self.batch_size

        encoder_output, encoder_hidden = self.encoder(inputs.to(device), lengths)
        # Every sequence starts from <SOS>.
        first_input = torch.full((self.batch_size, 1), self.targets_start_idx,
                                 dtype=torch.long, device=device)
        return encoder_output.to(device), encoder_hidden.to(device), first_input

    def predict(self, inputs, lengths, max_length=20):
        """Sample a translation for a single source sentence.

        The next token is drawn from the softmax distribution with
        ``torch.multinomial``, so repeated calls can give different results.
        Only a batch of exactly one sentence is supported because each
        sampled token is read back with ``.item()``.

        Args:
            inputs: ``(1, src_len)`` tensor of source token indices.
            lengths: Unpadded source lengths (a CPU tensor).
            max_length: Maximum number of tokens to generate.

        Returns:
            list[int]: Generated target indices; the last one is ``<EOS>``
            unless the length limit was reached first.
        """
        encoder_output, decoder_hidden, decoder_input = self._begin_decoding(inputs, lengths)
        output = []
        for _ in range(max_length):
            predictions, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_output)
            decoder_input = torch.multinomial(F.softmax(predictions, dim=1), 1)

            prediction = decoder_input.item()
            output.append(prediction)
            if prediction == self.targets_stop_idx:
                break

        return output

    def forward(self, inputs, targets, lengths):
        """Return the teacher-forced loss for one batch.

        At step ``t`` the decoder is fed the gold token ``t - 1`` (``<SOS>``
        at the first step) and its logits are scored against gold token ``t``.

        Args:
            inputs: ``(batch, src_len)`` source indices, sorted by length.
            targets: ``(batch, tgt_len)`` target indices starting with ``<SOS>``.
            lengths: Unpadded source lengths (a CPU tensor).

        Returns:
            The summed per-step loss divided by ``targets.size(1)``.
        """
        encoder_output, decoder_hidden, decoder_input = self._begin_decoding(inputs, lengths)
        targets = targets.to(device)

        loss = 0
        for timestep in range(1, targets.size(1)):
            predictions, decoder_hidden, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_output)
            step_targets = targets[:, timestep]
            loss += loss_function(step_targets, predictions)
            # Teacher forcing: the gold token becomes the next decoder input.
            decoder_input = step_targets.unsqueeze(1)

        return loss / targets.size(1)

In [None]:
# Smaller configuration: 256 hidden units, 100-dimensional embeddings.
model = EncoderDecoder(
    inputs_vocab_size=len(dataset.token2idx_inputs),
    targets_vocab_size=len(dataset.token2idx_targets),
    hidden_size=256,
    embedding_dim=100,
    batch_size=batch_size,
    targets_start_idx=dataset.token2idx_targets[dataset.start_token],
    targets_stop_idx=dataset.token2idx_targets[dataset.end_token],
).to(device)

# Module-level optimizer; train() picks it up through the global name `optimizer`.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [None]:
# Larger configuration: 512 hidden units, 200-dimensional embeddings.
model_h = EncoderDecoder(
    inputs_vocab_size=len(dataset.token2idx_inputs),
    targets_vocab_size=len(dataset.token2idx_targets),
    hidden_size=512,
    embedding_dim=200,
    batch_size=batch_size,
    targets_start_idx=dataset.token2idx_targets[dataset.start_token],
    targets_stop_idx=dataset.token2idx_targets[dataset.end_token],
).to(device)

# NOTE: this rebinds the module-level `optimizer` that train() reads, so after
# running this cell the global optimizer only updates model_h's parameters.
optimizer = optim.Adam([p for p in model_h.parameters() if p.requires_grad], lr=0.01)

In [None]:
def train(model, epochs=10, model_optimizer=None):
    """Train *model* on the module-level ``train_loader``.

    Args:
        model: Module whose ``forward(inputs, targets, lengths)`` returns the
            batch loss.
        epochs: Number of passes over ``train_loader``.
        model_optimizer: Optimizer to step.  When omitted, the module-level
            ``optimizer`` is used; make sure that one was built from
            *model*'s parameters.

    Returns:
        list[float]: The loss of every processed batch, in order.
    """
    # `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`.
    from tqdm.notebook import tqdm as notebook_tqdm

    active_optimizer = optimizer if model_optimizer is None else model_optimizer

    model.train()
    losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        progress_bar = notebook_tqdm(train_loader, desc='Training', leave=False)
        for inputs, targets, lengths in progress_bar:
            active_optimizer.zero_grad()
            # pack_padded_sequence needs the lengths on the CPU.
            loss = model(inputs, targets, lengths.cpu())
            loss.backward()
            active_optimizer.step()

            # Keep plain floats: storing the tensor itself would keep every
            # batch's loss tensor alive for the whole run.
            batch_loss = loss.item()
            losses.append(batch_loss)
            total_loss += batch_loss
            num_batches += 1

        # The model already normalises each batch loss by the sequence
        # length, so the epoch figure is the plain mean over batches.
        train_loss = total_loss / max(num_batches, 1)

        tqdm.write(f'epoch #{epoch + 1:3d}\ttrain_loss: {train_loss:.2e}\n')

    return losses

In [None]:
# train(model)

In [None]:
# Train the larger model (hidden_size=512, embedding_dim=200).
train(model_h)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(train_loader, desc='Training', leave=False)


Training:   0%|          | 0/531 [00:00<?, ?it/s]

In [None]:
# torch.save(model.state_dict(), 'en2pl.model')

In [None]:
# model.load_state_dict(torch.load('en2pl.model'))

In [None]:
# Switch model_h to evaluation mode.
model_h.eval()
# NOTE(review): these accumulators are not read by any cell visible below.
total_loss = total = 0

NameError: ignored

### BLEU evaluation
https://aclanthology.org/P02-1040.pdf \
https://machinelearningmastery.com/calculate-bleu-score-for-text-python/


In [None]:
def bleu_score(candidate, references, n):
    """Single-order BLEU: clipped n-gram precision times the brevity penalty.

    Args:
        candidate: Sequence of candidate tokens.
        references: List of reference token sequences.
        n: The n-gram order to score (only this one order is used).

    Returns:
        float: A score in ``[0, 1]``.  ``0.0`` is returned when the candidate
        has no n-grams of order *n* (for instance when it is empty) or when
        there are no references.
    """
    candidate_ngrams = Counter(zip(*[candidate[i:] for i in range(n)]))
    # Precision is measured over the candidate's n-grams, not its tokens
    # (the two counts only coincide for n == 1).
    total_ngrams = sum(candidate_ngrams.values())
    if not total_ngrams or not references:
        return 0.0

    reference_ngrams = [Counter(zip(*[ref[i:] for i in range(n)])) for ref in references]

    # Clip every candidate count by the largest count seen in any reference.
    clipped_total = sum(
        min(count, max(ref_counts[ngram] for ref_counts in reference_ngrams))
        for ngram, count in candidate_ngrams.items()
    )

    candidate_length = len(candidate)
    # Effective reference length: the one closest to the candidate length.
    reference_length = min(
        (len(ref) for ref in references),
        key=lambda length: abs(length - candidate_length),
    )

    precision = clipped_total / total_ngrams
    brevity_penalty = min(1, math.exp(1 - reference_length / candidate_length))

    return brevity_penalty * precision


In [None]:
# Decode every held-out pair, print source / reference / prediction together
# with a unigram BLEU score, and collect the sentences for the corpus score.
candidate_corpus = []
reference_corpus = []
# NOTE(review): the cells above train `model_h` and put it into eval mode,
# while this loop decodes with `model` -- confirm which one is intended.
with torch.no_grad():
    for inputs, targets, lengths in test_loader:
        # test_loader yields one pair per batch, so row 0 is the whole batch;
        # [1:-1] strips the surrounding <SOS> / <EOS>.
        input_tokens = [
            dataset.idx2token_inputs[idx]
            for idx in inputs.cpu()[0].numpy()[1:-1]
        ]
        print('>', ' '.join(input_tokens))
        target_tokens = [
            dataset.idx2token_targets[idx]
            for idx in targets.cpu()[0].numpy()[1:-1]
        ]

        target_sentences = [list(target_tokens)]
        print('=', ' '.join(target_tokens))
        reference_corpus.append([' '.join(target_tokens)])
        # Forward pass
        outputs = model.predict(inputs, lengths.cpu())
        # Drop the trailing <EOS> only when it is really there: if decoding
        # stopped at the length limit, the last item is a regular token.
        if outputs and outputs[-1] == model.targets_stop_idx:
            outputs = outputs[:-1]
        output_tokens = [dataset.idx2token_targets[idx] for idx in outputs]
        predicted_sentences = list(output_tokens)
        candidate_corpus.append(' '.join(output_tokens))
        print(' '.join(output_tokens))
        # An empty prediction scores 0 without calling the metric.
        score = bleu_score(predicted_sentences, target_sentences, n=1) if predicted_sentences else 0.0
        print(score)
        print()



> assessment
= ocena środków pomocy
ocena
0.1353352832366127

> thank you randy
= dziękuję randy
dziękuję randy
1.0

> thats something i could never adapt to
= widzisz do tego nie mógłbym się nigdy dostosować
to nie mogę jej nie mogę
0.11942188509563154

> its not me you should be arresting mr holmes
= chodź szybko to nie mnie powinien pan aresztować panie holmes
to nie ma już tracę możliwości
0.17113903967753066

> john taylor
= john taylor
john taylor
1.0

> he got out
= uciekł
wyszedł
0.0

> stay with me
= zostań ze mną
zostań ze mną
1.0

> if you were to drop the charges
= gdybyś mógł wycofać zarzuty
my także przestaniesz powiązanych
0.0

> how could you do this to me
= jak mogłaś mi to zrobić
ok jak to się ze mną
0.3333333333333333

> i already gave you people my dna
= już dałem wam moje dna
już dałam ci ludzie może zająć stare dna
0.25

> michelle was a beautiful woman
= michelle była piękną kobietą
michelle było piękny kobieta
0.25

> the hall is free
= akurat sala jest wolna
br

In [None]:
def calculate_ngram_counts(tokens, n):
    """Return a Counter of the contiguous n-grams (as tuples) in *tokens*."""
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return collections.Counter(ngrams)


def calculate_bleu_score(candidate, references, n=4):
    """Sentence-level BLEU (0-100) of *candidate* against *references*.

    The score is the geometric mean of the clipped n-gram precisions for the
    orders ``1..n`` multiplied by the brevity penalty
    ``exp(1 - r / c)`` (applied only when the candidate length ``c`` is
    shorter than the effective reference length ``r``, i.e. the reference
    length closest to ``c``).

    Args:
        candidate: Candidate sentence; tokens are obtained with ``split()``.
        references: List of reference sentences (strings).
        n: Highest n-gram order to include.

    Returns:
        The score scaled to ``[0, 100]``.  ``0`` is returned when the
        candidate is empty, when there are no references, or when any of the
        n-gram precisions is zero.
    """
    candidate_tokens = candidate.split()
    reference_tokens = [reference.split() for reference in references]
    if not candidate_tokens or not reference_tokens:
        return 0

    precisions = []
    for order in range(1, n + 1):
        candidate_counts = calculate_ngram_counts(candidate_tokens, order)
        total = sum(candidate_counts.values())
        if total == 0:
            # The candidate is shorter than this n-gram order.
            return 0
        reference_counts = [
            calculate_ngram_counts(tokens, order) for tokens in reference_tokens
        ]
        # Clip each candidate count by the largest count in any reference.
        matches = sum(
            min(count, max(counts[ngram] for counts in reference_counts))
            for ngram, count in candidate_counts.items()
        )
        if matches == 0:
            # A zero precision makes the geometric mean zero.
            return 0
        precisions.append(matches / total)

    candidate_length = len(candidate_tokens)
    # Closest reference length; ties are resolved in favour of the shorter one.
    reference_length = min(
        (len(tokens) for tokens in reference_tokens),
        key=lambda length: (abs(length - candidate_length), length),
    )
    if candidate_length >= reference_length:
        brevity_penalty = 1.0
    else:
        brevity_penalty = math.exp(1 - reference_length / candidate_length)

    bleu = math.exp(sum(math.log(precision) for precision in precisions) / n) * brevity_penalty * 100
    return bleu


def calculate_corpus_bleu_score(candidate_corpus, reference_corpus, n=1):
    """Average the sentence-level BLEU scores over a corpus.

    This is the arithmetic mean of ``calculate_bleu_score`` over all
    sentences, not a BLEU computed from pooled n-gram statistics.

    Args:
        candidate_corpus: List of candidate sentences (strings).
        reference_corpus: List with, for every candidate, its list of
            reference sentences.
        n: Highest n-gram order passed on to ``calculate_bleu_score``.

    Returns:
        float: The mean sentence score, or ``0.0`` for an empty corpus.

    Raises:
        ValueError: If the two corpora differ in length.
    """
    if len(candidate_corpus) != len(reference_corpus):
        raise ValueError("Corpus sizes do not match.")

    num_sentences = len(candidate_corpus)
    if num_sentences == 0:
        return 0.0

    total_score = sum(
        (calculate_bleu_score(candidate, references, n)
         for candidate, references in zip(candidate_corpus, reference_corpus)),
        0.0,
    )
    return total_score / num_sentences


# Called with the default n=1, so only unigram matches contribute to the score.
corpus_bleu_score = calculate_corpus_bleu_score(candidate_corpus, reference_corpus)
print("Corpus BLEU score:", corpus_bleu_score)


Corpus BLEU score: 27.719681829080493


# Transformer - self-attention
https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_2.pdf

In [None]:
!pip install transformers -q

In [None]:
import transformers

In [None]:
input_vocab = dataset.token2idx_inputs
target_vocab = dataset.token2idx_targets
# Special-token indices of both vocabularies as 0-dim tensors on `device`.
# They are read from the two maps defined just above; the names SRC_VOCAB and
# TRG_VOCAB used here before are not defined anywhere in this notebook.
src_pad = torch.tensor(input_vocab[dataset.padding_token]).to(device)
src_sos = torch.tensor(input_vocab[dataset.start_token]).to(device)
src_eos = torch.tensor(input_vocab[dataset.end_token]).to(device)
trg_pad = torch.tensor(target_vocab[dataset.padding_token]).to(device)
trg_sos = torch.tensor(target_vocab[dataset.start_token]).to(device)
trg_eos = torch.tensor(target_vocab[dataset.end_token]).to(device)

In [None]:
# Self-assignments: each name is rebound to its current value, so these lines
# have no effect when the loaders exist and raise NameError when they do not.
train_loader = train_loader
test_loader = test_loader

NameError: ignored