In [55]:
from collections.abc import Callable
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch

In [56]:
import pandas as pd

eng_ger = pd.read_csv("/kaggle/input/eng2ger/english_german.csv")

eng_ger.head(20)[10:]

Unnamed: 0,English,German
10,hello,hallo
11,i try,ich probiere es
12,i won,ich hab gewonnen
13,i won,ich habe gewonnen
14,smile,lacheln
15,cheers,zum wohl
16,freeze,keine bewegung
17,freeze,stehenbleiben
18,got it,verstanden
19,got it,einverstanden


In [57]:
# eng_ger = eng_ger.head(100)

In [58]:
eng_ger["English"] = "<sos> " + eng_ger["English"].str.strip() + " <eos>"
eng_ger["German"] = "<sos> " + eng_ger["German"].str.strip() + " <eos>"

In [59]:
eng_ger.head(20)[10:]

Unnamed: 0,English,German
10,<sos> hello <eos>,<sos> hallo <eos>
11,<sos> i try <eos>,<sos> ich probiere es <eos>
12,<sos> i won <eos>,<sos> ich hab gewonnen <eos>
13,<sos> i won <eos>,<sos> ich habe gewonnen <eos>
14,<sos> smile <eos>,<sos> lacheln <eos>
15,<sos> cheers <eos>,<sos> zum wohl <eos>
16,<sos> freeze <eos>,<sos> keine bewegung <eos>
17,<sos> freeze <eos>,<sos> stehenbleiben <eos>
18,<sos> got it <eos>,<sos> verstanden <eos>
19,<sos> got it <eos>,<sos> einverstanden <eos>


In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

en_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
ger_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

In [61]:
en_tokenizer.fit_on_texts(eng_ger["English"])
ger_tokenizer.fit_on_texts(eng_ger["German"])

In [62]:
english = en_tokenizer.texts_to_sequences(eng_ger["English"])
german = ger_tokenizer.texts_to_sequences(eng_ger["German"])

print(english[:10])
print(german[:10])

[[1, 3260, 2], [1, 3260, 2], [1, 462, 2], [1, 4151, 2], [1, 4151, 2], [1, 444, 2], [1, 79, 2], [1, 79, 2], [1, 219, 2], [1, 272, 2]]
[[1, 2207, 2], [1, 3781, 1525, 2], [1, 5262, 2], [1, 9957, 2], [1, 16860, 2], [1, 645, 2], [1, 211, 2], [1, 10, 16861, 2], [1, 12301, 2], [1, 781, 2]]


In [63]:
english = pad_sequences(english, padding='post')
german = pad_sequences(german, padding='post')

print(english[:10])
print(german[:10])

[[   1 3260    2    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   1 3260    2    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   1  462    2    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   1 4151    2    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 

In [64]:
print(len(english))

152820


In [65]:
test_size = 0.2
train_len = int((1-test_size)*len(english))
Xtrain, Xtest, ytrain, ytest = english[:train_len], english[train_len:], german[:train_len], german[train_len:]

print(len(Xtrain))
print(len(Xtest))

122256
30564


In [66]:
class dataset(Dataset):
    def __init__(self, english, german):
        self.en = english
        self.ger = german
    def __len__(self):
        return len(self.en)
    def __getitem__(self, idx):
        x = self.en[idx]
        y = self.ger[idx]
        return torch.tensor(x, dtype=torch.long),torch.tensor(y, dtype=torch.long)

train_loader = DataLoader(dataset=dataset(Xtrain, ytrain), batch_size=256, num_workers=4, shuffle=True)
test_loader = DataLoader(dataset=dataset(Xtest, ytest), batch_size=256, num_workers=4, shuffle=True)

In [67]:
for i, dat in enumerate(train_loader):
    print(dat)
    if i == 1:
        break

[tensor([[  1,  15,   5,  ...,   0,   0,   0],
        [  1,   7, 142,  ...,   0,   0,   0],
        [  1, 217, 188,  ...,   0,   0,   0],
        ...,
        [  1,  26, 218,  ...,   0,   0,   0],
        [  1,  72, 374,  ...,   0,   0,   0],
        [  1, 334, 324,  ...,   0,   0,   0]]), tensor([[   1,  596,    8,  ...,    0,    0,    0],
        [   1,    4,   16,  ...,    0,    0,    0],
        [   1,  109,    5,  ...,    0,    0,    0],
        ...,
        [   1,   77,  191,  ...,    0,    0,    0],
        [   1,    8,   70,  ...,    0,    0,    0],
        [   1, 1116,  267,  ...,    0,    0,    0]])]
[tensor([[    1,    49,   365,  ...,     0,     0,     0],
        [    1,     6,   672,  ...,     0,     0,     0],
        [    1,    15,    74,  ...,     0,     0,     0],
        ...,
        [    1,    26, 11845,  ...,     0,     0,     0],
        [    1,    22,   170,  ...,     0,     0,     0],
        [    1,   141,    35,  ...,     0,     0,     0]]), tensor([[   1,   

In [68]:
en_vocab_size = len(en_tokenizer.word_index)
ger_vocab_size = len(ger_tokenizer.word_index)

print(en_vocab_size)
print(ger_vocab_size)

14850
30945


In [69]:
import random
class Encoder(nn.Module):
    def __init__(self, vocab_size:int, embed_size:int, hidden_size:int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.LSTM = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True, num_layers=2, bidirectional=True)
    def forward(self, x):
        # (batch, seq)
        embedding = self.embedding(x)
        # (batch, seq, embed_size)
        outputs, (hn, cn) = self.LSTM(embedding)
        # outputs: (batch, seq, hidden_size),  hn/cn : (num_layers, batch, hidden_size)
        return hn, cn
class Decoder(nn.Module):
    def __init__(self, vocab_size:int, embed_size:int, hidden_size:int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.LSTM = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True, num_layers=4)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hn, cn): # x is last english word
        # x: (batch,)
        x = x.unsqueeze(1)
        # x: (batch, 1) -- making it a sequence of length 1 so as to be consistent when passing to LSTM
        embedding = self.embedding(x)
        # embedding: (batch, 1, embed_size)
        outputs, (hn, cn) = self.LSTM(embedding, (hn, cn))
        # hn/cn : (batch, hidden_size)
        # outputs: (batch, 1, hidden_size) -- since only recieves one token at a time
        predictions = self.linear(outputs)
        # predictions: (batch, 1, vocab_size)
        return predictions.squeeze(1), hn, cn # predictions: (batch, vocab_size)

class seq2seq(nn.Module):
    def __init__(self, en_vocab_size:int, ger_vocab_size:int, embed_size:int, hidden_size:int):
        super().__init__()
        self.en_vocab_size = en_vocab_size
        self.ger_vocab_size = ger_vocab_size
        # plus 1 for empty/blank token
        self.encoder = Encoder(en_vocab_size, embed_size, hidden_size)
        self.decoder = Decoder(ger_vocab_size, embed_size, hidden_size)
    
    def forward(self, x, y, epoch):
        hn, cn = self.encoder(x)
        # y: (batch_size, seq_size)
        batch_size = y.shape[0] # int
        target_len = y.shape[1] # int -> output sequence length
        token = y[:,0] # (batch,)
        outputs = torch.zeros(batch_size, target_len, self.ger_vocab_size, device=next(self.parameters()).device) # (batch, seq, vocab_size)
        outputs[:, 0, 1] = 1
        for i in range(1, target_len):
            predictions, hn, cn = self.decoder(token, hn, cn) # (batch, vocab_size)
            token = y[:, i] if random.random() < 0.5/(epoch+1) else predictions.argmax(dim=-1)
            outputs[:,i] = predictions # outputs[:,i] has the same shape as outputs[:, i, :] which is (batch, vocab_size)
        return outputs # (batch, seq, vocab_size)


In [70]:
from tqdm import tqdm

In [71]:
output = "/kaggle/working/weights.pt"

In [72]:
def fit(train_loader, test_loader, model:Callable, optimizer:Callable, loss_fn:Callable, epochs:int, device:torch.device):
    model.train()
    epoch_seq = list(range(1, epochs+1))
    for epoch in range(epochs):
        en_sentence = "who are you"
        loop = tqdm(enumerate(train_loader), total=len(train_loader))
        for batch, (x,y) in loop:
            x,y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = model(x, y, epoch) 
            # y_pred: (batch, seq, vocab_size) | y: (batch, seq)
            y_pred = y_pred.reshape(-1, y_pred.shape[2]) # (batch * seq, vocab_size)
            y = y.reshape(-1) # (batch * seq,)
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
            loop.set_description(f"EPOCH: {epoch+1}/{epochs}")
            correct = (y==y_pred.argmax(dim=-1)).sum().item()
            total = len(y)
            loop.set_postfix(loss=loss.item(), acc=correct/total)
        torch.save(model.state_dict(), output)

In [73]:
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if os.path.exists(output):
    model = seq2seq(en_vocab_size=en_vocab_size, ger_vocab_size=ger_vocab_size, embed_size=256, hidden_size=128)
    model.load_state_dict(torch.load(output))
else:
    model = seq2seq(en_vocab_size=en_vocab_size, ger_vocab_size=ger_vocab_size, embed_size=256, hidden_size=128)
optimizer = torch.optim.Adam(params=model.parameters())
loss_fn = nn.CrossEntropyLoss()
model.to(device)

  model.load_state_dict(torch.load(output))


seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(14850, 256, padding_idx=0)
    (LSTM): LSTM(256, 128, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(30945, 256, padding_idx=0)
    (LSTM): LSTM(256, 128, num_layers=4, batch_first=True)
    (linear): Linear(in_features=128, out_features=30945, bias=True)
  )
)

In [74]:
fit(train_loader=train_loader, test_loader=test_loader, model=model, optimizer=optimizer,loss_fn=loss_fn, epochs=10, device=device)

EPOCH: 1/10: 100%|██████████| 478/478 [05:58<00:00,  1.33it/s, acc=0.938, loss=0.471]
EPOCH: 2/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.942, loss=0.454]
EPOCH: 3/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.94, loss=0.441] 
EPOCH: 4/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.944, loss=0.434]
EPOCH: 5/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.94, loss=0.444] 
EPOCH: 6/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.942, loss=0.424]
EPOCH: 7/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.939, loss=0.432]
EPOCH: 8/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.947, loss=0.388]
EPOCH: 9/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.948, loss=0.376]
EPOCH: 10/10: 100%|██████████| 478/478 [05:59<00:00,  1.33it/s, acc=0.95, loss=0.379] 


In [75]:
model.to("cpu")

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(14850, 256, padding_idx=0)
    (LSTM): LSTM(256, 128, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(30945, 256, padding_idx=0)
    (LSTM): LSTM(256, 128, num_layers=4, batch_first=True)
    (linear): Linear(in_features=128, out_features=30945, bias=True)
  )
)

In [76]:
for x,y in train_loader:
    print(y[0], y[0].shape)
    break
# <sos> german is token 1

tensor([  1,  22,  16,   7, 330, 506, 128,   2,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]) torch.Size([55])


In [77]:
def predict(model, x, target_len):
    # x = x.to(device)
    hn, cn = model.encoder(x)
    # y: (batch_size, seq_size)
    batch_size = x.shape[0] # int
    token = torch.ones(batch_size, device=next(model.parameters()).device, dtype=torch.long) # (batch,) # <sos> token is 1
    outputs = torch.zeros(batch_size, target_len, model.ger_vocab_size, device=next(model.parameters()).device)
    outputs[:,0,1] = 1
    for i in range(1, target_len):
        predictions, hn, cn = model.decoder(token, hn, cn) # (batch, vocab_size)
        outputs[:,i] = predictions # outputs[:,i] has the same shape as outputs[:, i, :] which is (batch, vocab_size)
    return outputs # (batch, seq, vocab_size)

def translate(model, text:str):
    print(text)
    tokenized = en_tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(tokenized, padding='post', maxlen=len(english[0]))
    x = torch.tensor(padded_sequence)
    predicted_sequence = predict(model, x, 7)
    output_sequence = predicted_sequence[0].argmax(dim=-1).detach().cpu().numpy()
    return ger_tokenizer.sequences_to_texts([output_sequence])[0]

idx = 9
print(translate(model, eng_ger["English"][idx]))
print()
print("original:")
print(eng_ger["German"][idx])

<sos> wait <eos>
<sos> warte <eos> <eos>

original:
<sos> warte <eos>


In [78]:
translate(model, "<sos> got <eos>")

<sos> got <eos>


'<sos> gehort <eos>'

In [79]:
translate(model, "<sos> you <eos>")

<sos> you <eos>


'<sos> du sie <eos>'

In [80]:
translate(model, "<sos> how are you <eos>")

<sos> how are you <eos>


'<sos> wie gehts dir <eos>'

In [81]:
translate(model, "<sos> what is your name? <eos>")

<sos> what is your name? <eos>


'<sos> wie heit dein <eos>'

In [84]:
translate(model, "<sos> who is the king <eos>")

<sos> who is the king <eos>


'<sos> wer ist der konig <eos> <eos>'

In [85]:
translate(model, "<sos> next <eos>")

<sos> next <eos>


'<sos> nachste mal <eos> <eos>'

In [86]:
translate(model, "<sos> what is the time <eos>")

<sos> what is the time <eos>


'<sos> wie viel ist zeit zeit zeit'

In [87]:
translate(model, "<sos> when should i come <eos>")

<sos> when should i come <eos>


'<sos> wann kommen kommen kommen kommen'

In [88]:
translate(model, "<sos> she is his wife <eos>")

<sos> she is his wife <eos>


'<sos> sie ist ist frau <eos>'

In [89]:
translate(model, "<sos> I am the one <eos>")

<sos> I am the one <eos>


'<sos> ich bin ist <eos>'