In [136]:
import math
import numpy as np
import pandas as pd
import pymysql
import pymysql.cursors as cursors
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchmetrics.functional.classification import f1_score

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [70]:
# Connection settings are read from the environment so the password is never
# stored in the notebook. Host, port and user fall back to the values this
# notebook has always used; the password has no fallback and is prompted for
# when TEAM4_DB_PASSWORD is unset.
import os
from getpass import getpass

conn = pymysql.connect(
    host=os.environ.get("TEAM4_DB_HOST", "1.251.203.204"),
    port=int(os.environ.get("TEAM4_DB_PORT", "33065")),
    user=os.environ.get("TEAM4_DB_USER", "root"),
    password=os.environ.get("TEAM4_DB_PASSWORD") or getpass("MySQL password: "),
    db="Team4",
    charset="utf8",
)
# DictCursor returns each row as a dict keyed by column name.
cur = conn.cursor(cursors.DictCursor)


In [71]:
# Pair every English row with the French row that has the same id, exposing
# the two texts as columns "en" and "fr".
# NOTE(review): there is no ORDER BY, so the row order is whatever the server
# returns -- confirm this is acceptable for reproducibility.
sql = """
select en.text as en, fr.text as fr
from language_en as en
inner join language_fr as fr
on en.id = fr.id
"""


In [72]:
# Run the query and load every row into a DataFrame. The cursor and the
# connection are released in `finally`, so a failing query or fetch does not
# leave the database connection open.
try:
    cur.execute(sql)
    langDF = pd.DataFrame(cur.fetchall())
finally:
    cur.close()
    conn.close()


In [73]:
# Display the loaded sentence pairs (columns "en" and "fr").
langDF


Unnamed: 0,en,fr
0,In the beginning God created the heaven and th...,"Au commencement, Dieu créa les cieux et la terre."
1,"And the earth was without form, and void; and ...",La terre était informe et vide: il y avait des...
2,"And God said, Let there be light: and there wa...",Dieu dit: Que la lumière soit! Et la lumière fut.
3,"And God saw the light, that it was good: and G...",Dieu vit que la lumière était bonne; et Dieu s...
4,"And God called the light Day, and the darkness...","Dieu appela la lumière jour, et il appela les ..."
...,...,...
31097,"And the Spirit and the bride say, Come. And le...",Et l`Esprit et l`épouse disent: Viens. Et que ...
31098,For I testify unto every man that heareth the ...,Je le déclare à quiconque entend les paroles d...
31099,And if any man shall take away from the words ...,et si quelqu`un retranche quelque chose des pa...
31100,"He which testifieth these things saith, Surely...","Celui qui atteste ces choses dit: Oui, je vien..."


In [90]:
def generate_tokens(text_iter, language):
    """Lazily tokenize one language column of every row in ``text_iter``.

    Args:
        text_iter: iterable of rows; element 0 of a row is read for
            SRC_LANGUAGE and element 1 for TGT_LANGUAGE.
        language: SRC_LANGUAGE or TGT_LANGUAGE; selects both the column and
            the tokenizer taken from ``token_transform``.

    Yields:
        Whatever ``token_transform[language]`` returns for each row's text.
    """
    column_of = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for row in text_iter:
        tokenize = token_transform[language]
        sentence = row[column_of[language]]
        yield tokenize(sentence)


# Language codes; they key the tokenizer dict and select the DataFrame column.
SRC_LANGUAGE, TGT_LANGUAGE = "en", "fr"

# Reserved symbols; each *_IDX equals the position of its symbol in the list.
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)


# One spaCy-backed tokenizer per language.
token_transform = {}
token_transform[SRC_LANGUAGE] = get_tokenizer("spacy", language="en_core_web_sm")
token_transform[TGT_LANGUAGE] = get_tokenizer("spacy", language="fr_core_news_sm")

# Rows of (en, fr) text as a NumPy array.
train_iter = langDF.values

# Number of tokens per sentence, per language, summarised to help pick a
# maximum sequence length.
len_en = pd.Series(list(map(len, generate_tokens(train_iter, SRC_LANGUAGE))))
len_fr = pd.Series(list(map(len, generate_tokens(train_iter, TGT_LANGUAGE))))
len_en.describe(), len_fr.describe()


(count    31102.000000
 mean        29.475789
 std         12.630380
 min          3.000000
 25%         20.000000
 50%         27.000000
 75%         37.000000
 max        103.000000
 dtype: float64,
 count    31102.000000
 mean        27.696515
 std         11.662017
 min          3.000000
 25%         19.000000
 50%         26.000000
 75%         35.000000
 max         96.000000
 dtype: float64)

In [92]:
def generate_tokens(text_iter, language):
    """Yield the tokenized text of one language for each row of ``text_iter``.

    Rows are indexed positionally: index 0 holds the SRC_LANGUAGE text and
    index 1 the TGT_LANGUAGE text. Tokenization is delegated to
    ``token_transform[language]`` and happens lazily, one row per ``next()``.

    NOTE(review): this re-defines the function of the same name from an
    earlier cell.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    yield from (
        token_transform[language](text[language_index[language]])
        for text in text_iter
    )


# Language codes and reserved vocabulary symbols. The *_IDX constants mirror
# the order of `special_symbols`, which are inserted first in each vocabulary.
SRC_LANGUAGE, TGT_LANGUAGE = "en", "fr"
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3


# One spaCy-backed tokenizer per language.
token_transform = dict(
    [
        (SRC_LANGUAGE, get_tokenizer("spacy", language="en_core_web_sm")),
        (TGT_LANGUAGE, get_tokenizer("spacy", language="fr_core_news_sm")),
    ]
)

# Build one vocabulary per language from the full corpus; tokens missing from
# a vocabulary resolve to <unk>.
train_iter = langDF.values
vocab_transform = {}
for language in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab = build_vocab_from_iterator(
        generate_tokens(train_iter, language),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    vocab.set_default_index(UNK_IDX)
    vocab_transform[language] = vocab

print(vocab_transform)


{'en': Vocab(), 'fr': Vocab()}


In [93]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to embeddings, then apply dropout.

    The table is precomputed once for ``max_len`` positions and stored as the
    buffer ``pe`` of shape ``(max_len, 1, d_model)``; even feature columns
    hold sines and odd feature columns hold cosines.

    Args:
        d_model: size of the feature dimension (even or odd).
        max_len: number of positions to precompute.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, d_model, max_len, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        # (max_len, ceil(d_model / 2)) matrix of position * frequency.
        angles = position * div_term

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd slots.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``; dimension 0 of ``x`` is the sequence.

        Raises:
            ValueError: if ``x`` has more positions than were precomputed.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)


class TokenEmbedder(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens):
        """Cast ``tokens`` to int64, look them up, and scale the result."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale


class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Source and target ids are embedded by separate ``TokenEmbedder`` tables,
    passed through one shared ``PositionalEncoding``, run through
    ``nn.Transformer``, and projected to ``tgt_vocab_size`` logits by a
    linear layer.

    Args:
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        emb_size: embedding width, also used as the transformer ``d_model``.
        max_len: number of positions the positional encoding precomputes.
        nhead: number of attention heads.
        src_vocab_size: size of the source vocabulary.
        tgt_vocab_size: size of the target vocabulary (and of the output logits).
        dim_feedforward: hidden width of the feed-forward sublayers.
        dropout: dropout probability for the encoding and the transformer.
    """

    def __init__(
        self,
        num_encoder_layers,
        num_decoder_layers,
        emb_size,
        max_len,
        nhead,
        src_vocab_size,
        tgt_vocab_size,
        dim_feedforward,
        dropout=0.1,
    ):
        super().__init__()
        # Submodules are created in a fixed order: embeddings, positional
        # encoding, transformer, output projection.
        self.src_tok_emb = TokenEmbedder(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedder(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, max_len, dropout)
        transformer_config = dict(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer = nn.Transformer(**transformer_config)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def _embed_source(self, src):
        """Embed source ids and add positional information."""
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_target(self, tgt):
        """Embed target ids and add positional information."""
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(
        self,
        src,
        trg,
        src_mask,
        tgt_mask,
        src_padding_mask,
        tgt_padding_mask,
        memory_key_padding_mask,
    ):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        ``src`` and ``trg`` are token-id tensors; the mask arguments are
        forwarded unchanged to ``nn.Transformer`` (``memory_mask`` is always
        ``None``).
        """
        src_emb = self._embed_source(src)
        tgt_emb = self._embed_target(trg)
        decoded = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(decoded)

    def encode(self, src, src_mask):
        """Run only the encoder stack on embedded ``src`` with ``src_mask``."""
        encoder = self.transformer.encoder
        return encoder(self._embed_source(src), mask=src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Run only the decoder stack on embedded ``tgt`` against ``memory``."""
        decoder = self.transformer.decoder
        return decoder(self._embed_target(tgt), memory, tgt_mask=tgt_mask)


In [114]:
from torchinfo import summary

BATCH_SIZE = 128
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Model hyperparameters gathered in one place.
# NOTE(review): emb_size=8 with nhead=8 leaves a single dimension per
# attention head -- confirm this tiny width is intended.
model_kwargs = dict(
    num_encoder_layers=6,
    num_decoder_layers=6,
    emb_size=8,
    max_len=128,
    nhead=8,
    src_vocab_size=len(vocab_transform[SRC_LANGUAGE]),
    tgt_vocab_size=len(vocab_transform[TGT_LANGUAGE]),
    dim_feedforward=512,
)
model = Seq2SeqTransformer(**model_kwargs).to(DEVICE)

# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(DEVICE)
optimizer = optim.Adam(model.parameters())

summary(model)


Layer (type:depth-idx)                                                 Param #
Seq2SeqTransformer                                                     --
├─TokenEmbedder: 1-1                                                   --
│    └─Embedding: 2-1                                                  108,664
├─TokenEmbedder: 1-2                                                   --
│    └─Embedding: 2-2                                                  210,880
├─PositionalEncoding: 1-3                                              --
│    └─Dropout: 2-3                                                    --
├─Transformer: 1-4                                                     --
│    └─TransformerEncoder: 2-4                                         --
│    │    └─ModuleList: 3-1                                            54,192
│    │    └─LayerNorm: 3-2                                             16
│    └─TransformerDecoder: 2-5                                         --
│    │    └─ModuleL

In [132]:
def sequential_transforms(*transforms):
    """Compose ``transforms`` into one callable applied left to right.

    Returns:
        A function that feeds its argument through each transform in order
        and returns the final value. With no transforms it returns its
        argument unchanged.
    """

    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return func


def input_transform(token_ids):
    """Wrap a sequence of token ids with BOS/EOS and return a 1-D int64 tensor.

    The dtype is pinned to ``torch.long`` because ``torch.tensor([])`` is
    float32 by default; without the pin, an empty ``token_ids`` would promote
    the whole concatenation to float.

    Args:
        token_ids: sequence of integer vocabulary indices (may be empty).

    Returns:
        Tensor ``[BOS_IDX, *token_ids, EOS_IDX]`` of dtype ``torch.long``.
    """
    return torch.cat(
        (
            torch.tensor([BOS_IDX], dtype=torch.long),
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor([EOS_IDX], dtype=torch.long),
        ),
    )


def collator(batch):
    """Turn a list of (source, target) sentence pairs into two padded tensors.

    Trailing newlines are stripped from each sentence, each side is encoded
    with its ``text_transform`` pipeline, and each side is padded with
    ``PAD_IDX`` by ``pad_sequence`` (default layout: sequence dimension first).

    Returns:
        ``(src_batch, tgt_batch)`` padded tensors.
    """
    encode_src = text_transform[SRC_LANGUAGE]
    encode_tgt = text_transform[TGT_LANGUAGE]
    encoded_pairs = [
        (encode_src(src_sample.rstrip("\n")), encode_tgt(tgt_sample.rstrip("\n")))
        for src_sample, tgt_sample in batch
    ]
    src_sequences = [pair[0] for pair in encoded_pairs]
    tgt_sequences = [pair[1] for pair in encoded_pairs]
    return (
        pad_sequence(src_sequences, padding_value=PAD_IDX),
        pad_sequence(tgt_sequences, padding_value=PAD_IDX),
    )


# Per-language pipeline: raw sentence -> tokens -> vocabulary ids -> tensor
# wrapped in <bos>/<eos>.
text_transform = {
    language: sequential_transforms(
        token_transform[language], vocab_transform[language], input_transform
    )
    for language in [SRC_LANGUAGE, TGT_LANGUAGE]
}

# Pull a single batch to check the shapes the collator produces.
data_iter = langDF.values
dataloader = DataLoader(data_iter, batch_size=BATCH_SIZE, collate_fn=collator)
first_batch = next(iter(dataloader))
source_tensor, target_tensor = first_batch

print("(source, target): ")
print(next(iter(data_iter)))

print("source_batch:", source_tensor.shape)
print(source_tensor)

print("target_batch:", target_tensor.shape)
print(target_tensor)


(source, target): 
['In the beginning God created the heaven and the earth.'
 'Au commencement, Dieu créa les cieux et la terre.']
source_batch: torch.Size([63, 128])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [ 292,   10,   10,  ...,   10,   10,   10],
        [   5,    5,   37,  ...,   31, 3261, 3261],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
target_batch: torch.Size([65, 128])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [ 311,  108,   32,  ...,  287, 3698, 3698],
        [1035,   95,   37,  ...,   10,    4,    4],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])


In [133]:
def generate_square_subsequnt_mask(s):
    """Build an ``(s, s)`` additive causal mask on ``DEVICE``.

    Entries on and below the diagonal are ``0.0`` and entries above the
    diagonal are ``-inf``, so position ``i`` can only attend to positions
    ``<= i``.
    """
    upper = torch.triu(torch.ones((s, s), device=DEVICE)) == 1
    # Transposing the upper triangle yields the lower triangle (diagonal kept).
    visible = upper.transpose(0, 1)
    mask = visible.float()
    mask = mask.masked_fill(~visible, float("-inf"))
    mask = mask.masked_fill(visible, float(0.0))
    return mask


def create_mask(src, tgt):
    """Create the attention and padding masks for one (src, tgt) batch.

    Both inputs are indexed with dimension 0 as the sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-zero float matrix (nothing masked),
        ``tgt_mask`` is the causal mask, and the padding masks are boolean
        tensors, transposed from the input layout, that are True where the
        token equals ``PAD_IDX``.
    """
    src_seq_len, tgt_seq_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros(
        (src_seq_len, src_seq_len), dtype=torch.float, device=DEVICE
    )
    tgt_mask = generate_square_subsequnt_mask(tgt_seq_len)

    src_is_pad = src == PAD_IDX
    tgt_is_pad = tgt == PAD_IDX
    src_padding_mask = torch.transpose(src_is_pad, 0, 1).to(DEVICE)
    tgt_padding_mask = torch.transpose(tgt_is_pad, 0, 1).to(DEVICE)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


# Teacher forcing: the decoder input drops the last position and the expected
# output drops the first, so position i is trained to predict token i + 1.
target_input, target_out = target_tensor[:-1], target_tensor[1:]

masks = create_mask(source_tensor, target_input)
source_mask, target_mask, src_padding_mask, tgt_padding_mask = masks

print("source_mask:", source_mask.shape)
print(source_mask)
print("target_mask:", target_mask.shape)
print(target_mask)
print("src_padding_mask:", src_padding_mask.shape)
print(src_padding_mask)
print("tgt_padding_mask:", tgt_padding_mask.shape)
print(tgt_padding_mask)


source_mask: torch.Size([63, 63])
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])
target_mask: torch.Size([64, 64])
tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])
src_padding_mask: torch.Size([128, 63])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
 

In [137]:
from tqdm import tqdm


def run(epochs, epoch, model, optimizer, criterion, is_train, use_pbar=True):
    """Run one pass over the whole corpus and return the mean batch loss.

    Args:
        epochs: total number of epochs (only used in the progress-bar text).
        epoch: current epoch number (only used in the progress-bar text).
        model: the sequence-to-sequence model to train or evaluate.
        optimizer: optimizer stepped after every batch when ``is_train``.
        criterion: loss applied to flattened logits and shifted targets.
        is_train: truthy to train (backward + step), falsy to evaluate.
        use_pbar: wrap the loader in a tqdm progress bar.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    if is_train:
        model.train()
    else:
        model.eval()

    data_iter = langDF.values
    dataloader = DataLoader(data_iter, batch_size=BATCH_SIZE, collate_fn=collator)
    # len(dataloader) is the batch count; materialising list(dataloader) would
    # tokenize and collate the entire dataset again just to count batches.
    num_batches = len(dataloader)
    pbar = tqdm(dataloader, total=num_batches) if use_pbar else dataloader

    losses = 0
    # Autograd bookkeeping is only needed when parameters are being updated.
    with torch.set_grad_enabled(bool(is_train)):
        for source_batch, target_batch in pbar:
            source_batch = source_batch.to(DEVICE)
            target_batch = target_batch.to(DEVICE)

            # Teacher forcing: feed tokens [0, n-1), predict tokens [1, n).
            target_input = target_batch[:-1, :]
            target_output = target_batch[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(
                source_batch, target_input
            )

            logits = model(
                src=source_batch,
                trg=target_input,
                src_mask=src_mask,
                tgt_mask=tgt_mask,
                src_padding_mask=src_padding_mask,
                tgt_padding_mask=tgt_padding_mask,
                memory_key_padding_mask=src_padding_mask,
            )

            # Flatten to (positions, vocab) vs (positions,) for the criterion.
            loss = criterion(
                logits.reshape(-1, logits.shape[-1]), target_output.reshape(-1)
            )
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if use_pbar:
                    pbar.set_description(
                        f"Epoch: {epoch}/{epochs} | Loss: {loss.item():.6f}"
                    )
            losses += loss.item()
    return losses / num_batches


In [138]:
EPOCHS = 5
# Start from +inf so the first epoch always produces a checkpoint, whatever
# the scale of the initial loss.
pre_loss = float("inf")
for epoch in range(1, EPOCHS + 1):
    loss = run(EPOCHS, epoch, model, optimizer, criterion, is_train=True)
    print(f"Epoch: {epoch}, Loss: {loss:.6f}")
    # Checkpoint whenever the mean training loss improves on the best so far.
    if pre_loss > loss:
        # NOTE(review): saving the whole module pickles the class by
        # reference; the state_dict file is the more portable artifact.
        torch.save(model, "bible_transformer.pth")
        torch.save(model.state_dict(), "bible_transformer_state.pth")
        print(f"Model Saved! --- Epoch: {epoch}, Loss: {loss:.6f}")
        pre_loss = loss


Epoch: 1/5 | Loss: 6.843336: 100%|██████████| 243/243 [17:21<00:00,  4.29s/it]


Epoch: 1, Loss: 8.567610


Epoch: 2/5 | Loss: 6.298216: 100%|██████████| 243/243 [17:38<00:00,  4.35s/it]


Epoch: 2, Loss: 6.569177


Epoch: 3/5 | Loss: 6.190910: 100%|██████████| 243/243 [16:51<00:00,  4.16s/it]


Epoch: 3, Loss: 6.357454


Epoch: 4/5 | Loss: 6.069524: 100%|██████████| 243/243 [16:59<00:00,  4.20s/it]


Epoch: 4, Loss: 6.241412


Epoch: 5/5 | Loss: 6.090706:  28%|██▊       | 68/243 [05:29<15:28,  5.31s/it]