# Thực hành ở nhà Transformers

Hoàn thiện hàm huấn luyện cho mạng Transformer và tiến hành huấn luyện mô hình

### Cài đặt giải thuật tối ưu và huấn luyện mô hình

In [6]:
#import
!pip install torchtext==0.12.0
import time
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/

In [2]:
# Hàm train
def accuracy(predictions, targets):
    """Return the fraction of rows whose arg-max class equals the target.

    Args:
        predictions: Tensor of per-class scores; the class axis is ``dim=1``.
        targets: Tensor of class indices, compared element-wise with the
            arg-max of ``predictions``.

    Returns:
        A 0-dim float tensor in ``[0, 1]`` (so callers can use ``.item()`` or
        format it directly); ``0.0`` when ``targets`` is empty.
    """
    if targets.numel() == 0:
        # The mean of an empty tensor is NaN; report zero accuracy instead.
        return torch.zeros((), device=predictions.device)
    predicted = predictions.argmax(dim=1)
    # Cast the boolean matches to float so that the mean is the hit ratio.
    return (predicted == targets).float().mean()

def train_model(model, opt):
    """Train ``model`` for ``opt.epochs`` epochs over the batches in ``opt.train``.

    Args:
        model: Module called as ``model(src, trg_input)``. Its output is
            flattened to ``(-1, output.shape[-1])`` before the loss is taken.
        opt: Options object providing ``epochs``, ``train`` (a sized iterable
            of batches exposing ``.src`` and ``.trg`` tensors), ``device``,
            ``optimizer`` and ``printevery``.

    Returns:
        None. Progress is printed every ``opt.printevery`` steps and once at
        the end of each epoch.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    for epoch in range(opt.epochs):
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        num_batches = 0
        epoch_start_time = time.time()

        for i, data in enumerate(opt.train):
            src = data.src.to(opt.device)
            trg = data.trg.to(opt.device)
            # Shift the target by one position along dim 0: the model is fed
            # everything but the last step and is scored against everything
            # but the first step.
            trg_input = trg[:-1, :]
            trg_output = trg[1:, :].contiguous().view(-1)

            opt.optimizer.zero_grad()
            output = model(src, trg_input)
            output = output.reshape(-1, output.shape[-1])
            loss = criterion(output, trg_output)
            # Without these two calls the parameters are never updated.
            loss.backward()
            opt.optimizer.step()

            loss_value = loss.item()
            # float() accepts both a 0-dim tensor and a plain Python number.
            acc = float(accuracy(output, trg_output))
            epoch_loss += loss_value
            epoch_accuracy += acc
            num_batches += 1

            if (i + 1) % opt.printevery == 0:
                print(f"Epoch: {epoch+1}, Step: {i+1}/{len(opt.train)}, Loss: {loss_value:.4f}, Accuracy: {acc:.4f}.")

        if num_batches:
            elapsed = time.time() - epoch_start_time
            print(
                f"Epoch: {epoch+1} finished in {elapsed:.1f}s, "
                f"Mean loss: {epoch_loss / num_batches:.4f}, "
                f"Mean accuracy: {epoch_accuracy / num_batches:.4f}."
            )

In [3]:
#Hàm khởi tạo Opt
class Opt():
    """Plain container of hyper-parameters and runtime settings for a run."""

    def __init__(self):
        # Every setting starts from the default listed here; callers override
        # individual attributes on the instance afterwards.
        defaults = {
            # Paths of the parallel corpus files.
            "src_data": "data/english.txt",
            "trg_data": "data/french.txt",
            # Names of the spaCy pipelines to load for each language.
            "src_lang": "en_core_web_sm",
            "trg_lang": 'fr_core_news_sm',
            # Model and optimisation hyper-parameters.
            "epochs": 2,
            "d_model": 512,
            "n_layers": 6,
            "heads": 8,
            "dropout": 0.1,
            "batchsize": 1500,
            "printevery": 100,
            "lr": 0.0001,
            "max_strlen": 80,
            # Runtime switches and objects filled in later.
            "checkpoint": 0,
            "no_cuda": False,
            "load_weights": None,
            "optimizer": None,
            "device": 0,
        }
        for name, value in defaults.items():
            setattr(self, name, value)

In [4]:
# Hàm read data, create fields, dataset, get_model

def read_data(opt):
    """Load the spaCy pipelines and the raw corpus lines onto ``opt``.

    Sets ``opt.src`` / ``opt.trg`` to the pipelines named by ``opt.src_lang``
    / ``opt.trg_lang``, then sets ``opt.src_sentences`` / ``opt.trg_sentences``
    to the lines of ``opt.src_data`` / ``opt.trg_data`` (line endings kept).
    """
    def _read_lines(path):
        # Iterating a text file yields the same lines as ``readlines()``.
        with open(path, 'r', encoding='utf-8') as handle:
            return list(handle)

    opt.src = spacy.load(opt.src_lang)
    opt.trg = spacy.load(opt.trg_lang)

    opt.src_sentences = _read_lines(opt.src_data)  # source sentences
    opt.trg_sentences = _read_lines(opt.trg_data)  # target sentences


def create_fields(opt):
    """Build the source and target field descriptors used for tokenization.

    Args:
        opt: Options object whose ``src`` and ``trg`` attributes are callables
            mapping a string to an iterable of tokens exposing ``.text``
            (``read_data`` sets them to the loaded spaCy pipelines).

    Returns:
        A ``(SRC, TRG)`` tuple of dicts with the keys ``'tokenizer'``,
        ``'vocab'``, ``'init_token'``, ``'eos_token'`` and ``'lower'``.
    """
    def _make_field(pipeline):
        def tokenize(text):
            return [tok.text for tok in pipeline(text)]

        return {
            'tokenizer': tokenize,
            # Left empty here; nothing in this function builds a vocabulary.
            'vocab': None,
            'init_token': '<sos>',
            'eos_token': '<eos>',
            # NOTE: recorded as a flag only; the tokenizer above does not
            # lowercase its input.
            'lower': True,
        }

    # Tokenize with the loaded pipelines (opt.src / opt.trg), not with the
    # corpus file paths (opt.src_data / opt.trg_data), which are plain strings
    # and cannot be called.
    SRC = _make_field(opt.src)
    TRG = _make_field(opt.trg)
    return SRC, TRG

def create_dataset(opt, SRC, TRG):
    """Tokenize the corpus, split it 80/20 and wrap both parts in DataLoaders.

    Args:
        opt: Options object providing ``src_sentences``, ``trg_sentences`` and
            the batch size (``batchsize``, or ``batch_size`` as a fallback).
        SRC: Source field dict; only its ``'tokenizer'`` entry is used.
        TRG: Target field dict; only its ``'tokenizer'`` entry is used.

    Returns:
        A ``(trainloader, testloader)`` tuple of ``DataLoader`` objects over
        ``[source_tokens, target_tokens]`` pairs.
    """
    src_tokenized = [SRC['tokenizer'](sentence.strip()) for sentence in opt.src_sentences]
    trg_tokenized = [TRG['tokenizer'](sentence.strip()) for sentence in opt.trg_sentences]

    # Fixed random_state keeps the split reproducible between runs.
    x_train, x_test, y_train, y_test = train_test_split(
        src_tokenized, trg_tokenized, test_size=0.2, random_state=42
    )

    train_data = [[src, trg] for src, trg in zip(x_train, y_train)]
    test_data = [[src, trg] for src, trg in zip(x_test, y_test)]

    # ``Opt`` spells the setting ``batchsize``; fall back to ``batch_size``
    # for option objects that use that spelling instead.
    batch_size = getattr(opt, 'batchsize', None)
    if batch_size is None:
        batch_size = opt.batch_size

    # Build DataLoaders for the training and test splits.
    # NOTE(review): samples are lists of token strings and no collate_fn is
    # supplied — confirm the default collation copes with sentences of
    # different lengths before iterating these loaders.
    trainloader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
    testloader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=batch_size)

    return trainloader, testloader





In [7]:
#Hàm main
def main():
    """Run the whole pipeline: options, data, model, optimizer, training.

    Settings come from the ``Opt`` defaults; override attributes on ``opt``
    right after it is created to change them.
    """
    opt = Opt()

    # Honour the ``no_cuda`` switch; with its default (False) the first CUDA
    # device is used, exactly as before.
    opt.device = "cpu" if opt.no_cuda else 0
    if opt.device == 0:
        assert torch.cuda.is_available(), "CUDA device requested but not available"

    read_data(opt)
    SRC, TRG = create_fields(opt)  # source, target
    # create_dataset returns (trainloader, testloader); train_model iterates
    # opt.train, so only the training loader may be stored there.
    opt.train, opt.test = create_dataset(opt, SRC, TRG)

    # SRC and TRG are dicts, so the vocabulary is read by key.
    # NOTE(review): create_fields leaves 'vocab' as None; it must be populated
    # before this point or len() will fail.
    model = get_model(opt, len(SRC['vocab']), len(TRG['vocab'])).to(opt.device)

    opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

    if opt.checkpoint > 0:
        print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))

    train_model(model, opt)

    # TODO: to ask whether to keep training, prompt inside a `while True` loop and return to stop.
# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()



FileNotFoundError: [Errno 2] No such file or directory: 'data/english.txt'