<a href="https://colab.research.google.com/github/yOsmanthus/study_LLM/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
from tkinter import Variable
import math
import torch.nn as nn

class PostionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        self.d_model=d_model
        pe=torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe=pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        x = x + Variable(self.pe[:, :seq_len], requires_grad=False)
        return x

In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
        output = torch.matmul(scores, v)
        return output

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        q = self.q_linear(q).view(bs, -1,self.h, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output


In [3]:
class FeedForward(nn.Module):
  def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

  def forward(self, x):
      x = self.dropout(F.relu(self.linear1(x)))
      x = self.linear2(x)
      return x

In [4]:
class Norm(nn.Module):
  def __init__(self, d_model, eps=1e-6):
      super().__init__()
      self.size = d_model
      self.alpha = nn.Parameter(torch.ones(self.size))
      self.bias = nn.Parameter(torch.zeros(self.size))
      self.eps = eps

  def forward(self, x):
      norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
      return norm

In [5]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm1 = Norm(d_model)
        self.norm2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output=self.attn(x, x, x, mask)
        x = x + self.dropout_1(attn_output)
        x = self.norm1(x)
        ff_output=self.ff(x)
        x = x + self.dropout_2(ff_output)
        x = self.norm2(x)
        return x

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads,dropout=0.1):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, mask):
        x=self.embed(src)
        x=self.pe(x)
        for i in range(self.N):
            x=self.layers[i](x,mask)
        return self.norm(x)

In [6]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm1 = Norm(d_model)
        self.norm2 = Norm(d_model)
        self.norm3 = Norm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.attn1 = MultiHeadAttention(heads, d_model)
        self.attn2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)

    def forward(self, x, enc_outputs, src_mask, tgt_mask):
        attn1_output=self.attn1(x, x, x, tgt_mask)
        attn1_output=self.dropout1(attn1_output)
        x=x+attn1_output
        x=self.norm1(x)

        attn2_output=self.attn2(x, enc_outputs, enc_outputs, src_mask)
        attn2_output=self.dropout2(attn2_output)
        x=x+attn2_output
        x=self.norm2(x)

        ff_output=self.ff(x)
        ff_output=self.dropout3(ff_output)
        x=x+ff_output
        x=self.norm3(x)

        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout=0.1):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model,dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm= Norm(d_model)

    def forward(self, trg, enc_outputs, src_mask, tgt_mask):
        x=self.embed(trg)
        x=self.pe(x)
        for i in range(self.N):
            x=self.layers[i](x,enc_outputs,src_mask,tgt_mask)
        return self.norm(x)

In [7]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, N, heads, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab_size, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg, src_mask, tgt_mask):
        enc_outputs = self.encoder(src, src_mask)
        dec_output = self.decoder(trg, enc_outputs, src_mask, tgt_mask)
        output = self.out(dec_output)
        return output

In [9]:
!pip install torchtext==0.6.0

[0mCollecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.18.0
    Uninstalling torchtext-0.18.0:
      Successfully uninstalled torchtext-0.18.0
Successfully installed torchtext-0.6.0


In [7]:

from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

# 定义源语言和目标语言的字段
SRC = Field(tokenize='spacy', tokenizer_language='en_core_web_sm', init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize='spacy', tokenizer_language='fr_core_news_sm', init_token='<sos>', eos_token='<eos>', lower=True)

# 加载数据集
train_data, valid_data, test_data = Multi30k.splits(exts=('.en', '.fr'), fields=(SRC, TRG))

# 构建词汇表
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# 创建数据迭代器
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    device=device  # 设定使用的设备，CPU或GPU
)

# 打印一些示例
for batch in train_iter:
    print(batch.English)  # 源语言
    print(batch.French)   # 目标语言
    break  # 只打印第一个批次



OSError: /usr/local/lib/python3.10/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [6]:
!pip install torchtext==0.17.0



Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting torch==2.2.0 (from torchtext==0.17.0)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0->torcht

In [1]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

d_model = 513
heads = 8
N = 6
src_vocab = 80
trg_vocab = 80

model = Transformer(src_vocab, trg_vocab, d_model, N, heads)

# 初始化模型参数
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def train_model(epochs, print_every=100):
    model.train()
    start = time.time()
    temp = start
    total_loss = 0

    for epoch in range(epochs):
        for i, batch in enumerate(train_iter):
            arc = batch.English.transpose(0, 1)
            trg = batch.French.transpose(0, 1)

            trg_input = trg[:, :-1]
            targets = trg[:, 1:].contiguous().view(-1)
            src_mask, tgt_mask = create_mask(arc, trg_input)

            preds = model(arc, trg_input, src_mask, tgt_mask)
            optim.zero_grad()

            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), targets, ignore_index=0)
            loss.backward()

            total_loss += loss.item()  # 修改为 loss.item()
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time=%dm, epoch %d, iter=%d, loss=%.3f, %ds per %d iters" % (
                    (time.time() - start) / 60, epoch + 1, i + 1, loss_avg,
                    (time.time() - temp) / print_every, print_every))
                total_loss = 0
                temp = time.time()

# 假设 train_iter 和 create_mask 已正确定义


def translate(model,src,max_len=80,custom_string=False):
    model.eval()
    if custom_string:
        src=tokenize_en(src)
        src=Variable(torch.LongTensor([[EN_TEXT.vocab.stoi[t] for t in src]])).cuda()
        src_mask=(src!=input_pad).unsqueeze(-2)
        e_outputs=model.encoder(src,src_mask)
        outputs=torch.zeros(max_len).type_as(src.data)
        outputs[0]=torch.LongTensor([FR_TEXT.vocab.stoi["<sos>"]])

    for i in range(1,max_len):
        trg_mask=np.triu(np.ones((1,1,i,i)),k=1).astype('uint8')
        trg_mask=Variable(torch.from_numpy(trg_mask)==0).cuda()
        out=model.out(model.decoder(outputs[:i].unsqueeze(0),e_outputs,src_mask,trg_mask))
        out=F.softmax(out,dim=-1)
        val,ix=out[:,-1,:].data.topk(1)
        outputs[i]=ix[0][0]
        if ix[0][0]==FR_TEXT.vocab.stoi["<eos>"]:
            break
    return ' '.join([FR_TEXT.vocab.itos[i] for i in outputs[:i+1]])

NameError: name 'EN_TEXT' is not defined