In [40]:
import platform

import torch
import torch.nn as nn
import torch.nn.functional as F

In [41]:
import os

# Choose the torch device string used by the rest of the notebook.
if platform.system() == "Darwin":
    # The fallback switch is an *environment variable*; assigning a Python
    # variable with that name has no effect on PyTorch.
    # NOTE(review): PyTorch appears to read this flag when it is imported, so for
    # it to take effect it may have to be set before `import torch` - confirm.
    os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
    # Only pick "mps" when this torch build / machine actually provides it.
    mps_backend = getattr(torch.backends, "mps", None)
    device = "mps" if mps_backend is not None and mps_backend.is_available() else "cpu"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

In [42]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of the input.

    Pipeline: Linear(d_model -> hidden) -> ReLU -> Dropout -> Linear(hidden -> d_model).
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Args:
            d_model: size of the input and output feature dimension.
            hidden: size of the intermediate feature dimension.
            drop_prob: dropout probability applied after the activation.
        """
        super().__init__()
        # Layers are created in the same order as before so seeded
        # initialisation stays reproducible.
        self.fc1 = nn.Linear(in_features=d_model, out_features=hidden)
        self.fc2 = nn.Linear(in_features=hidden, out_features=d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Return the transformed tensor; the last dimension stays `d_model`."""
        expanded = self.relu(self.fc1(x))
        return self.fc2(self.dropout(expanded))

In [43]:
import math
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(q @ k^T / sqrt(dim)) @ v."""

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """Attend from `q` over `k`/`v`.

        Args:
            q, k, v: tensors laid out as [batch_size, head, length, dim], where
                dim is the per-head feature size of each token vector.
            mask: optional tensor broadcastable to the score matrix; positions
                where `mask == 0` are suppressed before the softmax.
            e: unused; kept so existing callers that pass it keep working.

        Returns:
            (context, score): the attention-weighted values and the attention
            weights (each row of `score` sums to 1 over the key axis).
        """
        dim = k.size(-1)

        # Keys must be transposed so that [.., len_q, dim] @ [.., dim, len_k]
        # yields a [.., len_q, len_k] score matrix. (The previous code built the
        # transpose but multiplied by the untransposed `k`.)
        k_t = k.transpose(-2, -1)
        score = (q @ k_t) / math.sqrt(dim)

        if mask is not None:
            # A large negative score drives the softmax weight to ~0.
            score = score.masked_fill(mask == 0, -10000)

        score = self.softmax(score)

        v = score @ v

        return v, score

In [44]:
class transformer_blocks(nn.Module):
    """Placeholder module: it registers no parameters or sub-modules and defines no forward."""

    def __init__(self):
        super().__init__()
        

In [45]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, merge heads, project again."""

    def __init__(self, d_model, n_head):
        """
        Args:
            d_model: feature size of the inputs and of the output.
            n_head: number of heads the feature dimension is divided into.
        """
        super().__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        # Projection layers are created in a fixed order (q, k, v, output) so
        # seeded initialisation is unchanged.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Return the attended representation; the attention weights are discarded."""
        q_heads = self.split(self.w_q(q))
        k_heads = self.split(self.w_k(k))
        v_heads = self.split(self.w_v(v))

        context, _ = self.attention(q_heads, k_heads, v_heads, mask=mask)

        merged = self.concat(context)
        return self.w_concat(merged)

    def split(self, tensor):
        """Reshape [batch, length, d_model] into [batch, n_head, length, d_model // n_head]."""
        n_batch, n_tokens, width = tensor.size()
        per_head = width // self.n_head
        by_head = tensor.view(n_batch, n_tokens, self.n_head, per_head)
        return by_head.transpose(1, 2)

    def concat(self, tensor):
        """Inverse of `split`: [batch, head, length, dim] -> [batch, length, head * dim]."""
        n_batch, n_heads, n_tokens, per_head = tensor.size()
        # transpose() leaves the tensor non-contiguous; view() needs contiguous memory.
        token_major = tensor.transpose(1, 2).contiguous()
        return token_major.view(n_batch, n_tokens, n_heads * per_head)

In [46]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift."""

    def __init__(self, d_model, eps=1e-12):
        """
        Args:
            d_model: size of the normalised (last) dimension.
            eps: constant added to the variance before the square root.
        """
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Normalise `x` along its last dimension, then apply gamma and beta."""
        centered = x - x.mean(-1, keepdim=True)
        # unbiased=False selects the biased (population) variance estimate.
        variance = x.var(-1, unbiased=False, keepdim=True)

        normalised = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalised + self.beta

In [47]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table.

    Even feature columns hold sin(pos / 10000^(2i / d_model)), odd columns hold
    the matching cos term.
    """

    def __init__(self, d_model, max_len, device):
        """
        Args:
            d_model: feature size of each position vector (even or odd).
            max_len: number of positions to precompute.
            device: device on which the table is created.
        """
        super(PositionalEncoding, self).__init__()

        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=-1)

        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # One angle column per sin slot: shape (max_len, ceil(d_model / 2)).
        angles = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cos slot fewer than sin slots, so drop the
        # surplus angle column instead of failing on a shape mismatch.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda(); persistent=False
        # keeps it out of state_dict, so existing checkpoints are unaffected.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the first `seq_len` rows of the table, where seq_len = x.size(1).

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.encoding.size(0)
        if seq_len > max_len:
            raise ValueError(
                "sequence length %d exceeds max_len %d" % (seq_len, max_len)
            )

        return self.encoding[:seq_len, :]

In [48]:
class TokenEmbedding(nn.Embedding):
    """Token-id -> vector lookup table placed on `device` at construction."""

    def __init__(self, vocab_size, d_model, device, padding_idx=1):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            d_model: size of each embedding vector.
            device: device the table is moved to.
            padding_idx: id of the padding token (its row is zero-initialised
                and receives no gradient). Defaults to 1, the value that used
                to be hard-coded; pass the tokenizer's real pad id when it
                differs.
        """
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=padding_idx)
        self.to(device)

In [49]:
class TransformerEmbedding(nn.Module):
    """Token embedding plus positional encoding, followed by dropout."""

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        Args:
            vocab_size: vocabulary size handed to the token embedding.
            d_model: feature size of both embeddings.
            max_len: number of positions handed to the positional encoding.
            drop_prob: dropout probability applied to the summed embedding.
            device: device handed to both sub-modules.
        """
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model, device)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed the token ids in `x` and add the positional term before dropout."""
        token_part = self.tok_emb(x)
        position_part = self.pos_emb(x)
        combined = token_part + position_part
        return self.drop_out(combined)

In [57]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual add and layer normalisation."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob, device):
        """
        Args:
            d_model: feature size of the layer's input and output.
            ffn_hidden: hidden size of the feed-forward sub-layer.
            n_head: number of attention heads.
            drop_prob: dropout probability used in both sub-layers.
            device: device the finished layer is moved to.
        """
        super().__init__()
        # Attention sub-layer.
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # Feed-forward sub-layer.
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.to(device)

    def forward(self, x, s_mask):
        """Run both sub-layers on `x`; `s_mask` is forwarded to the attention call."""
        attended = self.attention(q=x, k=x, v=x, mask=s_mask)
        hidden = self.norm1(self.dropout1(attended) + x)

        transformed = self.ffn(hidden)
        return self.norm2(self.dropout2(transformed) + hidden)
    
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, optional encoder-decoder attention and
    feed-forward, each followed by dropout, a residual add and layer normalisation."""

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        """
        Args:
            d_model: feature size of the layer's input and output.
            ffn_hidden: hidden size of the feed-forward sub-layer.
            n_head: number of attention heads in both attention sub-layers.
            drop_prob: dropout probability used in all three sub-layers.
        """
        super().__init__()
        # Self-attention sub-layer.
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # Encoder-decoder attention sub-layer.
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        # Feed-forward sub-layer.
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        """Transform the decoder state `dec`.

        Args:
            dec: decoder-side input to this layer.
            enc: encoder output attended to, or None to skip that sub-layer.
            t_mask: mask forwarded to the self-attention call.
            s_mask: mask forwarded to the encoder-decoder attention call.
        """
        attended = self.self_attention(q=dec, k=dec, v=dec, mask=t_mask)
        hidden = self.norm1(self.dropout1(attended) + dec)

        if enc is not None:
            crossed = self.enc_dec_attention(q=hidden, k=enc, v=enc, mask=s_mask)
            hidden = self.norm2(self.dropout2(crossed) + hidden)

        transformed = self.ffn(hidden)
        return self.norm3(self.dropout3(transformed) + hidden)

In [58]:
class Encoder(nn.Module):
    """Embedding layer followed by a stack of `n_layers` encoder layers."""

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        """
        Args:
            enc_voc_size: vocabulary size of the source embedding.
            max_len: number of positions handed to the embedding.
            d_model: feature size used throughout the stack.
            ffn_hidden: hidden size of each layer's feed-forward part.
            n_head: number of attention heads per layer.
            n_layers: number of stacked encoder layers.
            drop_prob: dropout probability for the embedding and every layer.
            device: device handed to the embedding and every layer.
        """
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=enc_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob,
                                        device=device)

        stack = []
        for _ in range(n_layers):
            stack.append(EncoderLayer(d_model=d_model,
                                      ffn_hidden=ffn_hidden,
                                      n_head=n_head,
                                      drop_prob=drop_prob,
                                      device=device))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, s_mask=None):
        """Embed `x`, then pass it through every layer with the same `s_mask`."""
        hidden = self.emb(x)
        for block in self.layers:
            hidden = block(hidden, s_mask)
        return hidden

In [59]:
class Decoder(nn.Module):
    """Embedding, a stack of `n_layers` decoder layers, and a final projection
    from `d_model` to the target vocabulary size."""

    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        """
        Args:
            dec_voc_size: target vocabulary size (embedding rows and output width).
            max_len: number of positions handed to the embedding.
            d_model: feature size used throughout the stack.
            ffn_hidden: hidden size of each layer's feed-forward part.
            n_head: number of attention heads per layer.
            n_layers: number of stacked decoder layers.
            drop_prob: dropout probability for the embedding and every layer.
            device: device the whole decoder is placed on.
        """
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)
        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, dec_voc_size)

        # DecoderLayer takes no device argument and `fc` is created on the
        # default device, so without this move only the embedding would live on
        # `device` and the first layer call would hit a device mismatch.
        self.to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Return per-position scores over the target vocabulary.

        Args:
            trg: target token ids fed to the embedding.
            enc_src: encoder output passed to every layer.
            trg_mask: mask passed to every layer's self-attention.
            src_mask: mask passed to every layer's encoder-decoder attention.
        """
        trg = self.emb(trg)

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc(trg)
        return output

In [60]:
class Transformer(nn.Module):
    """Encoder-decoder model that builds its own padding and causal masks."""

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                ffn_hidden, n_layers, drop_prob, device):
        """
        Args:
            src_pad_idx: padding token id in source batches.
            trg_pad_idx: padding token id in target batches.
            trg_sos_idx: stored on the instance; not read by the methods of this class.
            enc_voc_size: source vocabulary size.
            dec_voc_size: target vocabulary size.
            d_model: feature size shared by encoder and decoder.
            n_head: number of attention heads.
            max_len: number of positions supported by the embeddings.
            ffn_hidden: hidden size of the feed-forward sub-layers.
            n_layers: number of layers in the encoder and in the decoder.
            drop_prob: dropout probability.
            device: device the whole model is placed on.
        """
        super(Transformer, self).__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        # Make sure every registered parameter ends up on the requested device,
        # including sub-modules that do not move themselves.
        self.to(device)

    def forward(self, src, trg):
        """Return the decoder output for source ids `src` and target ids `trg`.

        Both arguments are indexed as [batch, length] by the mask helpers.
        """
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)
        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)
        # Both factors are boolean, so combine them with a logical AND.
        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) & self.make_no_peak_mask(trg, trg)

        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output

    def make_pad_mask(self, q, k, q_pad_idx, k_pad_idx):
        """Boolean mask [batch, 1, len_q, len_k]: True where neither the query
        token nor the key token is padding."""
        q_keep = q.ne(q_pad_idx).unsqueeze(1).unsqueeze(3)  # [batch, 1, len_q, 1]
        k_keep = k.ne(k_pad_idx).unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, len_k]

        # Broadcasting expands both operands to [batch, 1, len_q, len_k].
        return q_keep & k_keep

    def make_no_peak_mask(self, q, k):
        """Boolean lower-triangular mask [len_q, len_k]: position i may attend
        to positions <= i."""
        len_q, len_k = q.size(1), k.size(1)
        # Build the mask on the input's own device so it can always be combined
        # with the padding mask derived from the same tensor.
        mask = torch.tril(torch.ones(len_q, len_k, device=q.device)).bool()
        return mask

In [32]:
# Exploratory hyper-parameters. d_model, max_len and vocab_size are assigned
# again in the later configuration cell that precedes the model construction.
vocab_size = 10000
max_len = 512
d_model = 768
ffn_hidden = 256
# NOTE(review): 0.9 drops 90% of activations; confirm this is intended.
drop_prob = 0.9
n_head = 12
n_layers = 12
# The misspelled duplicate `drop_porb` was removed; nothing in the notebook reads it.

In [None]:
import spacy

class Tokenizer:
    """Splits German and English text into token strings using spaCy pipelines."""

    def __init__(self):
        """Load the 'de_core_news_sm' and 'en_core_web_sm' spaCy pipelines."""
        self.spacy_de = spacy.load('de_core_news_sm')
        self.spacy_en = spacy.load('en_core_web_sm')

    def tokenizer_de(self, text):
        """Return the text of every token the German pipeline's tokenizer yields."""
        german_tokens = self.spacy_de.tokenizer(text)
        return list(token.text for token in german_tokens)

    def tokenizer_en(self, text):
        """Return the text of every token the English pipeline's tokenizer yields."""
        english_tokens = self.spacy_en.tokenizer(text)
        return list(token.text for token in english_tokens)

In [31]:
# NOTE: this call fails as written. Transformer.__init__ declares no default
# values, so all 12 constructor arguments are required (see the recorded
# TypeError). A fully specified construction appears in a later cell.
model = Transformer()

TypeError: __init__() missing 12 required positional arguments: 'src_pad_idx', 'trg_pad_idx', 'trg_sos_idx', 'enc_voc_size', 'dec_voc_size', 'd_model', 'n_head', 'max_len', 'ffn_hidden', 'n_layers', 'drop_prob', and 'device'

In [66]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [67]:
# Load the pretrained "Helsinki-NLP/opus-mt-en-zh" seq2seq checkpoint (the
# recorded output shows it being downloaded). The name `model` is rebound to
# the hand-written Transformer in a later cell.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

Downloading:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# Load the tokenizer that belongs to the translation checkpoint.
# Previously tried checkpoint, kept for reference:
# checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
checkpoint = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [33]:
# Show the id this tokenizer uses for padding (recorded output: 65000).
tokenizer.pad_token_id

65000

In [20]:
# Two example sentences of different length, tokenized as one batch.
raw_inputs = [
    "I've been waiting for a this course my whole life.",
    "I hate this so much!",
]
# padding=True pads the shorter sentence (the recorded output shows it filled
# with id 65000 and attention_mask 0); return_tensors="pt" yields torch tensors.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

In [31]:
# Display the tokenized batch: `input_ids` and `attention_mask`.
inputs

{'input_ids': tensor([[   28,    23,   411,    74,  6783,    15,    13,    58,  1306,   162,
          1709,   684,     6,     0],
        [   28,  7781,    58,   194,   961,    50,     0, 65000, 65000, 65000,
         65000, 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [61]:
# Special-token ids and vocabulary sizes read from the loaded tokenizer.
# NOTE(review): the Transformer construction cell passes `pad_token_id`,
# `de_start_token_idx` and `vocab_size` (defined further down) instead of the
# five names in this group - confirm which set is meant to be used.
src_pad_idx = tokenizer.pad_token_id
trg_pad_idx = tokenizer.pad_token_id
# NOTE(review): the start-of-sequence index is set to the *eos* token id - confirm.
trg_sos_idx = tokenizer.eos_token_id

enc_voc_size = tokenizer.vocab_size
dec_voc_size = tokenizer.vocab_size

# Hand-entered architecture settings - presumably copied from the pretrained
# checkpoint's configuration; verify against it.
# The decoder_* and *_layerdrop values are not read by any cell shown here.
d_model = 512
decoder_attention_heads = 8
decoder_ffn_dim = 2048
decoder_layerdrop = 0
decoder_layers = 6
# Same value as pad_token_id below.
de_start_token_idx = 65000

encoder_attention_heads = 8
encoder_ffn_dim = 2048
encoder_layerdrop = 0
encoder_layers = 6
eos_token_id = 0

max_len = 512
# Hard-coded; matches the value tokenizer.pad_token_id displayed earlier (65000).
pad_token_id = 65000
vocab_size = 65001

In [62]:
# Build the hand-written Transformer. Its constructor takes a single n_head /
# ffn_hidden / n_layers, so the encoder_* settings apply to both the encoder
# and the decoder. drop_prob=0 disables dropout.
# NOTE(review): TransformerEmbedding builds TokenEmbedding without a padding
# index, so the embedding's padding row is 1 while pad_token_id here is 65000.
model = Transformer(src_pad_idx=pad_token_id, 
                    trg_pad_idx=pad_token_id, 
                    trg_sos_idx=de_start_token_idx, 
                    enc_voc_size=vocab_size,
                    dec_voc_size=vocab_size,
                    d_model=d_model, 
                    n_head=encoder_attention_heads, 
                    max_len=max_len,
                    ffn_hidden=encoder_ffn_dim,
                    n_layers=encoder_layers,
                    drop_prob=0,
                    device=device)

In [64]:
# Keep only the token-id tensor of the tokenized batch and display it.
inputs_data = inputs['input_ids']
inputs_data

tensor([[   28,    23,   411,    74,  6783,    15,    13,    58,  1306,   162,
          1709,   684,     6,     0],
        [   28,  7781,    58,   194,   961,    50,     0, 65000, 65000, 65000,
         65000, 65000, 65000, 65000]])

In [65]:
# NOTE: this call fails as written. Transformer.forward is declared as
# forward(src, trg), so a target batch must be passed as well (see the
# recorded TypeError).
# NOTE(review): `inputs_data` is not moved to `device` anywhere in the cells
# shown, while the model is built with `device` - confirm they match.
model(inputs_data)

TypeError: forward() missing 1 required positional argument: 'trg'