In [12]:
import torch.nn as nn
import math
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
from torch.utils.data import DataLoader
import tqdm
from Datasets import Datasets

dataset = Datasets("C:\Attention\data\\train.txt")

dataset.bulid_vocab(dataset.en_data,dataset.ch_data)

dataloader = DataLoader(dataset, batch_size=16, num_workers=0,collate_fn=dataset.collate_fn)


maxlen = 128
d_model = 512
units = 512
dropout_rate = 0.2
numofblock = 4
numofhead = 4
# encoder_vocab = len(dataset.ch_vocab)
vocab_size = len(dataset.en_vocab)
epochs = 10

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_padding_mask(seq_q,seq_k):
    # print(seq_k.shape)
    # print(seq_q.shape)
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    padding_mask = seq_k.data.eq(1).unsqueeze(1)
    return padding_mask.expand(batch_size,len_q,len_k)


class TokenEmbedding(nn.Module):
    def __init__(self,vocab_size,emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,emb_size)

    def forward(self,x):
        # print(x.shape)

        return self.embedding(x).to(DEVICE) # shape = (batch_size,input_seq_len,emb_dim)


class MultiHeadAttention(nn.Module):
    def __init__(self, num_units, num_heads, dropout_rate, mask=False):
        super().__init__()
        self.num_units = num_units
        self.num_head = num_heads
        self.dropout_rate = dropout_rate
        self.mask = mask
        self.linearQ = nn.Linear(self.num_units,self.num_units)
        self.linearK = nn.Linear(self.num_units,self.num_units)
        self.linearV = nn.Linear(self.num_units,self.num_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(self.dropout_rate)
        self.LayerNormalization = nn.LayerNorm(d_model)
        self.Q = nn.Sequential(self.linearQ,self.relu)
        self.K = nn.Sequential(self.linearK,self.relu)
        self.V = nn.Sequential(self.linearV,self.relu)


    def forward(self, queries, keys, values, self_padding_mask, enc_dec_padding_mask):
        '''
        :param queries: shape:(batch_size,input_seq_len,d_model)
        :param keys: shape:(batch_size,input_seq_len,d_model)
        :param values: shape:(batch_size,input_seq_len,d_model)
        :return: None
        '''
        q, k, v = self.Q(queries), self.K(keys), self.V(values)

        q_split, k_split, v_split = torch.chunk(q,self.num_head,dim=-1), torch.chunk(k,self.num_head,dim=-1), torch.chunk(v,self.num_head,dim=-1)
        q_, k_, v_ = torch.stack(q_split,dim=1), torch.stack(k_split,dim=1), torch.stack(v_split,dim=1)
        # shape : (batch_size, num_head, input_seq_len, depth = d_model/num_head)
        a = torch.matmul(q_,k_.permute(0,1,3,2)) # a = q * k^T(后两个维度)
        a = a / (k_.size()[-1] ** 0.5) # shape:(batch_size,num_head,seq_len,seq_len)
        batch_size_shape = a.shape[0]
        seq_len_shape = a.shape[2]
        if self.mask:
            self_padding_mask = self_padding_mask.unsqueeze(1).repeat(1, self.num_head, 1, 1)
            masked = torch.ones((batch_size_shape,1,seq_len_shape,seq_len_shape))
            masked = Variable((1 - torch.tril(masked, diagonal=0)) * (-2 ** 32 + 1)).to(DEVICE)

            assert masked.shape[-1] == self_padding_mask.shape[-1]
            a = a + masked
            a.masked_fill_(self_padding_mask,-1e9)
        else:
            enc_dec_padding_mask = enc_dec_padding_mask.unsqueeze(1).repeat(1, self.num_head, 1, 1)
            a.masked_fill_(enc_dec_padding_mask,-1e9)

        a = F.softmax(a,dim=-1)

        a = torch.matmul(a,v_)
        a = torch.reshape(a.permute(0, 2, 1, 3), shape=(q.shape[0],q.shape[1],q.shape[2]))
        a = self.dropout(a)
        a += queries
        a = self.LayerNormalization(a).to(DEVICE)
        return a


class FC(nn.Module):
    def __init__(self,input_channels,units=(2048,512)):
        super().__init__()
        self.input_channels = input_channels
        self.units = units
        self.layer1 = nn.Linear(self.input_channels,units[0])
        self.layer2 = nn.Linear(self.units[0],self.units[1])
        self.relu = nn.ReLU()
        self.LayerNormalization = nn.LayerNorm(d_model)


    def forward(self,x):
        outputs = self.layer1(x)
        outputs = self.relu(outputs)
        outputs = self.layer2(outputs)
        outputs += x
        outputs = self.LayerNormalization(outputs)
        return outputs.to(DEVICE)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model=d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x: [seq_len, batch_size, d_model]
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.mask_self_attention = MultiHeadAttention(units,numofhead,dropout_rate,True)
        self.fc = FC(d_model)

    def forward(self,inputs,padding_mask):
        outputs = self.mask_self_attention(inputs,inputs,inputs,padding_mask,None)
        outputs = self.fc(outputs)
        return outputs.to(DEVICE)



class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(units,numofhead,dropout_rate,mask=False)
        self.fc = FC(d_model)

    def forward(self,enc_outputs,padding_mask):
        # enc_outputs = self.self_attention(enc_outputs,enc_outputs,enc_outputs,None,padding_mask)
        outputs = self.fc(enc_outputs)
        return outputs.to(DEVICE)



class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(numofblock)])


    def forward(self,x,padding_mask):
        for layer in self.layers:
            x = layer(x,padding_mask)
        return x


class Encoder(nn.Module):
    def __init__(self,vocab_size):
        super(Encoder, self).__init__()
        self.pe = PositionalEncoding()
        self.embedding = TokenEmbedding(vocab_size,units)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(numofblock)])


    def forward(self,inputs):
        outputs = self.embedding(inputs)
        outputs = self.pe(outputs.transpose(0, 1)).transpose(0, 1)

        padding_mask = get_padding_mask(inputs,inputs)
        for layer in self.layers:
            outputs = layer(outputs,padding_mask)
        return outputs,padding_mask



class CTG(nn.Module):
    def __init__(self,vocab_size):
        super(CTG, self).__init__()
        self.Encoder = Encoder(vocab_size)
        self.Decoder = Decoder()
        self.linear = nn.Linear(d_model,vocab_size)

    def forward(self,x,epoch=None):
        enc_outputs,padding_mask = self.Encoder(x)
        # print(enc_outputs.shape)
        enc_outputs = self.Decoder(enc_outputs,padding_mask)
        logits = self.linear(enc_outputs)
        # if epoch == 9:
        #     print(logits)
        logits = logits.view(-1, logits.size(-1))
        return logits

model = CTG(vocab_size).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=1)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)
# #
for epoch in tqdm.tqdm(range(epochs)):
    total = []
    for _,dec_inputs,dec_outputs in dataloader:

        dec_inputs,dec_outputs= dec_inputs.to(DEVICE),dec_outputs.to(DEVICE)
        # for i in dec_inputs:
        #     print(dataset.idx2enwords(i))
        outputs = model(dec_inputs,epoch)

        loss = criterion(outputs,dec_outputs.contiguous().view(-1))
        optimizer.zero_grad()
        loss.backward()
        total.append(loss)
        optimizer.step()
    print(sum(total)/len(total))


 10%|█         | 1/10 [00:37<05:36, 37.36s/it]

tensor(5.7588, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 2/10 [01:14<05:00, 37.51s/it]

tensor(4.8137, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 3/10 [01:52<04:23, 37.58s/it]

tensor(4.4237, device='cuda:0', grad_fn=<DivBackward0>)


 40%|████      | 4/10 [02:30<03:45, 37.65s/it]

tensor(4.1544, device='cuda:0', grad_fn=<DivBackward0>)


 50%|█████     | 5/10 [03:08<03:08, 37.67s/it]

tensor(3.9464, device='cuda:0', grad_fn=<DivBackward0>)


 60%|██████    | 6/10 [03:45<02:30, 37.71s/it]

tensor(3.7600, device='cuda:0', grad_fn=<DivBackward0>)


 70%|███████   | 7/10 [04:23<01:53, 37.71s/it]

tensor(3.5670, device='cuda:0', grad_fn=<DivBackward0>)


 80%|████████  | 8/10 [05:01<01:15, 37.71s/it]

tensor(3.3860, device='cuda:0', grad_fn=<DivBackward0>)


 90%|█████████ | 9/10 [05:39<00:37, 37.73s/it]

tensor(3.2031, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 10/10 [06:16<00:00, 37.69s/it]

tensor(3.0372, device='cuda:0', grad_fn=<DivBackward0>)





In [13]:

def greedy_decoder(model, start_symbol):
    """贪心编码
    For simplicity, a Greedy Decoder is Beam search when K=1. This is necessary for inference as we don't know the
    target sequence input. Therefore we try to generate the target input word by word, then feed it into the transformer.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding
    :param model: Transformer Model
    :param enc_input: The encoder input
    :param start_symbol: The start symbol. In this example it is 'S' which corresponds to index 4
    :return: The target input
    """
    inputs = torch.zeros(1, 0).long()
    terminal = False
    next_symbol = start_symbol
    while not terminal:
        # 预测阶段：inputs序列会一点点变长（每次添加一个新预测出来的单词）
        inputs = torch.cat([inputs.to(DEVICE), torch.tensor([[next_symbol]], dtype=inputs.dtype).to(DEVICE)],
                              -1)
        # print("inputs:")
        # print(inputs)
        dec_outputs,_ = model.Encoder(inputs)
        dec_outputs = model.Decoder(dec_outputs,_)
        dec_outputs = model.linear(dec_outputs)
        # projected = model.linear(dec_outputs)
        prob = dec_outputs.squeeze(0).max(dim=-1, keepdim=False)[1]
        # print("prob:")
        # print(dataset.idx2enwords(prob))
        # 增量更新（我们希望重复单词预测结果是一样的）
        # 我们在预测是会选择性忽略重复的预测的词，只摘取最新预测的单词拼接到输入序列中
        next_word = prob.data[-1]  # 拿出当前预测的单词(数字)。我们用x'_t对应的输出z_t去预测下一个单词的概率，不用z_1,z_2..z_{t-1}
        next_symbol = next_word
        # print(dataset.idx2en(next_word))
        if next_symbol == dataset.en_vocab["<eos>"]:
            terminal = True
        # print(next_word)

    # greedy_dec_predict = torch.cat(
    #     [inputs.to(device), torch.tensor([[next_symbol]], dtype=enc_input.dtype).to(device)],
    #     -1)
    greedy_dec_predict = inputs[:, 1:]
    return greedy_dec_predict

for i in range(20):
    greedy_dec_predict = greedy_decoder(model, start_symbol=dataset.en_vocab["<bos>"])
    # print(input[i], '->', greedy_dec_predict.squeeze())
    print(" ".join([dataset.idx2en(n.item()) for n in greedy_dec_predict.squeeze()]))

i don't have the address now.
i don't have to do it right away.
i don't think that she was a little cold.
i don't have a car.
i don't have the slightest idea.
i don't have the slightest idea.
i don't think that she will do this.
i don't think that she can speak english.
i don't have a car.
i don't have a car.
i don't have the slightest idea.
i don't have a car.
i don't think that can do any good.
i don't think that he will do it right away.
i don't have a car.
i don't think that can help you.
i don't think that she can speak english.
i don't have a car.
i don't think that can do any good.
i don't think that she can do it.


In [14]:
def top_k_logits(logits, k):
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time. Clearly the sampling
    has quadratic complexity unlike an RNN that is only linear, and has a finite context window
    of block_size, unlike an RNN that has an infinite context window.
    """
    block_size = 35
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        # print(x_cond)
        logits = model(x_cond)
        # pluck the logits at the final step and scale by temperature
        # logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        # append to the sequence and continue
        # print(dataset.idx2enwords(x[-1]))
        ix = ix[-1]
        # print(ix.shape)
        # print(dataset.idx2enwords(ix))
        ix = ix.unsqueeze(0)
        x = torch.cat((x, ix), dim=1)

    return x


In [15]:
x = dataset.words2idx("<bos> anyone can do".split(),'en').unsqueeze(0).to(DEVICE)
# x = torch.tensor([2,3,4], dtype=torch.long)[None, ...].to(DEVICE) # context conditioning
y = sample(model, x, steps=30, temperature=1.0, sample=True, top_k=None)[0]
print(dataset.idx2enwords(y))

<bos> anyone can do it? <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> on. <eos> something to page several books. <eos> something for a year due to the 98 the improve


In [17]:

def get_sequence():
    s = dataset.words2idx("<bos> a".split(),"en")
    s = s.unsqueeze(0).to(DEVICE)

    # print(s.shape)
    flag = True
    data = torch.tensor([]).long().to(DEVICE).unsqueeze(0)
    count = 0
    while flag:
        # print(s,data)
        s = torch.cat((s,data),dim=-1)
        dec_outputs = model(s.to(DEVICE),1)
        prob = F.softmax(dec_outputs, dim=-1)
        # prob = dec_outputs.squeeze(0)
        prob = torch.multinomial(prob, num_samples=1)
        # print(prob)
        data = prob[-1].unsqueeze(0)


#         print(data)
        count += 1
        if data == 3:
            flag = False
        if count == 20:
            flag = False
    # print()

    # print(prob)
#     print(s)
    # for i in prob:
    print(dataset.idx2enwords(s[-1]))
for i in range(10):
    get_sequence()

<bos> a car consists of people.
<bos> a doctor wouldn't do not make sense.
<bos> a prince has rose in the traffic accident.
<bos> a sudden out of four books have failed.
<bos> a famous slightly and some me?" the politician.
<bos> a fire broke out after the drowsy after the island.
<bos> a walk died.
<bos> a bird in my team air folded the park. you have to see you think.
<bos> a beautiful comic book judge
<bos> a pity of air curtain.
