In [19]:
import torch

# 准备数据
# 先用这个数据作为训练数据，模型训练完成之后，
# 向模型输入这个source_sentence,模型能输出target_sentence 那就很满足。
from transformer.MyTransformer import MyTransformer

source_sentences = ["你真是个傻逼", "你咋这么能呢", "我将带头冲锋", "在东南亚打自由搏击", "铃儿响叮当"]
target_sentences = ["你真是个小可爱", "你就是个菜鸡", "我先撤你们上", "这货真牛逼", "今天真开心"]

sequence_empty_pad = "[PAD]"
sequence_start_pad = "[CLS]"
sequence_end_pad = "[SEP]"

sequence_max_length = 12


In [20]:

# 建立词典
# 提取所有字符
all_characters = set()
all_characters.add(sequence_empty_pad)
all_characters.add(sequence_start_pad)
all_characters.add(sequence_end_pad)
for sentence in source_sentences + target_sentences:
    for char in sentence:
        all_characters.add(char)

# 去重并排序
sorted_characters = sorted(all_characters)

vocabulary_size = len(sorted_characters)  # 词汇表大小

# 构建字典
char_to_index = {char: idx for idx, char in enumerate(sorted_characters)}
index_to_char = {idx: char for idx, char in enumerate(sorted_characters)}

pad_index = char_to_index[sequence_empty_pad]


def tokenize(sequence):
    return [token for token in sequence]


def convert_tokens_to_ids(tokens):
    return [char_to_index[token_item] for token_item in tokens]


def convert_ids_to_tokens(ids):
    return [index_to_char[id_item] for id_item in ids]


# 打印结果
print("字符字典:")
print(char_to_index)


字符字典:
{'[CLS]': 0, '[PAD]': 1, '[SEP]': 2, '上': 3, '东': 4, '个': 5, '么': 6, '亚': 7, '今': 8, '们': 9, '你': 10, '傻': 11, '儿': 12, '先': 13, '冲': 14, '击': 15, '南': 16, '叮': 17, '可': 18, '呢': 19, '咋': 20, '响': 21, '在': 22, '天': 23, '头': 24, '将': 25, '小': 26, '就': 27, '带': 28, '开': 29, '当': 30, '心': 31, '我': 32, '打': 33, '搏': 34, '撤': 35, '是': 36, '爱': 37, '牛': 38, '由': 39, '真': 40, '能': 41, '自': 42, '菜': 43, '货': 44, '这': 45, '逼': 46, '铃': 47, '锋': 48, '鸡': 49}


In [21]:
vocabulary_size

50

In [22]:

model = MyTransformer(i_vocabulary_size=vocabulary_size,
                      t_vocabulary_size=vocabulary_size,
                      src_pad_idx=pad_index,
                      tgt_pad_idx=pad_index,
                      d_model=16,
                      num_heads=8,
                      num_encoder_layers=6,
                      num_decoder_layers=6,
                      dim_feedforward=64,
                      dropout=0.1
                      )

batch_size = 3


cpu


In [32]:

def get_tokens_batch():
    ixs = torch.randint(0, len(source_sentences), (batch_size,))
    print(ixs)
    source_tokens_batch = []
    target_tokens_batch = []
    for i in ixs:
        source_tokens = ['[CLS]'] + tokenize(source_sentences[i]) + ['[SEP]']
        if len(source_tokens) < sequence_max_length:
            # 填充（PAD）并创建注意力掩码
            padding_length_source = sequence_max_length - len(source_tokens)
            source_tokens += [sequence_empty_pad] * padding_length_source
        source_tokens_batch.append(source_tokens)

        target_tokens = ['[CLS]'] + tokenize(target_sentences[i]) + ['[SEP]']
        if len(target_tokens) < sequence_max_length:
            # 填充（PAD）并创建注意力掩码
            padding_length_target = sequence_max_length - len(target_tokens)
            target_tokens += [sequence_empty_pad] * padding_length_target
        target_tokens_batch.append(target_tokens)

    return source_tokens_batch, target_tokens_batch


x, y = get_tokens_batch()


tensor([3, 0, 4])


In [33]:
x, y = get_batch()

tensor([4, 3, 0])


In [34]:
x

[['[CLS]',
  '铃',
  '儿',
  '响',
  '叮',
  '当',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['[CLS]', '在', '东', '南', '亚', '打', '自', '由', '搏', '击', '[SEP]', '[PAD]'],
 ['[CLS]',
  '你',
  '真',
  '是',
  '个',
  '傻',
  '逼',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]']]

In [35]:
y

[['[CLS]',
  '今',
  '天',
  '真',
  '开',
  '心',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['[CLS]',
  '这',
  '货',
  '真',
  '牛',
  '逼',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['[CLS]',
  '你',
  '真',
  '是',
  '个',
  '小',
  '可',
  '爱',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]']]

In [43]:

def embedding(tokens_batch):
    return torch.tensor([convert_tokens_to_ids(tokens) for tokens in tokens_batch])


x_embedding = embedding(x)
y_embedding = embedding(y)



In [46]:
x_embedding.shape

torch.Size([3, 12])

In [47]:
y_embedding.shape

torch.Size([3, 12])

In [49]:


def create_trg_self_mask(target_len, device=None):
    # Prevent leftward information flow in self-attention.
    ones = torch.ones(target_len, target_len, dtype=torch.uint8,
                      device=device)
    t_self_mask = torch.triu(ones, diagonal=1).unsqueeze(0)

    return t_self_mask


In [52]:
t_self = create_trg_self_mask(12)

In [53]:
t_self.unsqueeze(1)

tensor([[[[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]], dtype=torch.uint8)

In [54]:
t_self

tensor([[[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)

In [60]:
import numpy as np
def get_attn_subsequent_mask(seq):
  '''
  Build attention mask matrix for decoder when it autoregressing.

  parameters:
  seq: [batch, target_len]

  return:
  subsequent_mask: [batch, target_len, target_len] 
  '''
  attn_shape = [seq.size(0), seq.size(1), seq.size(1)] # [batch, target_len, target_len]
  subsequent_mask = np.triu(np.ones(attn_shape), k=1) # [batch, target_len, target_len] 
  subsequent_mask = torch.from_numpy(subsequent_mask)

  return subsequent_mask # [batch, target_len, target_len] 

In [62]:
np.ones([1,2,3])

array([[[1., 1., 1.],
        [1., 1., 1.]]])

torch.Size([36])

In [65]:
 y_embedding[:, 1:].shape

torch.Size([3, 11])

In [66]:
convert_tokens_to_ids(sequence_start_pad)

KeyError: '['