> 训练语料: [THUCTC/THUCNews/游戏/](http://thuctc.thunlp.org/#%E8%8E%B7%E5%8F%96%E9%93%BE%E6%8E%A5), 剔除字频不超过100的字符(代码中的过滤条件为 `num > MIN_FREQ`), 其余字符全部保留.<br>
> 总参数量 1300万, 在一张RTX3060上,训练了48小时左右.

In [1]:
import glob, json
import os, time, datetime, sys, signal, pickle
from collections import Counter
import math

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

# from optimizer import Adam16

## 点积缩放注意力

In [2]:
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size, n_embd, dropout, block_size):
        super().__init__()
        # Each projection maps the embedding width (n_embd) down to head_size.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones: row t marks the positions (<= t)
        # that position t is allowed to attend to.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (batch, seq_len, n_embd); seq_len must not
               exceed the block_size given at construction.

        Returns:
            Tensor of shape (batch, seq_len, head_size).
        """
        _, seq_len, _ = x.shape
        keys = self.key(x)       # (B, T, head_size)
        queries = self.query(x)  # (B, T, head_size)
        # Attention scores, scaled by 1/sqrt(head_size): (B, T, T).
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale
        # The score matrix is square, so the top-left T x T corner of the
        # triangular buffer masks out every future position.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        # Weighted sum of the value vectors: (B, T, T) @ (B, T, hs) -> (B, T, hs).
        values = self.value(x)
        return weights @ values

In [3]:
# Smoke test: build a single attention head and a random input of shape
# (batch=16, seq_len=256, n_embd=384), then display the input shape.
torch.manual_seed(42)
head_test = Head(head_size=64, n_embd=384, dropout=0.2, block_size=256)
x = torch.randn(16, 256, 384)
x.shape

torch.Size([16, 256, 384])

In [4]:
# Run the head once; the last dimension of the result is head_size (64).
head_test(x).shape

torch.Size([16, 256, 64])

> <font color=red>此处实现的点积缩放注意力过程:</font>

In [5]:
# Stand-alone copy of the causal mask used by Head (256 x 256 lower triangle).
tril = torch.tril(torch.ones(256, 256))

In [6]:
# True marks the strictly-upper-triangular ("future") positions to be masked.
torch.tril(torch.ones(4, 4)) == 0

tensor([[False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True],
        [False, False, False, False]])

In [7]:
# Toy attention scores: batch of 2, sequence length 4.
w = torch.randn( 2, 4, 4)
w.shape

torch.Size([2, 4, 4])

In [8]:
# Display the raw (unmasked) scores.
w

tensor([[[-0.2952, -0.1414,  0.0954,  2.3579],
         [-0.7269, -0.6404,  1.7583, -1.1488],
         [ 0.8572, -1.4686, -0.0623,  0.2973],
         [-0.4093, -0.7080,  0.5000, -0.6843]],

        [[ 0.0481, -0.6847, -0.4713,  1.7480],
         [-0.0438, -0.1857,  0.2502, -0.4137],
         [ 0.2577, -0.3792, -1.3037,  1.5165],
         [-0.2302, -1.4977, -0.8842, -0.5028]]])

In [9]:
# Replace every future position with -inf so softmax assigns it zero weight.
w = w.masked_fill(tril[:4, :4] == 0, float('-inf'))
w

tensor([[[-0.2952,    -inf,    -inf,    -inf],
         [-0.7269, -0.6404,    -inf,    -inf],
         [ 0.8572, -1.4686, -0.0623,    -inf],
         [-0.4093, -0.7080,  0.5000, -0.6843]],

        [[ 0.0481,    -inf,    -inf,    -inf],
         [-0.0438, -0.1857,    -inf,    -inf],
         [ 0.2577, -0.3792, -1.3037,    -inf],
         [-0.2302, -1.4977, -0.8842, -0.5028]]])

In [10]:
# Row-wise softmax: each row sums to 1 over the visible (non-masked) positions.
F.softmax(w, dim=-1)

tensor([[[1.0000, 0.0000, 0.0000, 0.0000],
         [0.4784, 0.5216, 0.0000, 0.0000],
         [0.6683, 0.0653, 0.2665, 0.0000],
         [0.2006, 0.1488, 0.4981, 0.1524]],

        [[1.0000, 0.0000, 0.0000, 0.0000],
         [0.5354, 0.4646, 0.0000, 0.0000],
         [0.5751, 0.3042, 0.1207, 0.0000],
         [0.3902, 0.1098, 0.2029, 0.2971]]])

> <font color=green>在seq_len维度, 即句子维度, 每一行比上一行多暴露1个位置(其后的位置全部被掩蔽), 因此掩码形成一个下三角阵</font>

## 多头注意力机制

In [11]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then mixed by a linear layer.

    Args:
        n_head: number of attention heads.
        head_size: output width of each head.
        n_embd: embedding width of the input and of the output.
        dropout: dropout probability, used inside each head and on the output.
        block_size: maximum sequence length (size of each head's causal mask).
    """

    def __init__(self, n_head, head_size, n_embd, dropout, block_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, dropout, block_size) for _ in range(n_head)])
        # The concatenated head outputs have width n_head * head_size, which is
        # only equal to n_embd when n_embd is divisible by n_head. Sizing the
        # projection from the real concatenated width keeps the layer valid
        # in both cases (and is identical to the old shape when they agree).
        self.proj = nn.Linear(n_head * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map (batch, seq_len, n_embd) to (batch, seq_len, n_embd)."""
        # The heads are independent and could be computed in one batched
        # matmul; here they are evaluated one after another and concatenated
        # along the feature dimension: (B, T, n_head * head_size).
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

## 基于位置的前馈神经网络

In [12]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

    def __init__(self, n_embd, dropout):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is preserved."""
        out = self.net(x)
        return out

## GPT Block

![](./imgs/gpt_architecture.png)

In [13]:
class Block(nn.Module):
    """Pre-norm transformer block: self-attention then feed-forward, each residual."""

    def __init__(self, n_embd, n_head, dropout, block_size):
        # n_embd: embedding width; n_head: number of attention heads.
        super().__init__()
        # Split the embedding width across the heads (integer division).
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head, n_embd, dropout, block_size)
        self.ffwd = FeedFoward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

## GPT 训练用语言模型

In [53]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Args:
        n_embd: embedding width.
        n_head: attention heads per block.
        n_layer: number of transformer blocks.
        vocab_size: number of distinct token ids.
        dropout: dropout probability used throughout the blocks.
        block_size: maximum context length (size of the position table).
        device: stored on the instance as ``self.device`` for callers; the
            forward pass itself follows the device of its input.
    """

    def __init__(self, n_embd, n_head, n_layer, vocab_size, dropout, block_size, device):
        super().__init__()
        # Learned token and absolute-position embeddings.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, dropout, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.device = device
        self.block_size = block_size

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the input's device so the model keeps
        # working after it has been moved with .to(...) / .cuda() / .cpu().
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # broadcast over the batch -> (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @staticmethod
    def top_k_logits(logits, k) -> torch.Tensor:
        """Keep the k largest logits of every row and mask the rest to -inf.

        Args:
            logits: (..., vocab_size) tensor.
            k: number of entries to keep per row; 0 disables filtering and
               values above vocab_size are clamped to vocab_size.

        Returns:
            A tensor shaped like ``logits``.
        """
        if k == 0:
            return logits
        k = min(k, logits.size(-1))
        values, _ = torch.topk(logits, k)
        # Keep a trailing singleton dimension so the per-row threshold
        # broadcasts across the vocabulary axis.
        min_values = values[..., -1, None]
        return logits.masked_fill(logits < min_values, float('-inf'))

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature: float = 1.0, top_k: int = 0) -> torch.Tensor:
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs in eval mode (dropout off) without gradient tracking;
        the previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) integer tensor holding the prompt.
            max_new_tokens: number of tokens to append to every row.
            temperature: positive softmax temperature.
            top_k: if non-zero, sample only among the k most likely tokens.

        Returns:
            (B, T + max_new_tokens) integer tensor: prompt plus samples.

        Raises:
            ValueError: if ``temperature`` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions.
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                # Only the last time step predicts the next token: (B, C).
                logits = logits[:, -1, :] / temperature
                logits = self.top_k_logits(logits, k=top_k)
                probs = F.softmax(logits, dim=-1)  # (B, C)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

## 构建GPT训练用语言模型输入

[数据下载地址](http://thuctc.thunlp.org/#%E8%8E%B7%E5%8F%96%E9%93%BE%E6%8E%A5)

In [23]:
# Directory holding the THUCNews "游戏" (games) category, one .txt file per article.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
data_path = '/mnt/Wangquanjun/DATA/THUCTC/THUCNews/游戏/'

In [24]:
# Collect every .txt article in the data directory and preview the first two paths.
path_arr = glob.glob(data_path+'*.txt')
path_arr[0:2]

['/mnt/Wangquanjun/DATA/THUCTC/THUCNews/游戏/425201.txt',
 '/mnt/Wangquanjun/DATA/THUCTC/THUCNews/游戏/413773.txt']

### 统计词频 & 去掉低频词
> 基于字符的统计,因此不需要分词. <br>
> 不使用停用词, 空格都是需要保留的.<br>
> <font color=green>低频词需要去掉</font><br>

In [25]:
# Minimum character frequency. The vocabulary filter keeps characters whose
# count is strictly greater than this value, so characters occurring exactly
# MIN_FREQ times are dropped as well.
MIN_FREQ = 100

In [26]:
# Read every article, normalise exotic whitespace to a plain space, keep the
# cleaned text in `lines`, and tally per-character frequencies.
lines = []
char_counter = Counter()
for path in path_arr:
    with open(path, 'r', encoding='utf-8-sig') as f:
        text = f.read().strip()
    # Ideographic space, no-break space and narrow no-break space -> ' '.
    for odd_space in (u'\u3000', u'\xa0', u'\u202f'):
        text = text.replace(odd_space, u' ')
    lines.append(text)
    char_counter.update(text)
# Plain dict ordered by descending count (ties keep first-seen order).
ch_freq = dict(char_counter.most_common())

In [27]:
# Number of distinct characters seen in the corpus before filtering.
len(ch_freq)

5817

In [28]:
# Vocabulary: characters whose count is strictly above MIN_FREQ, in the
# (descending-frequency) order of ch_freq.
vocabs = [ch for ch, num in ch_freq.items() if num > MIN_FREQ]

In [29]:
# Vocabulary size after dropping low-frequency characters.
len(vocabs)

2941

### stoi itos

In [30]:
# Two-way lookup between characters and integer ids (id = position in vocabs).
itos = dict(enumerate(vocabs))
stoi = {ch: i for i, ch in itos.items()}

In [32]:
import pickle

# Persist the vocabulary mappings next to the model checkpoints so inference
# can rebuild encode/decode without re-reading the corpus.
# Create the target directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./models/model', exist_ok=True)

with open('./models/model/stoi.pk', 'wb') as f:
    pickle.dump(stoi, f)

with open('./models/model/itos.pk', 'wb') as f:
    pickle.dump(itos, f)

In [23]:
# Set view of the vocabulary for O(1) membership tests while filtering text.
SET_CHAR_LIST = set(vocabs)

In [24]:
# Drop out-of-vocabulary characters from every article, then send the first
# 90% of each article to the training text and the remainder to validation.
# Pieces are collected in lists and joined once at the end: repeatedly
# extending a multi-million-character string with += can degrade to
# quadratic time.
train_parts = []
val_parts = []
for line in lines:
    line = ''.join([ch for ch in line if ch in SET_CHAR_LIST])
    n = int(0.9 * len(line))
    train_parts.append(line[:n])
    val_parts.append(line[n:])
train_data = ''.join(train_parts)
val_data = ''.join(val_parts)

### DataLoader

In [25]:
# The model's output dimension equals the number of vocabulary characters.
vocab_size = len(vocabs)
vocab_size

2941

In [26]:
# Model hyperparameters (seeded for reproducibility).
torch.manual_seed(42)
block_size = 256 # maximum context length for predictions
n_embd = 384 # embedding size (dimensionality of the hidden state)
n_head = 6 # number of attention heads per transformer block
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability passed to every Dropout layer
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [27]:
# Training hyperparameters.
batch_size = 16 # how many independent sequences are processed in parallel
eval_interval = 2000 # evaluate on the validation set every this many iterations
# eval_interval = 20 # how often to evaluate the model on train and val sets
max_iters = 2000_000
# max_iters = 2000
learning_rate = 3e-4 # NOTE(review): the original note called this a paper default; unverified
eval_iters = 200 # how many batches to use for evaluation

models_path = './models/model' # directory that receives the checkpoints
average_power_usage = 550 # watts; only used for the energy estimate in the training log

In [28]:
def _to_index_tensor(text):
    """Map every character of ``text`` through ``stoi`` into a 1-D LongTensor."""
    return torch.tensor([stoi[ch] for ch in text], dtype=torch.long)


# Replace the raw text splits with their integer-id tensors.
train_data = _to_index_tensor(train_data)
val_data = _to_index_tensor(val_data)

In [29]:
# Token counts of the training and validation splits.
print(train_data.shape)
print(val_data.shape)

torch.Size([19185178])
torch.Size([2143728])


#### 方案-1: 
> <font color=red>内存占用太大</font>

In [30]:
# Toy illustration of the sliding-window scheme: every input window of
# length b_s is paired with the same window shifted right by one token.
arr = torch.arange(20)
print(arr)
b_s = 3
arr_x, arr_y = [], []
n_windows = len(arr) - b_s
for i in range(n_windows):
    x, y = arr[i:i + b_s], arr[i + 1:i + 1 + b_s]
    print(i, x, y)
    arr_x.append(x)
    arr_y.append(y)

stacked_x = torch.stack(arr_x, dim=0)
stacked_y = torch.stack(arr_y, dim=0)
print(stacked_x[0])
print(stacked_y[0])
print(stacked_x.shape)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19])
0 tensor([0, 1, 2]) tensor([1, 2, 3])
1 tensor([1, 2, 3]) tensor([2, 3, 4])
2 tensor([2, 3, 4]) tensor([3, 4, 5])
3 tensor([3, 4, 5]) tensor([4, 5, 6])
4 tensor([4, 5, 6]) tensor([5, 6, 7])
5 tensor([5, 6, 7]) tensor([6, 7, 8])
6 tensor([6, 7, 8]) tensor([7, 8, 9])
7 tensor([7, 8, 9]) tensor([ 8,  9, 10])
8 tensor([ 8,  9, 10]) tensor([ 9, 10, 11])
9 tensor([ 9, 10, 11]) tensor([10, 11, 12])
10 tensor([10, 11, 12]) tensor([11, 12, 13])
11 tensor([11, 12, 13]) tensor([12, 13, 14])
12 tensor([12, 13, 14]) tensor([13, 14, 15])
13 tensor([13, 14, 15]) tensor([14, 15, 16])
14 tensor([14, 15, 16]) tensor([15, 16, 17])
15 tensor([15, 16, 17]) tensor([16, 17, 18])
16 tensor([16, 17, 18]) tensor([17, 18, 19])
tensor([0, 1, 2])
tensor([1, 2, 3])
torch.Size([17, 3])


In [31]:
class GptDataset(Dataset):
    """Sliding-window next-token dataset over a 1-D tensor of token ids.

    Sample ``i`` is the pair ``(data[i:i+block_size],
    data[i+1:i+1+block_size])``, giving ``len(data) - block_size`` samples.

    The windows are exposed through ``Tensor.unfold`` views that share
    storage with ``data``. Materialising every window with ``torch.stack``
    would need roughly ``block_size`` times the memory of the corpus, which
    is what made the eager version unusable on the full data set.

    Args:
        data: 1-D tensor of token ids with more than ``block_size`` elements.
        block_size: length of every input/target window.
    """

    def __init__(self, data, block_size):
        # Inputs start at positions 0 .. len-block_size-1; targets are the
        # same windows shifted by one token. Both have shape
        # (len(data) - block_size, block_size) and copy nothing.
        self.x = data[:-1].unfold(0, block_size, 1)
        self.y = data[1:].unfold(0, block_size, 1)

    def __getitem__(self, index):
        """Return the ``(input, target)`` windows for sample ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of windows."""
        return self.x.shape[0]

In [None]:
# Approach 1: wrap both splits in sliding-window datasets.
train_dataset = GptDataset(train_data, block_size=block_size)
val_dataset = GptDataset(val_data, block_size=block_size)

In [None]:
# Shuffled mini-batches over the training windows.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Sequential (unshuffled) mini-batches over the held-out windows.
# The previous argument, `LBC_test`, is not defined anywhere in this notebook
# and raised NameError; the held-out split built above is `val_dataset`.
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#### 方案-2: 随机抽样

In [30]:
def get_train_batch(batch_size=16, block_size=256, device=None):
    """Sample a random training batch from the module-level ``train_data``.

    Returns:
        ``(x, y)``: two (batch_size, block_size) tensors moved to ``device``;
        ``y`` is ``x`` shifted one token to the right in the corpus.
    """
    # Random window start offsets; the upper bound leaves room for the
    # one-token shift of the targets.
    starts = torch.randint(len(train_data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(train_data[start:start + block_size])
        targets.append(train_data[start + 1:start + block_size + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [31]:
def get_val_batch(batch_size=16, block_size=256, device=None):
    """Sample a random validation batch from the module-level ``val_data``.

    Returns:
        ``(x, y)``: two (batch_size, block_size) tensors moved to ``device``;
        ``y`` is ``x`` shifted one token to the right in the corpus.
    """
    # Random window start offsets; the upper bound leaves room for the
    # one-token shift of the targets.
    starts = torch.randint(len(val_data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(val_data[start:start + block_size])
        targets.append(val_data[start + 1:start + block_size + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [32]:
# Sanity check: draw one batch with the default sizes and show both shapes.
x, y = get_train_batch(device=device)
print(x.shape)
print(y.shape)

torch.Size([16, 256])
torch.Size([16, 256])


## 训练

In [15]:
# Model hyperparameters for training (repeated so this section can run on its own).
torch.manual_seed(42)
block_size = 256 # maximum context length for predictions
n_embd = 384 # embedding size (dimensionality of the hidden state)
n_head = 6 # number of attention heads per transformer block
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability passed to every Dropout layer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): hard-coded; must equal len(vocabs) from the vocabulary cell.
vocab_size = 2941

In [16]:
# Build the model to be trained from the hyperparameters above.
model = GPTLanguageModel(n_embd=n_embd,
                         n_head=n_head,
                         n_layer=n_layer,
                         vocab_size=vocab_size,
                         dropout=dropout,
                         block_size=block_size,
                         device=device)

In [34]:
# Move parameters and buffers to the selected device.
model.to(device)

GPTLanguageModel(
  (token_embedding_table): Embedding(2941, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (2): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
    

Partially added support for training at fp16 precision for decreased memory usage ([source](https://gist.github.com/ajbrock/075c0ca4036dc4d8581990a6e76e07a3))

In [35]:
# AdamW over all model parameters. The commented-out line is the fp16
# optimizer variant mentioned in the note above this cell.
# optimizer = Adam16(model.parameters(), lr=learning_rate) # for fp16
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [36]:
def save_last_checkpoint(suffix: str = "last"):
    """Write the global model's ``state_dict`` to ``models_path``.

    Only the model weights are saved; optimizer state and the iteration
    counter are not. The file is named ``model-<suffix>.pt``.

    Args:
        suffix: tag embedded in the file name (e.g. ``"last"``, ``"best-val"``).
    """
    # torch.save does not create missing parent directories.
    os.makedirs(models_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(models_path, f"model-{suffix}.pt"))

In [37]:
# Training bookkeeping: wall-clock start, best validation loss seen so far
# (None until the first evaluation), and the iteration range.
t0 = time.time()
best_score = None
iter_range = range(max_iters)

In [38]:
# Pre-initialise variables that the training loop assigns.
# NOTE(review): `iter` shadows the Python builtin of the same name.
iter = 0
t2 = 0

> valid loss

In [39]:
def estimate_loss(model, eval_iters, batch_size, block_size, device=None):
    """Estimate the validation loss as a mean over random validation batches.

    The model is switched to eval mode for the measurement and returned to
    whatever mode it was in before the call, even if evaluation raises.

    Args:
        model: callable returning ``(logits, loss)`` for ``model(X, Y)``.
        eval_iters: number of random validation batches to average over.
        batch_size: sequences per batch.
        block_size: tokens per sequence.
        device: device the batches are moved to.

    Returns:
        0-dim tensor holding the mean loss.
    """
    was_training = model.training
    model.eval()
    losses = torch.zeros(eval_iters)
    try:
        with torch.no_grad():
            for k in range(eval_iters):
                X, Y = get_val_batch(batch_size=batch_size, block_size=block_size, device=device)
                _, loss = model(X, Y)
                losses[k] = loss.item()
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)
    return losses.mean()

In [40]:
# Display the planned iteration range.
iter_range

range(0, 2000000)

In [None]:
# Main training loop: periodically evaluate and checkpoint, then take one
# optimisation step on a random training batch.
for iter in iter_range:

    if iter % eval_interval == 0 or iter == max_iters - 1:
        t1 = time.time()
        score = estimate_loss(model,
                              eval_iters=eval_iters,
                              batch_size=batch_size,
                              block_size=block_size,
                              device=device)
        t2 = time.time()

        # Track the running minimum of the validation loss. Without updating
        # best_score here, every later score would be compared against the
        # very first evaluation, and "best-val" would be overwritten by any
        # checkpoint that merely beats the untrained model.
        if best_score is None or score < best_score:
            best_score = score
            save_last_checkpoint('best-val')

        save_last_checkpoint()

        t3 = time.time()
        if iter > 0:
            # Linear extrapolation from the average time per iteration so far.
            remaining_time = ((time.time()-t0)/60/60) / iter * (max_iters-iter) # h
        else:
            remaining_time = 0.0

        power_used = (time.time()-t0)/60/60*average_power_usage/1000 # kWh
        training_time = str(datetime.timedelta(seconds=int(time.time()-t0)))

        print(f" evaluation took {t2-t1:.2f} seconds. model saved in {t3-t2:.2f} seconds. Total time wasted training: {training_time}, approx. {power_used:.3f} kWh used, remaining time: {remaining_time:.2f} hours.")

    model.train()
    x, y = get_train_batch(batch_size, block_size, device)
    logits, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

 evaluation took 4.52 seconds. model saved in 0.05 seconds. Total time wasted training: 0:00:09, approx. 0.001 kWh used, remaining time: 0.00 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:02:50, approx. 0.026 kWh used, remaining time: 47.21 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:05:31, approx. 0.051 kWh used, remaining time: 45.92 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:08:12, approx. 0.075 kWh used, remaining time: 45.47 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:10:53, approx. 0.100 kWh used, remaining time: 45.23 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:13:35, approx. 0.125 kWh used, remaining time: 45.07 hours.
 evaluation took 4.51 seconds. model saved in 0.10 seconds. Total time wasted training: 0:16:16, appr

In [43]:
# Scratch cell: evaluates a constant expression only.
1+1

2

## 预测

In [54]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the character <-> id mappings written by the vocabulary step.
stoi = _read_pickle('./models/model/stoi.pk')
itos = _read_pickle('./models/model/itos.pk')

In [55]:
# Model hyperparameters for inference; they must match the values used when
# the checkpoint was trained so that the state_dict shapes line up.
torch.manual_seed(42)
block_size = 256 # maximum context length for predictions
n_embd = 384 # embedding size (dimensionality of the hidden state)
n_head = 6 # number of attention heads per transformer block
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability passed to every Dropout layer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = 2941
batch_size = 4 # number of continuations generated per prompt (the prompt is repeated this many times)

In [56]:
# Rebuild the architecture; the weights are loaded from disk in the next step.
model = GPTLanguageModel(n_embd=n_embd,
                         n_head=n_head,
                         n_layer=n_layer,
                         vocab_size=vocab_size,
                         dropout=dropout,
                         block_size=block_size,
                         device=device)

In [57]:
# 加载模型
model_path = './models/model/model-best-val.pt'
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [58]:
# Move the loaded model to the inference device.
model.to(device=device)

GPTLanguageModel(
  (token_embedding_table): Embedding(2941, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (1): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
          (2): Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
    

In [62]:
# Number of new tokens (characters) to generate per prompt.
nsamples = 128

In [63]:
def encode(s):
    """Encoder: turn a string into a list of integer ids (via ``stoi``)."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn a list of integer ids back into a string (via ``itos``)."""
    return ''.join([itos[i] for i in l])

In [64]:
# Prompt text and its integer-id encoding.
sentence = '魔兽世界'
context = encode(sentence)

In [65]:
# Display the encoded prompt ids.
context

[249, 453, 131, 145]

In [68]:
# Turn the id list into a (batch_size, prompt_len) tensor on the target
# device by repeating the same prompt once per requested continuation.
context = torch.tensor(context, device=device).unsqueeze(0).repeat(batch_size, 1)

In [69]:
# Display the batched prompt tensor.
context

tensor([[249, 453, 131, 145],
        [249, 453, 131, 145],
        [249, 453, 131, 145],
        [249, 453, 131, 145]], device='cuda:0')

In [70]:
from torchinfo import summary

In [71]:
# Print a per-layer summary of the model, using the prompt batch as example input.
summary(model, input_data=context)

Layer (type:depth-idx)                        Output Shape              Param #
GPTLanguageModel                              [4, 4, 2941]              --
├─Embedding: 1-1                              [4, 4, 384]               1,129,344
├─Embedding: 1-2                              [4, 384]                  98,304
├─Sequential: 1-3                             [4, 4, 384]               --
│    └─Block: 2-1                             [4, 4, 384]               --
│    │    └─LayerNorm: 3-1                    [4, 4, 384]               768
│    │    └─MultiHeadAttention: 3-2           [4, 4, 384]               590,208
│    │    └─LayerNorm: 3-3                    [4, 4, 384]               768
│    │    └─FeedFoward: 3-4                   [4, 4, 384]               1,181,568
│    └─Block: 2-2                             [4, 4, 384]               --
│    │    └─LayerNorm: 3-5                    [4, 4, 384]               768
│    │    └─MultiHeadAttention: 3-6           [4, 4, 384]            

In [72]:
# Sample nsamples new tokens for every row of the prompt batch; the logits
# are divided by the temperature (0.7) before sampling.
generated = model.generate(context, max_new_tokens=nsamples, temperature=0.7)

In [73]:
# Decode every generated row back to text and print it, followed by a
# separator line.
separator = '====='*10
for result in generated:
    print(decode(result.tolist()))
    print(separator)

魔兽世界》(Works II)。
  DOTA 是第一款RPG游戏，而最大的乐趣就是以丰富的游戏体验来投入这一个世界。玩家的真实情感令人难以置信，对于能够让玩家喜爱的游戏来一次包容了，也能够让玩家感受到真正的足球游戏的乐趣。我们仅仅可以将这一切列给玩家带来的乐趣和
魔兽世界：翼妖王之怒》中的消息，但丁的剧情盈集于一个全新的角色，这也是一个全新的世界。
  “我们是一个游戏的玩家，在我们的脑海里，我们总是会把游戏中的一些人物做出来让大家都说的好，但丁的作品会和身世都有一些，比如《魔兽世界》和《星际》都是一些怪异的游戏。我们在《
魔兽世界》，借助于中国大陆获得冠军，并且在不断扩大海外为中国游戏玩家提供丰富的网游体验。同时，盛大游戏都进行的帮化、业界和玩家问流、产品的理念与传统文化的融合，得到了赞扬。中国网络游戏行业发展的管发力和持续的支持下，也是中国网游公司的一次大胆尝学和魅力不亚于玩家，
魔兽世界》延迟、《星际争霸2》延于PC平台，所以在欧美已经推出了大量的版本，曾经在美国、中国等地上推出了《魔兽世界》，当然这里也不会就是一次全面的实现了。
  对于一款暴雪人兽世界，暴雪不准备和玩家分享暴雪的建议，但是国内的游戏公司很强吸引他们的玩家和赞助商！
 


In [74]:
# Same pipeline for a second prompt: encode, repeat into a batch, sample,
# then decode and print every continuation.
sentence = '刀剑神域'
context = encode(sentence)
context = torch.tensor(context, device=device).unsqueeze(0).repeat(batch_size, 1)
generated = model.generate(context, max_new_tokens=nsamples, temperature=0.7)
for result in generated:
    response = decode(result.tolist())
    print(response)
    print('====='*10)

刀剑神域》中的武器，还有可能是一把特种剑。
  当然，游戏的武器和兵种也是都变成了的，玩家可以选择要使用的武器，这个武器只能是以前的武器，在游戏中花钱，就是用铁片武器打造武器。
  同时，游戏的武器有两种模型，其中一种是开枪、弹枪，武器是随机弹道，但是这种模型的武
刀剑神域》就是在韩国发行，而韩国公司只是一个非常的原始化。
  此前，韩国网游业界也曾以“新”的方式收费了韩国网游业，但是韩国国内游戏业有着不错的表现，也许是一个成熟的产业。但是，韩国国内游戏业界的巨头们，大家会说的是中国玩家游戏是不是很喜欢他们呢？
  Base
刀剑神域OL
  《剑网3》运营团队为广大剑卡迷准备了多重精彩活动，让大家感受武卡世界的精彩魅力。
  【活动时间】：
  10月15日起开启
  【活动内容】：
  1、活动期间，玩家只需要和其他玩家一同游秘，便可以参加，活动期间，玩家可以通过打怪获得更多的经验
刀剑神域》中的一些修过程，让玩家感受3D画面的真实感。
  2008年度ChinaJoy原创大赛“玩家最期待的竞大网络游戏”。这项“最受欢迎的竞大网络游戏”之称。
  《精灵传说》是以“精灵传说”为背景，采用“精灵传说”设计竞分帮秀，技能非常多的，每一个精灵都有着


## 结论：
> 1. 语句还算通顺， 毕竟参数有限.<br>