In [109]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from einops import rearrange
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 贪心搜索 greedy search

$$
\hat{y_t} = \arg\max_y P(y | \hat{Y}_{<t}, X)
$$

In [8]:
def greedy_decoding(model, tokenizer, input_ids, max_tokens=300):
    # 使用torch.inference_mode()上下文管理器来禁用梯度计算，这在推理时可以提高性能并减少内存使用。
    with torch.inference_mode():
        for _ in range(max_tokens):
            # 将当前的input_ids输入到模型中，获取模型的输出:
            # logits：模型的原始输出，是一个三维张量，形状为(batch_size, sequence_length, vocab_size)，表示每个token在词汇表中的预测分数。
            # last_hidden_state：这是模型的最后一层隐藏状态，通常用于进一步的任务，如命名实体识别或问答系统。
            # lhidden_states：这是一个元组，包含了模型每一层的隐藏状态。这可以用于分析模型内部的表示或进行更复杂的下游任务。
            # lattentions：对于带有注意力机制的模型（如Transformer），这个属性包含了每一层的注意力权重。这可以用于可视化和分析模型的注意力分布。
            # lcross_attentions：在序列到序列的模型中，这个属性包含了编码器和解码器之间的交叉注意力权重。
            # lloss：如果在训练过程中提供了标签，模型输出可能包含一个损失值，表示模型预测与真实标签之间的差异。
            # lpast_key_values：在某些模型（如GPT-2）中，这个属性包含了过去的键值对，用于加速生成任务中的解码过程。
            outputs = model(input_ids)
            # 从模型输出中提取最后一个token的logits（未归一化的预测概率）。[:, -1, :]表示取最后一个时间步的所有token的logits
            # 第一个维度: 批次（batch）维度: 表示选择这个维度上的所有元素，即选择整个批次。这意味着对批次中的每个样本都执行相同的操作。
            # 第二个维度: 序列（sequence）维度: -1 表示选择这个维度上的最后一个元素。
            # 第三个维度: 词汇表（vocabulary）维度: 表示选择这个维度上的所有元素，即选择所有可能的token的logits。
            next_token_logits = outputs.logits[:, -1, :]
            # 使用argmax函数在logits上找到概率最高的token ID，即预测的下一个token。
            next_token = torch.argmax(next_token_logits, dim=-1)
            # 检查预测的下一个token是否是结束符（eos_token_id）。如果是，则停止生成
            if next_token == tokenizer.eos_token_id:
                break
            # 将预测的下一个token添加到input_ids的末尾
            input_ids = torch.cat([input_ids, rearrange(next_token, 'c -> 1 c')], dim=-1)
            # rearrange(next_token, 'c -> 1 c')用于将next_token的形状从(batch_size,)变为(batch_size, 1)，以便与input_ids拼接。
            # 拆分：可以将一个维度拆分成多个维度。例如，rearrange(data, '(a b) -> a b', a=2, b=5) 将一个维度拆分成两个维度
            # 合并：可以将多个维度合并成一个维度。例如，rearrange(data, 'a b -> (a b)') 将两个维度合并成一个维度
        # 使用分词器将最终的 input_ids 序列解码成文本。
        generated_text = tokenizer.decode(input_ids[0])
    return generated_text

In [9]:
# Load a pretrained model and tokenizer (e.g., GPT-2)
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

# Provide an initial prompt
prompt = "Once upon a time in a distant land,"

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate text using greedy decoding
generated_text = greedy_decoding(model, tokenizer, input_ids)

# Print the result
print("Generated Text:")
print(generated_text)

Generated Text:
Once upon a time in a distant land, the sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun was shining, and the moon was shining. The sun 

# 集束搜索 beam search

beam search 在贪心搜索上进一步扩大了搜索范围，贪心搜索每下一步只考虑当前最优的 top-1 结果，beam search 考虑最优的 top-k 个结果。

<img src="./images/beam.png" alt="示例图片" width="1100">

In [69]:
def beam_search(LM_prob, beam_size=3):
    batch, seqlen, vocab_size = LM_prob.shape
    # 对 LM_prob 取对数
    log_LM_prob = LM_prob.log() # torch.Tensor().log()
    # 先选择第 0 个位置的最大 beam_size 个token, log_emb_prob 与 indices 的 shape 为 (batch, beam)
    log_beam_prob, indices = log_LM_prob[:, 0, :].topk(beam_size, sorted = True)
    # torch.Tensor([1, 2, 3, 4, 5]).topk(size, sorted) 
    # -> return torch.return_types.topk(values=tensor([5., 4., 3.]),indices=tensor([4, 3, 2]))
    # unsqueeze(): 在多维数组（张量）中添加单维度
    indices = indices.unsqueeze(-1) # tensor([4, 3, 2]) -> tensor([[4], [3], [2]])
    # indices: [batch_size, beam_size, 1], 记录 k-top 的索引
    # 对每个长度进行beam_search
    for i in range(1, seqlen):
        # log_beam_prob: [batch, beam] -> [batch, beam, 1]
        # [:, i, :] 选择了第 i 个时间步的所有元素，结果是一个形状为 [batch_size, vocab_size] 的二维张量
        # log_LM_prob[:, i, :].unsqueeze(1): [batch, 1, vocab_size]
        # repeat(1, beam_size, 1): 沿着第二维度重复 beam_size 次 -> [batch, beam, vocab_size]
        # [batch, beam, 1] + [batch, beam, vocab_size]: 前面的在第三个维度上会广播
        # 因为取了对数，所以概率是相加
        log_beam_prob = log_beam_prob.unsqueeze(-1) + log_LM_prob[:, i, :].unsqueeze(1).repeat(1, beam_size, 1)
        # -> log_beam_prob: [batch_size, beam_size, vocab_size]: 每个beam的可能产生的概率
        
        # 选出当前步（所有分叉: view(batch, -1)）概率最高的 k 个 token
        log_beam_prob, index = log_beam_prob.view(batch, -1).topk(beam_size, sorted=True)
        # log_beam_prob, index: [batch, beam], 喂给下一步
        
        # 下面的计算: beam_id 选出新 beam 来源于之前的那个 beam; index 代表真实的 token_id
        beam_id = index // vocab_size # 从第几个 beam 来的, (见图)一共有 beam_size * vocab_size 个概率，整除即可
        index = index % vocab_size
        # beam_id, index: [batch, beam]
        mid = torch.Tensor([])
        # 对 batch 内每个样本循环，选出 beam 的同时拼接上新生成的 token_id
        for j, bid, idx in zip(range(batch), beam_id, index):
            # indices: [batch_size, beam_size, 1], 记录 k-top 的索引
            # x 记录索引 chain
            # indices[j][bid]: [1]
            # idx: [] --unsqueeze(-1)--> [1]
            # 拼接成链
            x = torch.cat([indices[j][bid], idx.unsqueeze(-1)], -1) # [1] -> [链长(步长)]
            # mid: [链的数量, 链长(步长)]
            # x: -> [1, 链长(步长)]
            # 在第 1 个维度拼接
            mid = torch.cat([mid, x.unsqueeze(0)], 0)
        indices = mid # [batch_size, beam_size, 链长(步长)]
        
    return indices, log_beam_prob
            

In [71]:
if __name__ == '__main__':
    # 建立一个语言模型 LM_prob(batch, seqlen, vocab_size)
    LM_prob = F.softmax(torch.randn([32, 20, 1000]), dim=-1)
    # 最终返回每个候选，以及每个候选的 log_prob, shape 为 [batch, beam_size, seqlen]
    indices, log_prob = beam_search(LM_prob, beam_size=3)
    print(indices)
    print(log_prob)

tensor([[[ 77., 315.,  56.,  ..., 589., 811., 440.],
         [ 77., 315.,  56.,  ..., 589., 811., 440.],
         [ 77., 315.,  56.,  ..., 589., 811., 440.]],

        [[605., 414.,  77.,  ..., 366., 440., 105.],
         [605., 414.,  77.,  ..., 366., 440., 105.],
         [605., 414.,  77.,  ..., 366., 440., 105.]],

        [[924., 764., 667.,  ..., 368., 694.,  79.],
         [924., 764., 667.,  ..., 368., 694.,  79.],
         [924., 764., 667.,  ..., 368., 694., 877.]],

        ...,

        [[506., 389.,  27.,  ..., 204., 820.,  55.],
         [506., 389.,  27.,  ..., 204., 820.,  55.],
         [506., 389.,  27.,  ..., 204., 820.,  55.]],

        [[355., 221.,  55.,  ..., 222., 162., 215.],
         [355., 221., 364.,  ..., 222., 162., 215.],
         [355., 221.,  55.,  ..., 222., 162., 215.]],

        [[471., 790., 712.,  ..., 891., 474., 873.],
         [471., 790., 712.,  ..., 891., 474., 873.],
         [471., 790., 712.,  ..., 891., 474., 873.]]])
tensor([[-83.5129, -

In [72]:
# 1. 加载 GPT-2 模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 2. 准备输入数据
input_text = "The quick brown fox"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# 3. 获取模型的输出概率
with torch.no_grad():
    outputs = model(input_ids)
    LM_prob = torch.nn.functional.softmax(outputs.logits, dim=-1)

In [73]:
# 5. 调用 beam_search 函数
beam_size = 3
indices, log_beam_prob = beam_search(LM_prob, beam_size)

# 6. 解码输出
for i in range(beam_size):
    decoded_text = tokenizer.decode(indices[0, i].long(), skip_special_tokens=True)
    print(f"Beam {i+1}: {decoded_text}")

Beam 1: 
-iees
Beam 2: 
 answeriees
Beam 3: 
 andiees


# 温度 Temperature

<img src="./images/temp.png" alt="示例图片" width="1100">

In [97]:
def temperature_sampling(logits, temperature=1.0):
    logits = logits / temperature
    probabilities = F.softmax(logits, dim=-1)
    # torch.multinomial 函数从给定的概率分布中抽取样本。它确保每个样本的抽取是独立的，并且每个类别的概率由输入张量指定。
    sampled_token = torch.multinomial(probabilities, 1)
    return sampled_token, torch.max(probabilities)

In [74]:
# 1. 加载 GPT-2 模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 2. 准备输入数据
input_text = "The quick brown fox"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# 3. 获取模型的 logits 输出
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

In [100]:
# 5. 调用 temperature_sampling 函数
sampled_token, prob = temperature_sampling(logits[:, -1, :], temperature=1)

# 6. 解码输出
sampled_token_id = sampled_token.item()
decoded_text = tokenizer.decode([sampled_token_id], skip_special_tokens=True)
print(f"Sampled token: {decoded_text}\t\t Prob: {prob}")

Sampled token:  killed		 Prob: 0.12967140972614288


In [104]:
print("High temperature:")
for i in range(10):
    # 5. 调用 temperature_sampling 函数
    sampled_token, prob = temperature_sampling(logits[:, -1, :], temperature=1)

    # 6. 解码输出
    sampled_token_id = sampled_token.item()
    decoded_text = tokenizer.decode([sampled_token_id], skip_special_tokens=True)
    print(f"Sampled token: {decoded_text}\t\t Prob: {prob}")

print("\n\nLow temperature:")
for i in range(10):
    # 5. 调用 temperature_sampling 函数
    sampled_token, prob = temperature_sampling(logits[:, -1, :], temperature=0.1)

    # 6. 解码输出
    sampled_token_id = sampled_token.item()
    decoded_text = tokenizer.decode([sampled_token_id], skip_special_tokens=True)
    print(f"Sampled token: {decoded_text}\t\t Prob: {prob}")

print("\n\nNormal temperature:")
for i in range(10):
    # 5. 调用 temperature_sampling 函数
    sampled_token, prob = temperature_sampling(logits[:, -1, :], temperature=0.5)

    # 6. 解码输出
    sampled_token_id = sampled_token.item()
    decoded_text = tokenizer.decode([sampled_token_id], skip_special_tokens=True)
    print(f"Sampled token: {decoded_text}\t\t Prob: {prob}")

High temperature:
Sampled token: 's		 Prob: 0.12967140972614288
Sampled token: es		 Prob: 0.12967140972614288
Sampled token:  that		 Prob: 0.12967140972614288
Sampled token:  didn		 Prob: 0.12967140972614288
Sampled token:  or		 Prob: 0.12967140972614288
Sampled token:  gut		 Prob: 0.12967140972614288
Sampled token:  jumped		 Prob: 0.12967140972614288
Sampled token:  is		 Prob: 0.12967140972614288
Sampled token: 's		 Prob: 0.12967140972614288
Sampled token:  jumped		 Prob: 0.12967140972614288


Low temperature:
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049
Sampled token: es		 Prob: 0.9999668598175049


Normal temperature:
Sampled tok

# Top-K

在每个时间步选择条件概率排名前 K 的词语，然后在这 K 个词语中进行随机采样

In [105]:
def top_K_sampling(model, tokenizer, input_ids, max_tokens=100, top_k=50, temperature=1):
    for _ in range(max_tokens):
        with torch.inference_mode():
            outputs = model(input_ids)
            next_token_logits = outputs.logits[:, -1, :]
            top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
            top_k_probs = F.softmax(top_k_logits / temperature, dim=-1)
            next_token_index = torch.multinomial(top_k_probs, num_samples=1)
            # torch.gather() 函数从输入张量中按照指定的索引提取元素，并将这些元素放置在输出张量的相应位置
            next_token = top_k_indices.gather(-1, next_token_index)
            if next_token == tokenizer.eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token], dim=-1)
    generated_text = tokenizer.decode(input_ids[0])
    
    return generated_text

In [115]:
# 1. 加载 GPT-2 模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 2. 准备输入数据
input_text = "The quick brown fox"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# 4. 调用 top_K_sampling 函数
generated_text = top_K_sampling(model, tokenizer, input_ids, max_tokens=100, top_k=1, temperature=1)

# 5. 打印生成的文本
print("Top-1 Generated Text:")
print(generated_text)

# 4. 调用 top_K_sampling 函数
generated_text = top_K_sampling(model, tokenizer, input_ids, max_tokens=100, top_k=10, temperature=1)

# 5. 打印生成的文本
print("\n\n Top-10 Generated Text:")
print(generated_text)

Top-1 Generated Text:
The quick brown foxes are a great way to get a little bit of a kick out of your dog.

The quick brown foxes are a great way to get a little bit of a kick out of your dog. The quick brown foxes are a great way to get a little bit of a kick out of your dog. The quick brown foxes are a great way to get a little bit of a kick out of your dog. The quick brown foxes are a great way to get a little bit


 Top-10 Generated Text:
The quick brown foxes are so much faster than the brown foxes, it is amazing that they can survive under the cover of the forest."

"You mean, that's what you said?"

"The only thing you could see was the trees and the grass," said the foxes. "That was just the way I saw the forest. I was just going to go see it and it didn't matter where."

"Then I'll give them a ride back to the castle."



# Top-P (Nucleus)

在每个时间步根据模型输出的概率分布选择概率累计超过给定阈值 p 的词语集和，然后在这个词语集合中进行随机采样。

In [116]:
def top_p_sampling(model, tokenizer, input_ids, max_tokens=100, top_p=0.95, temperature=1):
    for _ in range(max_tokens):
        with torch.inference_mode():
            outputs = model(input_ids)
            next_token_logits = outputs.logits[:, -1, :] # [batch. vocab]
            # torch.sort(): 用于对输入张量进行排序。返回排序后的张量以及排序后元素的原始索引。
            sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
            sorted_probabilities = F.softmax(sorted_logits, dim=-1)
            # torch.cumsum 函数沿着指定的维度计算输入张量的累积和
            # e.g. Input Tensor: tensor([1, 2, 3, 4]) -> Cumulative Sum Tensor: tensor([ 1,  3,  6, 10])
            cumulative_probs = torch.cumsum(sorted_probabilities, dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # [F, F, F, .., F, T, T, ..., T]
            sorted_indices_to_remove[..., 0] = False # 保证至少采样一个
            # ... 是一个 Ellipsis 对象，表示选择所有前面的维度。这在多维数组中用于选择所有前面的维度而不显式写出每个维度。
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            # scatter_ 是一个就地操作，用于根据索引将值更新到张量中。
            # 第一个参数 dim=-1 指定了操作的维度，-1 表示最后一个维度（通常是词汇表的维度）。
            # 第二个参数 indices_to_remove[None, :] 是一个索引张量，指定了要更新的索引位置。
            #           None 用于增加一个新的维度，以匹配 next_token_logits 的维度。
            # 第三个参数 float('-inf') 是要更新的值，这里使用负无穷大来表示这些位置的 logits 应该被设置为一个非常小的值，
            #           这样在后续的 softmax 操作中，这些位置的概率接近于零。
            next_token_logits.scatter_(-1, indices_to_remove[None, :], float('-inf'))
            prob = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(prob, num_samples=1)
            # torch.gather() 函数从输入张量中按照指定的索引提取元素，并将这些元素放置在输出张量的相应位置
            if next_token == tokenizer.eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token], dim=-1)
    generated_text = tokenizer.decode(input_ids[0])
    
    return generated_text

In [119]:
# 1. 加载 GPT-2 模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 2. 准备输入数据
input_text = "The quick brown fox"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# 4. 调用 top_K_sampling 函数
generated_text = top_p_sampling(model, tokenizer, input_ids, max_tokens=100, top_p=1, temperature=1)

# 5. 打印生成的文本
print("1 Generated Text/sample from all vocab:")
print(generated_text)

# 4. 调用 top_K_sampling 函数
generated_text = top_p_sampling(model, tokenizer, input_ids, max_tokens=100, top_p=0.95, temperature=1)

# 5. 打印生成的文本
print("\n\n0.95 Generated Text:")
print(generated_text)

# 4. 调用 top_K_sampling 函数
generated_text = top_p_sampling(model, tokenizer, input_ids, max_tokens=100, top_p=0, temperature=1)

# 5. 打印生成的文本
print("\n\n0 Generated Text/greedy:")
print(generated_text)

1 Generated Text/sample from all vocab:
The quick brown foxes in hipster regalia bought carpet and interior paintings down the street to show off a saxophone necktie, white cap, velvet lapels and a female fleece skirt. Gonewood Street got high-PPP (resistive effective resin spray), a 25-gallon load of generic powder, Lauocha Powder, Vinegar Dioxide, weed salts and paprika with propane. The Rx Ecuador kit wasn't as powerful, but the plasma installer matched levels being used in local


0.95 Generated Text:
The quick brown fox happily stirred his ear in case his other two brother's feathers suddenly disappeared... By the time they heard his voice knocking, Ruby felt her heartbeat slowing as he struggled to try and get up. Her mind was going crazy. Running away would mean the worst, and he'd be no good living for him.

Weiss decided she was going to have to get up and find shelter first. Ruby tried to find something, but she couldn't find a place. Her mind began to trick itself into think