In [1]:
import torch

def main():
    """Print a greeting, then report whether the PyTorch MPS backend is available."""
    print("Hello from torch-test!")
    # Guard clause: handle the "not available" case first and leave early.
    if not torch.backends.mps.is_available():
        print("MPS backend is not available: Something went wrong! Are you running this on a Mac with Apple Silicon chip?")
        return
    print("Excellent! MPS backend is available.")


if __name__ == "__main__":
    main()

Hello from torch-test!
Excellent! MPS backend is available.


In [2]:
import transformers
# Show the installed transformers version in the cell output.
print(transformers.__version__)
from transformers import AutoModelForCausalLM, AutoTokenizer


4.56.1


In [13]:
import os

# Checkpoint location shared by the tokenizer and model cells.
# The default is the original author's local directory; set the LLM_MODEL_PATH
# environment variable to point at another local directory or a hub id
# (e.g. "Qwen/Qwen3-0.6B") without editing the notebook.
model_name = os.environ.get("LLM_MODEL_PATH", "/Users/jingweixu/Downloads/llama3_2_1b")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# print(tokenizer)

# NOTE(review): `input` shadows the Python builtin for the rest of the kernel session.
# The name is kept because cells outside this view may read it - rename once confirmed unused.
input = "南京大学"

# Calling the tokenizer directly returns a mapping with input_ids and attention_mask
# (see the printed result).
embed_index = tokenizer(input)
print(embed_index)

{'input_ids': [128000, 59563, 47653, 102667], 'attention_mask': [1, 1, 1, 1]}


In [60]:
# Tokenizer walkthrough: encode, decode, tokenize

print("=== Tokenizer方法详解 ===")

# Reuses the `tokenizer` object created in an earlier cell.

# Sample sentence shared by this cell and the following tokenizer cells
text = "南京大学是一所优秀的大学"
print(f"原始文本: {text}")

# Section banner plus a one-line description, emitted with a single call
print(
    "\n--- 1. tokenize() 方法 ---",
    "tokenize() 只做分词，返回字符串token列表，不添加特殊符号",
    sep="\n",
)
tokens = tokenizer.tokenize(text)
print(
    f"tokenize结果: {tokens}",
    f"token数量: {len(tokens)}",
    sep="\n",
)


=== Tokenizer方法详解 ===
原始文本: 南京大学是一所优秀的大学

--- 1. tokenize() 方法 ---
tokenize() 只做分词，返回字符串token列表，不添加特殊符号
tokenize结果: ['åįĹ', 'äº¬', 'å¤§åŃ¦', 'æĺ¯ä¸Ģ', 'æīĢ', 'ä¼ĺç§Ģ', 'çļĦ', 'å¤§åŃ¦']
token数量: 8


In [64]:

print(
    "\n--- 2. encode() 方法 ---",
    "encode() 将文本转换为token ID，可选择是否添加特殊符号",
    sep="\n",
)

# Encode the same sentence twice: once as bare ids, once with special tokens added
ids_no_special = tokenizer.encode(text, add_special_tokens=False)
ids_with_special = tokenizer.encode(text, add_special_tokens=True)

print(f"encode(无特殊符号): {ids_no_special}")
print(f"encode(有特殊符号): {ids_with_special}")



--- 2. encode() 方法 ---
encode() 将文本转换为token ID，可选择是否添加特殊符号
encode(无特殊符号): [59563, 47653, 102667, 107226, 32938, 126047, 9554, 102667]
encode(有特殊符号): [128000, 59563, 47653, 102667, 107226, 32938, 126047, 9554, 102667]


In [65]:

print(
    "\n--- 3. tokenizer() 方法 ---",
    "tokenizer() 是完整编码方法，返回字典包含input_ids和attention_mask",
    sep="\n",
)

# Calling the tokenizer object directly gives the full encoding
encoded = tokenizer(text)
print(
    f"tokenizer()结果: {encoded}",
    f"input_ids: {encoded['input_ids']}",
    f"attention_mask: {encoded['attention_mask']}",
    sep="\n",
)



--- 3. tokenizer() 方法 ---
tokenizer() 是完整编码方法，返回字典包含input_ids和attention_mask
tokenizer()结果: {'input_ids': [128000, 59563, 47653, 102667, 107226, 32938, 126047, 9554, 102667], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
input_ids: [128000, 59563, 47653, 102667, 107226, 32938, 126047, 9554, 102667]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1]


In [66]:

print("\n--- 4. decode() 方法 ---", "decode() 将token ID转换回文本", sep="\n")

# Round-trip the three id sequences produced by the earlier encode()/tokenizer() cells
decoded_no_special = tokenizer.decode(ids_no_special)
decoded_with_special = tokenizer.decode(ids_with_special)
decoded_full = tokenizer.decode(encoded['input_ids'])

# Quotes in the output make leading/trailing whitespace visible
print(f"解码(无特殊符号): '{decoded_no_special}'")
print(f"解码(有特殊符号): '{decoded_with_special}'")
print(f"解码(完整编码): '{decoded_full}'")



--- 4. decode() 方法 ---
decode() 将token ID转换回文本
解码(无特殊符号): '南京大学是一所优秀的大学'
解码(有特殊符号): '<|begin_of_text|>南京大学是一所优秀的大学'
解码(完整编码): '<|begin_of_text|>南京大学是一所优秀的大学'


In [67]:

print("\n--- 5. 特殊符号处理 ---", "查看特殊符号的作用", sep="\n")

# Compare lengths: characters, string tokens, ids without and with special tokens
print(
    f"原始文本长度: {len(text)}",
    f"tokenize后token数: {len(tokens)}",
    f"无特殊符号ID数: {len(ids_no_special)}",
    f"有特殊符号ID数: {len(ids_with_special)}",
    sep="\n",
)

# Special tokens configured on this tokenizer (a token may be None if not defined)
print(f"\n特殊token信息:")
print(
    f"BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})",
    f"EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})",
    f"PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})",
    f"UNK token: {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})",
    sep="\n",
)



--- 5. 特殊符号处理 ---
查看特殊符号的作用
原始文本长度: 12
tokenize后token数: 10
无特殊符号ID数: 8
有特殊符号ID数: 9

特殊token信息:
BOS token: <|begin_of_text|> (ID: 128000)
EOS token: <|eot_id|> (ID: 128009)
PAD token: <|eot_id|> (ID: 128009)
UNK token: None (ID: None)


In [70]:

print("\n--- 6. 批量处理示例 ---")
texts = ["你好", "南京大学", "人工智能"]
print(f"批量文本: {texts}")

# Encode the whole list in one call and get PyTorch tensors back
batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
print(f"批量编码结果:")
print(
    f"input_ids形状: {batch_encoded['input_ids'].shape}",
    f"attention_mask形状: {batch_encoded['attention_mask'].shape}",
    f"input_ids:\n{batch_encoded['input_ids']}",
    sep="\n",
)

# Decode every row back to text, dropping special tokens
batch_decoded = tokenizer.batch_decode(batch_encoded['input_ids'], skip_special_tokens=True)
print(f"批量解码结果: {batch_decoded}")



--- 6. 批量处理示例 ---
批量文本: ['你好', '南京大学', '人工智能']
批量编码结果:
input_ids形状: torch.Size([3, 4])
attention_mask形状: torch.Size([3, 4])
input_ids:
tensor([[128000,  57668,  53901, 128009],
        [128000,  59563,  47653, 102667],
        [128000,  17792,  49792, 118034]])
批量解码结果: ['你好', '南京大学', '人工智能']


# PyTorch Tensor操作详解

本部分将详细讲解PyTorch中常用的tensor操作，包括形状变换、数学运算、索引选择等，并结合LLM中的实际应用场景。


In [15]:
# Supplement: how broadcasting works

print("=== Broadcasting 基础概念 ===")
import torch

# 1. Broadcasting a Python scalar against a tensor
print("--- 标量与tensor ---")
a = torch.tensor([1, 2, 3, 4])
b = 2
c = torch.add(a, b)  # the scalar is applied to every element of `a`
print(f"a: {a}", f"b: {b}", f"a + b: {c}", sep="\n")
print(f"形状: a={a.shape}, b=标量, c={c.shape}")


=== Broadcasting 基础概念 ===
--- 标量与tensor ---
a: tensor([1, 2, 3, 4])
b: 2
a + b: tensor([3, 4, 5, 6])
形状: a=torch.Size([4]), b=标量, c=torch.Size([4])


In [18]:

# 2. Broadcasting between tensors of different rank
print("\n--- 不同形状tensor的广播 ---")
A = torch.randn(3, 4)
B = torch.randn(4)
print(f"A形状: {A.shape}", f"B形状: {B.shape}", sep="\n")

print("A:", A)
print("B:", B)

# B is treated as [4] -> [1, 4] -> [3, 4] before the element-wise add
C = torch.add(A, B)
print(f"A + B形状: {C.shape}")
print("C:", C)
print(f"广播是否成功: {C.shape == (3, 4)}")



--- 不同形状tensor的广播 ---
A形状: torch.Size([3, 4])
B形状: torch.Size([4])
A: tensor([[ 0.5040,  0.4135,  1.1673, -0.1042],
        [-0.3271,  1.1358,  0.3800,  0.4512],
        [-1.0532, -1.5775,  0.7869,  1.0367]])
B: tensor([-0.2392,  0.6887,  1.0550, -1.1013])
A + B形状: torch.Size([3, 4])
C: tensor([[ 0.2647,  1.1022,  2.2223, -1.2055],
        [-0.5663,  1.8245,  1.4349, -0.6501],
        [-1.2924, -0.8888,  1.8419, -0.0646]])
广播是否成功: True


In [20]:
# Consistency check: broadcasting vs. an explicitly materialised operand

print("=== 使用allclose验证Broadcasting结果一致性 ===")

# 1. Scalar broadcast
print("--- 1. 标量广播验证 ---")
a = torch.tensor([1.0, 2.0, 3.0, 4.0])
b = 2.0

# Path 1: let broadcasting handle the scalar
result_broadcast = a + b

# Path 2: build a tensor shaped like `a` filled with the scalar, then add
b_filled = torch.full_like(a, b)
result_manual = a + b_filled

print(b_filled)

print(f"原始tensor: {a}", f"标量: {b}", sep="\n")
print(f"Broadcasting结果: {result_broadcast}", f"手动编程结果: {result_manual}", sep="\n")
print(f"结果是否一致: {torch.allclose(result_broadcast, result_manual)}")
print(f"最大差异: {torch.max(torch.abs(result_broadcast - result_manual)):.10f}")


=== 使用allclose验证Broadcasting结果一致性 ===
--- 1. 标量广播验证 ---
tensor([2., 2., 2., 2.])
原始tensor: tensor([1., 2., 3., 4.])
标量: 2.0
Broadcasting结果: tensor([3., 4., 5., 6.])
手动编程结果: tensor([3., 4., 5., 6.])
结果是否一致: True
最大差异: 0.0000000000


In [23]:

# 2. Vector broadcast
print("\n--- 2. 向量广播验证 ---")
A = torch.randn(3, 4)
B = torch.randn(4)

# Path 1: implicit broadcasting
result_broadcast = A + B

# Path 2: add a leading axis, then expand explicitly to A's shape
B_expanded = B[None, :].expand(3, 4)
result_manual = A + B_expanded

print("B:", B)
print("B_expanded:", B_expanded)
print("shape of B_expanded:", B_expanded.shape)

print(f"矩阵A形状: {A.shape}", f"向量B形状: {B.shape}", sep="\n")
print(
    f"Broadcasting结果形状: {result_broadcast.shape}",
    f"手动编程结果形状: {result_manual.shape}",
    sep="\n",
)
print(f"结果是否一致: {torch.allclose(result_broadcast, result_manual)}")
print(f"最大差异: {torch.max(torch.abs(result_broadcast - result_manual)):.10f}")



--- 2. 向量广播验证 ---
B: tensor([-1.1137,  1.0270, -0.8012,  0.7476])
B_expanded: tensor([[-1.1137,  1.0270, -0.8012,  0.7476],
        [-1.1137,  1.0270, -0.8012,  0.7476],
        [-1.1137,  1.0270, -0.8012,  0.7476]])
shape of B_expanded: torch.Size([3, 4])
矩阵A形状: torch.Size([3, 4])
向量B形状: torch.Size([4])
Broadcasting结果形状: torch.Size([3, 4])
手动编程结果形状: torch.Size([3, 4])
结果是否一致: True
最大差异: 0.0000000000


In [27]:

# 3. Broadcasting a [3, 1] tensor against a [2, 3, 4] tensor
print("\n--- 3. 复杂广播验证 ---")
A = torch.randn(2, 3, 4)
B = torch.randn(3, 1)

# Path 1: implicit broadcasting
result_broadcast = A + B

# Path 2: prepend a batch axis and expand to A's full shape by hand
B_expanded = B[None].expand(2, 3, 4)
result_manual = A + B_expanded

print("B:", B)
print("B_expanded:", B_expanded)
print("shape of B_expanded:", B_expanded.shape)


print(f"3D tensor A形状: {A.shape}", f"2D tensor B形状: {B.shape}", sep="\n")
print(
    f"Broadcasting结果形状: {result_broadcast.shape}",
    f"手动编程结果形状: {result_manual.shape}",
    sep="\n",
)
print(f"结果是否一致: {torch.allclose(result_broadcast, result_manual)}")
print(f"最大差异: {torch.max(torch.abs(result_broadcast - result_manual)):.10f}")



--- 3. 复杂广播验证 ---
B: tensor([[1.7184],
        [0.7701],
        [0.8592]])
B_expanded: tensor([[[1.7184, 1.7184, 1.7184, 1.7184],
         [0.7701, 0.7701, 0.7701, 0.7701],
         [0.8592, 0.8592, 0.8592, 0.8592]],

        [[1.7184, 1.7184, 1.7184, 1.7184],
         [0.7701, 0.7701, 0.7701, 0.7701],
         [0.8592, 0.8592, 0.8592, 0.8592]]])
shape of B_expanded: torch.Size([2, 3, 4])
3D tensor A形状: torch.Size([2, 3, 4])
2D tensor B形状: torch.Size([3, 1])
Broadcasting结果形状: torch.Size([2, 3, 4])
手动编程结果形状: torch.Size([2, 3, 4])
结果是否一致: True
最大差异: 0.0000000000


In [29]:

# 4. Broadcasting inside matrix multiplication
print("\n--- 4. 矩阵乘法中的广播验证 ---")
A = torch.randn(3, 4)
B = torch.randn(2, 4, 5)

# Path 1: matmul broadcasts the 2-D A across B's batch axis
result_broadcast = A @ B

# Path 2: expand A to a batch of two, then use bmm (which needs 3-D operands)
A_expanded = A[None].expand(2, 3, 4)
result_manual = torch.bmm(A_expanded, B)


print("A:", A)
print("A_expanded:", A_expanded)
print("shape of A_expanded:", A_expanded.shape)

print(f"矩阵A形状: {A.shape}", f"批量矩阵B形状: {B.shape}", sep="\n")
print(
    f"Broadcasting matmul结果形状: {result_broadcast.shape}",
    f"手动bmm结果形状: {result_manual.shape}",
    sep="\n",
)
print(f"结果是否一致: {torch.allclose(result_broadcast, result_manual)}")
print(f"最大差异: {torch.max(torch.abs(result_broadcast - result_manual)):.10f}")



--- 4. 矩阵乘法中的广播验证 ---
A: tensor([[ 0.3280,  1.1160,  0.6442,  1.5937],
        [-0.2867,  1.2730,  0.4498,  1.7053],
        [-0.9767,  0.2935,  0.9173, -0.4949]])
A_expanded: tensor([[[ 0.3280,  1.1160,  0.6442,  1.5937],
         [-0.2867,  1.2730,  0.4498,  1.7053],
         [-0.9767,  0.2935,  0.9173, -0.4949]],

        [[ 0.3280,  1.1160,  0.6442,  1.5937],
         [-0.2867,  1.2730,  0.4498,  1.7053],
         [-0.9767,  0.2935,  0.9173, -0.4949]]])
shape of A_expanded: torch.Size([2, 3, 4])
矩阵A形状: torch.Size([3, 4])
批量矩阵B形状: torch.Size([2, 4, 5])
Broadcasting matmul结果形状: torch.Size([2, 3, 5])
手动bmm结果形状: torch.Size([2, 3, 5])
结果是否一致: True
最大差异: 0.0000000000


In [30]:

# 5. Broadcasting in a batch-norm style per-channel standardisation
# Note: this uses torch.std with its default settings and adds the epsilon to the
# standard deviation, so it illustrates broadcasting rather than reproducing
# nn.BatchNorm2d exactly.
print("\n--- 5. 批量归一化中的广播验证 ---")
x = torch.randn(32, 64, 28, 28)
reduce_dims = (0, 2, 3)  # every axis except the channel axis (dim 1)

# Path 1: keepdim=True leaves [1, 64, 1, 1] statistics that broadcast against x
mean_broadcast = x.mean(dim=reduce_dims, keepdim=True)
std_broadcast = x.std(dim=reduce_dims, keepdim=True)
normalized_broadcast = (x - mean_broadcast) / (std_broadcast + 1e-8)

# Path 2: reduce to [64], then reshape and expand to x's shape by hand
mean_manual = x.mean(dim=reduce_dims)  # [64]
std_manual = x.std(dim=reduce_dims)    # [64]
mean_expanded = mean_manual.view(1, 64, 1, 1).expand_as(x)
std_expanded = std_manual.view(1, 64, 1, 1).expand_as(x)
normalized_manual = (x - mean_expanded) / (std_expanded + 1e-8)

print(f"输入形状: {x.shape}")
print(
    f"Broadcasting方法结果形状: {normalized_broadcast.shape}",
    f"手动扩展方法结果形状: {normalized_manual.shape}",
    sep="\n",
)
print(f"结果是否一致: {torch.allclose(normalized_broadcast, normalized_manual)}")
print(f"最大差异: {torch.max(torch.abs(normalized_broadcast - normalized_manual)):.10f}")



--- 5. 批量归一化中的广播验证 ---
输入形状: torch.Size([32, 64, 28, 28])
Broadcasting方法结果形状: torch.Size([32, 64, 28, 28])
手动扩展方法结果形状: torch.Size([32, 64, 28, 28])
结果是否一致: True
最大差异: 0.0000000000


In [31]:

# 6. Broadcasting an attention mask over a batch of score matrices
print("\n--- 6. 注意力机制中的广播验证 ---")
scores = torch.randn(2, 8, 8)
# Keep-mask over the last axis: 1.0 = position may be attended to, 0.0 = blocked
mask = torch.tensor([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])

# Convert the 0/1 keep-mask into an additive mask: 0 where attention is allowed,
# the most negative finite value of the score dtype where it is blocked, so a
# following softmax gives blocked positions ~0 weight.
# Adding the raw 0/1 mask (previous behaviour) only shifted the visible scores by +1
# and left the blocked positions untouched, i.e. nothing was masked.
# A finite minimum is used instead of -inf so the difference printed below stays
# well defined (inf - inf would be NaN).
additive_mask = torch.zeros_like(mask).masked_fill(mask == 0, torch.finfo(scores.dtype).min)

# Path 1: add two leading singleton axes and let broadcasting do the rest
masked_scores_broadcast = scores + additive_mask.unsqueeze(0).unsqueeze(0)

# Path 2: reshape to [1, 1, 8] and expand explicitly to the scores' shape
mask_expanded = additive_mask.view(1, 1, 8).expand(2, 8, 8)
masked_scores_manual = scores + mask_expanded

print(f"注意力分数形状: {scores.shape}")
print(f"mask形状: {mask.shape}")
print(f"Broadcasting方法结果形状: {masked_scores_broadcast.shape}")
print(f"手动扩展方法结果形状: {masked_scores_manual.shape}")
print(f"结果是否一致: {torch.allclose(masked_scores_broadcast, masked_scores_manual)}")
print(f"最大差异: {torch.max(torch.abs(masked_scores_broadcast - masked_scores_manual)):.10f}")

print("\n=== 总结 ===")
print("✓ 所有broadcasting运算结果都与手动编程结果完全一致")
print("✓ 最大差异都在机器精度范围内（~1e-10）")
print("✓ 这证明了broadcasting机制的正确性和可靠性")



--- 6. 注意力机制中的广播验证 ---
注意力分数形状: torch.Size([2, 8, 8])
mask形状: torch.Size([8])
Broadcasting方法结果形状: torch.Size([2, 8, 8])
手动扩展方法结果形状: torch.Size([2, 8, 8])
结果是否一致: True
最大差异: 0.0000000000

=== 总结 ===
✓ 所有broadcasting运算结果都与手动编程结果完全一致
✓ 最大差异都在机器精度范围内（~1e-10）
✓ 这证明了broadcasting机制的正确性和可靠性


In [33]:

# Example of a broadcast that fails, followed by a summary of the rules
print("\n--- 广播失败示例 ---")
try:
    A = torch.randn(3, 4)
    B = torch.randn(5)  # trailing dims are 4 vs 5: neither equal nor 1, so incompatible
    C = A + B
except RuntimeError as e:
    print(f"广播失败: {e}")

print("\n--- 广播规则总结 ---")
print("1. 从右向左对齐维度")
print("2. 维度必须兼容：相等、其中一个为1、或其中一个不存在")
# Rule 3 used to claim that incompatible dims are expanded automatically, which
# contradicts the RuntimeError shown above: only size-1 or missing dims are expanded.
print("3. 大小为1或缺失的维度会自动扩展；不兼容的维度会报错")



--- 广播失败示例 ---
广播失败: The size of tensor a (4) must match the size of tensor b (5) at non-singleton dimension 1

--- 广播规则总结 ---
1. 从右向左对齐维度
2. 维度必须兼容：相等、其中一个为1、或其中一个不存在
3. 不兼容的维度会自动扩展


In [35]:
# 1. Shape-changing operations: view vs reshape

print("=== view() vs reshape() 对比 ===")
import torch

# Sample tensor used by the following view/reshape cells
x = torch.randn(2, 3, 4)
print(
    f"原始tensor形状: {x.shape}",
    f"原始tensor是否连续: {x.is_contiguous()}",
    sep="\n",
)


=== view() vs reshape() 对比 ===
原始tensor形状: torch.Size([2, 3, 4])
原始tensor是否连续: True


In [36]:

# Flatten the first two axes (2*3 = 6) with both APIs
y1 = x.view(6, 4)             # view()
y2 = torch.reshape(x, (6, 4))  # reshape()

print(f"view(6, 4)后形状: {y1.shape}", f"view后是否连续: {y1.is_contiguous()}", sep="\n")
print(f"reshape(6, 4)后形状: {y2.shape}", f"reshape后是否连续: {y2.is_contiguous()}", sep="\n")



view(6, 4)后形状: torch.Size([6, 4])
view后是否连续: True
reshape(6, 4)后形状: torch.Size([6, 4])
reshape后是否连续: True


In [39]:
print("\n=== 不连续tensor的处理 ===")
# Swapping dims 0 and 1 leaves a tensor that is no longer contiguous (flag printed below)
x_transposed = torch.transpose(x, 0, 1)
print(f"转置后是否连续: {x_transposed.is_contiguous()}")

# view() raises on this non-contiguous tensor; catch the error and show its message
try:
    y3 = x_transposed.view(12, 2)
except RuntimeError as e:
    print(f"view()报错: {e}")


=== 不连续tensor的处理 ===
转置后是否连续: False
view()报错: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


In [40]:



# reshape() copes with the non-contiguous tensor that view() rejected
y4 = torch.reshape(x_transposed, (12, 2))
print(f"reshape()成功: {y4.shape}")

print("\n=== 内存共享验证 ===")
# Write through the original tensor, then read the same element back via
# y1 (from view) and y2 (from reshape) to see whether they observe the change.
x[0, 0, 0] = 999
print(
    f"修改原始tensor后，view结果: {y1[0, 0]}",
    f"修改原始tensor后，reshape结果: {y2[0, 0]}",
    sep="\n",
)


reshape()成功: torch.Size([12, 2])

=== 内存共享验证 ===
修改原始tensor后，view结果: 999.0
修改原始tensor后，reshape结果: 999.0


In [41]:
# 2. Reordering axes: transpose vs permute

print("=== transpose() - 交换两个维度 ===")
x = torch.randn(2, 3, 4, 5)
print(f"原始tensor形状: {x.shape}")

# transpose() swaps exactly two axes, here 1 and 3
y1 = torch.transpose(x, 1, 3)
print(f"transpose(1, 3)后形状: {y1.shape}")

# Spot check: element [0, 1, 2, 3] of x sits at [0, 3, 2, 1] after the swap
print(
    f"原始tensor[0, 1, 2, 3] = {x[0, 1, 2, 3]}",
    f"转置后tensor[0, 3, 2, 1] = {y1[0, 3, 2, 1]}",
    f"两者是否相等: {x[0, 1, 2, 3] == y1[0, 3, 2, 1]}",
    sep="\n",
)


=== transpose() - 交换两个维度 ===
原始tensor形状: torch.Size([2, 3, 4, 5])
transpose(1, 3)后形状: torch.Size([2, 5, 4, 3])
原始tensor[0, 1, 2, 3] = -1.43731689453125
转置后tensor[0, 3, 2, 1] = -1.43731689453125
两者是否相等: True


In [42]:

print("\n=== permute() - 重新排列所有维度 ===")
# permute() takes the complete new axis order in one call
y2 = torch.permute(x, (0, 3, 1, 2))
print(f"permute(0, 3, 1, 2)后形状: {y2.shape}")

# Spot check: element [0, 1, 2, 3] of x sits at [0, 3, 1, 2] after the permutation
print(
    f"原始tensor[0, 1, 2, 3] = {x[0, 1, 2, 3]}",
    f"permute后tensor[0, 3, 1, 2] = {y2[0, 3, 1, 2]}",
    f"两者是否相等: {x[0, 1, 2, 3] == y2[0, 3, 1, 2]}",
    sep="\n",
)

print("\n=== 连续性问题 ===")
print(
    f"原始tensor是否连续: {x.is_contiguous()}",
    f"transpose后是否连续: {y1.is_contiguous()}",
    f"permute后是否连续: {y2.is_contiguous()}",
    sep="\n",
)

# contiguous() produces a contiguous tensor when one is required
y1_cont = y1.contiguous()
print(f"contiguous()后是否连续: {y1_cont.is_contiguous()}")



=== permute() - 重新排列所有维度 ===
permute(0, 3, 1, 2)后形状: torch.Size([2, 5, 3, 4])
原始tensor[0, 1, 2, 3] = -1.43731689453125
permute后tensor[0, 3, 1, 2] = -1.43731689453125
两者是否相等: True

=== 连续性问题 ===
原始tensor是否连续: True
transpose后是否连续: False
permute后是否连续: False
contiguous()后是否连续: True


In [44]:
# 3. Adding and removing axes: squeeze vs unsqueeze

print("=== squeeze() - 移除大小为1的维度 ===")
x = torch.randn(1, 3, 1, 4)
print(f"原始tensor形状: {x.shape}")

# No argument: drop every size-1 axis
y1 = torch.squeeze(x)
# With an axis: drop only that axis (dim 0, then dim 2)
y2 = torch.squeeze(x, 0)
y3 = torch.squeeze(x, 2)

print(f"squeeze()后形状: {y1.shape}")
print(f"squeeze(0)后形状: {y2.shape}")
print(f"squeeze(2)后形状: {y3.shape}")


=== squeeze() - 移除大小为1的维度 ===
原始tensor形状: torch.Size([1, 3, 1, 4])
squeeze()后形状: torch.Size([3, 4])
squeeze(0)后形状: torch.Size([3, 1, 4])
squeeze(2)后形状: torch.Size([1, 3, 4])


In [45]:

print("\n=== unsqueeze() - 在指定位置插入大小为1的维度 ===")
x = torch.randn(3, 4)
print(f"原始tensor形状: {x.shape}")

# Insert a size-1 axis at the front, at the end, and in the middle
y1 = torch.unsqueeze(x, 0)
y2 = torch.unsqueeze(x, -1)
y3 = torch.unsqueeze(x, 1)

print(f"unsqueeze(0)后形状: {y1.shape}")
print(f"unsqueeze(-1)后形状: {y2.shape}")
print(f"unsqueeze(1)后形状: {y3.shape}")



=== unsqueeze() - 在指定位置插入大小为1的维度 ===
原始tensor形状: torch.Size([3, 4])
unsqueeze(0)后形状: torch.Size([1, 3, 4])
unsqueeze(-1)后形状: torch.Size([3, 4, 1])
unsqueeze(1)后形状: torch.Size([3, 1, 4])


In [47]:
# Indexing and selection operations

print("=== gather() - 按索引收集元素 ===")
# Source tensor and one column index per row, used by the gather cell that follows
x = torch.randn(3, 4)
indices = torch.tensor([0, 2, 1])

print(f"原始tensor:\n{x}")
print(f"索引: {indices}")


=== gather() - 按索引收集元素 ===
原始tensor:
tensor([[ 0.3435,  0.6903,  1.0170, -0.5715],
        [ 1.4580,  1.1166,  0.1565, -0.3784],
        [-2.4508, -1.1759,  2.2299,  1.1512]])
索引: tensor([0, 2, 1])


In [52]:

# Gather along dim 1: the index is reshaped to [3, 1] so each row picks one column
y = x.gather(1, indices.unsqueeze(1))
print(f"gather结果:\n{y}")
print(f"解释: 每行按indices选择元素")

# Cross-check against plain indexing, row by row
print(
    f"第0行选择第{indices[0]}列: {x[0, indices[0]]}",
    f"第1行选择第{indices[1]}列: {x[1, indices[1]]}",
    f"第2行选择第{indices[2]}列: {x[2, indices[2]]}",
    sep="\n",
)


gather结果:
tensor([[ 0.3435],
        [ 0.1565],
        [-1.1759]])
解释: 每行按indices选择元素
第0行选择第0列: 0.34346070885658264
第1行选择第2列: 0.15652576088905334
第2行选择第1列: -1.175888180732727


In [53]:

print("\n=== scatter() - 按索引分散元素 ===")
# Inputs for the scatter cell that follows: a zero target, the values to write,
# and the column index for each value.
x_scatter = torch.zeros((3, 4))
values = torch.randn(3, 2)
indices_scatter = torch.tensor([[0, 2], [1, 3], [0, 1]])

print(
    f"目标tensor:\n{x_scatter}",
    f"要分散的值:\n{values}",
    f"分散索引:\n{indices_scatter}",
    sep="\n",
)



=== scatter() - 按索引分散元素 ===
目标tensor:
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])
要分散的值:
tensor([[ 0.9913, -0.8953],
        [-1.0821,  0.8034],
        [-0.0603, -0.0386]])
分散索引:
tensor([[0, 2],
        [1, 3],
        [0, 1]])


In [54]:

# In-place scatter along dim 1: values[i][j] is written to x_scatter[i][indices_scatter[i][j]]
x_scatter.scatter_(1, indices_scatter, values)
print(f"scatter后结果:\n{x_scatter}")

print("\n=== index_select() - 按索引选择 ===")
# Pick rows 0, 2 and 4 (dim 0) from a fresh 5 x 3 tensor
x = torch.randn(5, 3)
indices = torch.tensor([0, 2, 4])
selected = x.index_select(0, indices)

print(f"原始tensor形状: {x.shape}", f"选择的索引: {indices}", sep="\n")
print(f"选择结果形状: {selected.shape}", f"选择结果:\n{selected}", sep="\n")


scatter后结果:
tensor([[ 0.9913,  0.0000, -0.8953,  0.0000],
        [ 0.0000, -1.0821,  0.0000,  0.8034],
        [-0.0603, -0.0386,  0.0000,  0.0000]])

=== index_select() - 按索引选择 ===
原始tensor形状: torch.Size([5, 3])
选择的索引: tensor([0, 2, 4])
选择结果形状: torch.Size([3, 3])
选择结果:
tensor([[-0.9093, -0.1662,  0.6354],
        [ 0.3478, -0.7638,  0.6484],
        [-1.5842,  1.6928,  0.2203]])


In [55]:

print("\n=== masked_select() - 按掩码选择 ===")
# Boolean mask marking the entries greater than 0.5
x = torch.randn(3, 4)
mask = x.gt(0.5)
print(f"原始tensor:\n{x}", f"掩码 (>0.5):\n{mask}", sep="\n")

# Keep only the entries where the mask is True
selected_values = x.masked_select(mask)
print(
    f"按掩码选择的值: {selected_values}",
    f"选择的值数量: {len(selected_values)}",
    sep="\n",
)



=== masked_select() - 按掩码选择 ===
原始tensor:
tensor([[-0.1824,  0.3434, -1.7025, -1.3546],
        [-0.0932, -1.2863,  0.3139,  1.1156],
        [-0.1857,  1.6313, -1.2955, -0.0053]])
掩码 (>0.5):
tensor([[False, False, False, False],
        [False, False, False,  True],
        [False,  True, False, False]])
按掩码选择的值: tensor([1.1156, 1.6313])
选择的值数量: 2


In [56]:
# Load the causal language model from `model_name`.
# Only the model is loaded here; the tokenizer comes from an earlier cell.
# dtype and device placement are both left on their "auto" settings.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

In [59]:
# Print the module tree of the loaded model (layer types and their dimensions).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):