# 第2讲: 特征空间的变换 - 前反向运行视角理解深度学习模型

本notebook将演示深度学习模型的核心概念：
- Tensor基础操作
- 矩阵乘法和空间变换
- 线性层的构建和使用
- 反向传播和自动求导
- 完整的训练循环


## 1. 导入必要的库


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Seed the PyTorch and NumPy random number generators so results are reproducible
torch.manual_seed(42)
np.random.seed(42)

# Report the installed PyTorch version and which accelerator backends are available
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
print(f"MPS可用: {torch.backends.mps.is_available()}")


PyTorch版本: 2.8.0
CUDA可用: False
MPS可用: True


## 2. Tensor基础概念

Tensor是PyTorch中最基础的数据结构，可以理解为多维数组。


In [2]:
# Create tensors of different ranks and print each one together with its shape
print("=== Tensor维度演示 ===")

# Scalar (0-dimensional): shape is the empty torch.Size([])
scalar = torch.tensor(3.14)
print(f"标量: {scalar}, shape: {scalar.shape}")

# Vector (1-dimensional)
vector = torch.tensor([1, 2, 3, 4])
print(f"向量: {vector}, shape: {vector.shape}")

# Matrix (2-dimensional)
matrix = torch.tensor([[1, 2], [3, 4]])
print(f"矩阵: {matrix}, shape: {matrix.shape}")

# Higher-rank tensor (commonly used for inputs/outputs/activations)
high_dim = torch.randn(2, 3, 4, 5)
print(f"高维Tensor: shape: {high_dim.shape}")

# Random initialisation: a 3x4 tensor filled by torch.randn
random_tensor = torch.randn(3, 4)
print(f"随机Tensor: {random_tensor}")


=== Tensor维度演示 ===
标量: 3.140000104904175, shape: torch.Size([])
向量: tensor([1, 2, 3, 4]), shape: torch.Size([4])
矩阵: tensor([[1, 2],
        [3, 4]]), shape: torch.Size([2, 2])
高维Tensor: shape: torch.Size([2, 3, 4, 5])
随机Tensor: tensor([[-0.2387, -0.5050, -2.4752, -0.9316],
        [-0.1335,  0.3415, -0.0716, -0.0909],
        [-1.3297, -0.5426,  0.5471,  0.6431]])


## 3. 矩阵乘法操作

深度学习模型的核心是矩阵乘法，实现特征空间的变换：$Y = XW + b$


In [12]:
# Demonstrates Y = XW + b computed with three equivalent matrix-multiplication spellings
print("=== 矩阵乘法演示 ===")

# Input matrix X (batch_size=2, input_dim=3)
X = torch.randn(2, 3)
print(f"输入 X: {X}")
print(f"X shape: {X.shape}")

# Weight matrix W (input_dim=3, output_dim=4)
W = torch.randn(3, 4)
print(f"权重 W: {W}")
print(f"W shape: {W.shape}")

# Bias b (output_dim=4); the 1-D bias is broadcast across the batch rows when added below
b = torch.randn(4)
print(f"偏置 b: {b}")
print(f"b shape: {b.shape}")

# Method 1: the @ operator
Y1 = X @ W + b
print(f"\n方法1 - @ 操作符结果: {Y1}")
print(f"Y1 shape: {Y1.shape}")

# Method 2: the torch.matmul function
Y2 = torch.matmul(X, W) + b
print(f"\n方法2 - torch.matmul结果: {Y2}")

# Method 3: the Tensor.matmul method
Y3 = X.matmul(W) + b
print(f"\n方法3 - tensor.matmul结果: {Y3}")

# Confirm that all three spellings produce the same result
print(f"\n结果是否相同: {torch.allclose(Y1, Y2) and torch.allclose(Y2, Y3)}")


=== 矩阵乘法演示 ===
输入 X: tensor([[ 2.9246, -0.7985, -0.5669],
        [-0.0267, -1.5460, -2.1799]])
X shape: torch.Size([2, 3])
权重 W: tensor([[ 0.2074, -1.9844, -0.1817,  0.4280],
        [ 0.2545,  0.0662, -1.2704,  0.6674],
        [-1.0002, -0.0244,  0.2497, -1.7517]])
W shape: torch.Size([3, 4])
偏置 b: tensor([-0.1309,  1.4378, -0.1544, -0.2853])
b shape: torch.Size([4])

方法1 - @ 操作符结果: tensor([[ 0.8393, -4.4045,  0.1872,  1.4264],
        [ 1.6504,  1.4418,  1.2702,  2.4900]])
Y1 shape: torch.Size([2, 4])

方法2 - torch.matmul结果: tensor([[ 0.8393, -4.4045,  0.1872,  1.4264],
        [ 1.6504,  1.4418,  1.2702,  2.4900]])

方法3 - tensor.matmul结果: tensor([[ 0.8393, -4.4045,  0.1872,  1.4264],
        [ 1.6504,  1.4418,  1.2702,  2.4900]])

结果是否相同: True


## 4. einsum操作

einsum使用爱因斯坦求和约定，可以直观地描述张量运算。


In [4]:
print("=== einsum操作演示 ===")

# Operands for Y = XW: X is (batch=2, input_dim=3), W is (input_dim=3, output_dim=4).
# The tuple is evaluated left to right, so X is drawn from the RNG before W.
X, W = torch.randn(2, 3), torch.randn(3, 4)

# Reference result from the ordinary matrix product.
Y_matmul = torch.matmul(X, W)

# The same contraction in Einstein notation: Y_{be} = sum_d X_{bd} * W_{de}
# (the repeated index d is summed over; b and e survive into the output).
Y_einsum = torch.einsum("bd,de->be", X, W)

print(f"einsum结果: {Y_einsum}")
print(f"matmul结果: {Y_matmul}")
print(f"结果相同: {torch.allclose(Y_einsum, Y_matmul)}")



=== einsum操作演示 ===
einsum结果: tensor([[-0.9760, -0.2255, -1.2338, -1.6805],
        [ 1.3851,  0.7489,  0.5305, -1.9812]])
matmul结果: tensor([[-0.9760, -0.2255, -1.2338, -1.6805],
        [ 1.3851,  0.7489,  0.5305, -1.9812]])
结果相同: True


In [13]:
# Batched matrix multiplication: einsum vs torch.bmm vs a per-item matrix product
A = torch.randn(10, 3, 4)  # batch=10
B = torch.randn(10, 4, 5)

# Math form: Y_{bij} = sum_k A_{bik} * B_{bkj}
Y_batch = torch.einsum("bik,bkj->bij", A, B)
# torch.bmm computes the same batched product (A and B must have shapes [batch, n, m] and [batch, m, p])
Y_batch_bmm = torch.bmm(A, B)
# NOTE: despite its name, this is the ordinary matrix product (@) of batch item 0
# (A[0] is 3x4, B[0] is 4x5), not an element-wise product. It is the reference
# against which the first slice of each batched result is compared below.
b1_elementwise_mul = A[0] @ B[0]

print(f"b1_elementwise_mul: {b1_elementwise_mul}")
print(f"Y_batch[0]: {Y_batch[0]}")
print(f"Y_batch_bmm[0]: {Y_batch_bmm[0]}")
print(f"结果相同: {torch.allclose(b1_elementwise_mul, Y_batch[0]) and torch.allclose(b1_elementwise_mul, Y_batch_bmm[0])}")

# Compare the full batched results, not just the first slice
print(f"torch.bmm结果shape: {Y_batch_bmm.shape}")
print(f"einsum和bmm结果相同: {torch.allclose(Y_batch, Y_batch_bmm)}")
print(f"\n批量矩阵乘法结果shape: {Y_batch.shape}")

b1_elementwise_mul: tensor([[ 2.6000, -2.3622, -4.1707,  0.7482, -6.6441],
        [-0.6081,  0.0969,  3.0589, -0.7106,  2.9023],
        [-0.0136,  0.3155, -1.6328,  0.0387, -1.5945]])
Y_batch[0]: tensor([[ 2.6000, -2.3622, -4.1707,  0.7482, -6.6441],
        [-0.6081,  0.0969,  3.0589, -0.7106,  2.9023],
        [-0.0136,  0.3155, -1.6328,  0.0387, -1.5945]])
Y_batch_bmm[0]: tensor([[ 2.6000, -2.3622, -4.1707,  0.7482, -6.6441],
        [-0.6081,  0.0969,  3.0589, -0.7106,  2.9023],
        [-0.0136,  0.3155, -1.6328,  0.0387, -1.5945]])
结果相同: True
torch.bmm结果shape: torch.Size([10, 3, 5])
einsum和bmm结果相同: True

批量矩阵乘法结果shape: torch.Size([10, 3, 5])


In [6]:
# Element-wise (Hadamard) product: both indices i and j are kept in the einsum output,
# so nothing is summed and the result has the same 2x3 shape as the operands.
A, B = torch.randn(2, 3), torch.randn(2, 3)
elementwise_mul_manual = A * B
elementwise_mul = torch.einsum("ij,ij->ij", A, B)
print(f"\n元素乘法 - einsum: {elementwise_mul}")
print(f"元素乘法 - 手动: {elementwise_mul_manual}")
print(f"结果相同: {torch.allclose(elementwise_mul, elementwise_mul_manual)}")

# Dot product: the shared index i is absent from the einsum output, so it is summed
# away and a 0-dimensional tensor remains.
a, b = torch.randn(3), torch.randn(3)
dot_product_manual = torch.sum(a * b)
dot_product = torch.einsum("i,i->", a, b)
print(f"\n点积 - einsum: {dot_product}")
print(f"点积 - 手动: {dot_product_manual}")



元素乘法 - einsum: tensor([[ 0.0103, -0.0192, -0.9273],
        [-0.4390, -0.2977, -0.3999]])
元素乘法 - 手动: tensor([[ 0.0103, -0.0192, -0.9273],
        [-0.4390, -0.2977, -0.3999]])
结果相同: True

点积 - einsum: -1.4463235139846802
点积 - 手动: -1.4463235139846802


## 5. nn.Linear层演示

nn.Linear是深度学习模型的基础"积木"，实现线性变换：$Y = XW^\top + b$（PyTorch中权重 $W$ 的形状为 `[out_features, in_features]`，因此前向计算时需要对其转置）


In [None]:
# Builds a single nn.Linear layer, runs a forward pass, and reproduces the result by hand
print("=== nn.Linear层演示 ===")

# Create the linear layer (3 input features -> 4 output features, with bias)
linear_layer = nn.Linear(in_features=3, out_features=4, bias=True)
print(f"线性层: {linear_layer}")
print(f"权重shape: {linear_layer.weight.shape}")
print(f"偏置shape: {linear_layer.bias.shape}")
print(f"权重: {linear_layer.weight.data}")
print(f"偏置: {linear_layer.bias.data}")

# Create the input
x = torch.randn(2, 3)  # batch_size=2, input_dim=3
print(f"\n输入 x: {x}")
print(f"x shape: {x.shape}")

# Forward pass
output = linear_layer(x)
print(f"\n输出: {output}")
print(f"输出shape: {output.shape}")

# Manual check: nn.Linear stores its weight as (out_features, in_features) — here (4, 3) —
# so the weight is transposed before x is right-multiplied by it

manual_output = x @ linear_layer.weight.T + linear_layer.bias
print(f"\n手动计算结果: {manual_output}")
print(f"结果相同: {torch.allclose(output, manual_output)}")


=== nn.Linear层演示 ===
线性层: Linear(in_features=3, out_features=4, bias=True)
权重shape: torch.Size([4, 3])
偏置shape: torch.Size([4])
权重: tensor([[-0.5600, -0.2996,  0.3952],
        [-0.5436, -0.5025,  0.3234],
        [ 0.3115,  0.4748, -0.4359],
        [-0.4226,  0.2962,  0.5021]])
偏置: tensor([0.3455, 0.0904, 0.1903, 0.5480])

输入 x: tensor([[-1.5626,  0.8695,  0.2145],
        [-0.7496, -0.4951,  1.3849]])
x shape: torch.Size([2, 3])

输出: tensor([[ 1.0448,  0.5722,  0.0229,  1.5735],
        [ 1.4609,  1.1946, -0.8819,  1.4134]], grad_fn=<AddmmBackward0>)
输出shape: torch.Size([2, 4])

手动计算结果: tensor([[ 1.0448,  0.5722,  0.0229,  1.5735],
        [ 1.4609,  1.1946, -0.8819,  1.4134]], grad_fn=<AddBackward0>)
结果相同: True


## 6. 多层线性层堆叠

演示如何将多个线性层组合成更复杂的模型。


In [8]:
# Chains two nn.Linear layers (20 -> 30 -> 40) and prints the shape after each one
print("=== 多层线性层堆叠演示 ===")

# Create the two linear layers; layer1's output width (30) must match layer2's input width
layer1 = nn.Linear(20, 30)
layer2 = nn.Linear(30, 40)

print(f"第一层: {layer1}")
print(f"第二层: {layer2}")

# Create the input
x = torch.randn(128, 20)  # batch_size=128, input_dim=20
print(f"\n输入 x shape: {x.shape}")

# Forward pass, one layer at a time
y1 = layer1(x)  # y1 = x @ W1.T + b1 (nn.Linear includes a bias by default)
print(f"第一层输出 y1 shape: {y1.shape}")

y2 = layer2(y1)  # y2 = y1 @ W2.T + b2
print(f"第二层输出 y2 shape: {y2.shape}")



=== 多层线性层堆叠演示 ===
第一层: Linear(in_features=20, out_features=30, bias=True)
第二层: Linear(in_features=30, out_features=40, bias=True)

输入 x shape: torch.Size([128, 20])
第一层输出 y1 shape: torch.Size([128, 30])
第二层输出 y2 shape: torch.Size([128, 40])


In [9]:
# Check how the layers behave on a higher-rank input
# NOTE: this input has 128*4096*30*20 ≈ 3.1e8 elements (about 1.26 GB if stored as 4-byte
# floats) and each layer output is larger still; shrink the leading dimensions on
# memory-constrained machines.
x_high_dim = torch.randn(128, 4096, 30, 20)
print(f"\n高维输入 x_high_dim shape: {x_high_dim.shape}")

# nn.Linear accepts inputs with extra leading dimensions: it transforms only the LAST
# dimension (20 -> 30 -> 40 here) and leaves every leading dimension (128, 4096, 30) unchanged
y1_high = layer1(x_high_dim)
print(f"高维输入第一层输出 shape: {y1_high.shape}")

y2_high = layer2(y1_high)
print(f"高维输入第二层输出 shape: {y2_high.shape}")



高维输入 x_high_dim shape: torch.Size([128, 4096, 30, 20])
高维输入第一层输出 shape: torch.Size([128, 4096, 30, 30])
高维输入第二层输出 shape: torch.Size([128, 4096, 30, 40])


## 7. 反向传播和自动求导

演示PyTorch的自动求导机制，这是深度学习训练的核心。


In [10]:
# Runs one forward/backward pass on a scalar model z = x*w + b with squared-error loss,
# then checks the gradients autograd produced against the hand-derived formulas.
print("=== 反向传播和自动求导演示 ===")

# Scalar leaf tensors; requires_grad=True makes autograd record operations on them
x = torch.tensor(2.0, requires_grad=True)
w = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

print(f"输入 x: {x}")
print(f"权重 w: {w}")
print(f"偏置 b: {b}")

# Forward pass: z = x * w + b
z = x * w + b
print(f"\n前向传播结果 z: {z}")

# Loss function (a simple squared error against a fixed target)
target = torch.tensor(10.0)
loss = (z - target) ** 2
print(f"目标值: {target}")
print(f"损失值: {loss}")

# Backward pass: populates .grad on every leaf tensor that requires grad
loss.backward()

# Inspect the gradients computed by autograd
print(f"\n梯度信息:")
print(f"∂loss/∂x = {x.grad}")
print(f"∂loss/∂w = {w.grad}")
print(f"∂loss/∂b = {b.grad}")

# Verify the gradients by hand (chain rule):
# loss = (z - target)^2 = (x*w + b - target)^2
# ∂loss/∂x = 2*(x*w + b - target) * w = 2*(z - target) * w
# ∂loss/∂w = 2*(x*w + b - target) * x = 2*(z - target) * x
# ∂loss/∂b = 2*(x*w + b - target) * 1 = 2*(z - target)

# The check only needs plain numbers, so compute it with autograd tracking disabled;
# otherwise these tensors would themselves require grad and extend the graph.
with torch.no_grad():
    manual_grad_x = 2 * (z - target) * w
    manual_grad_w = 2 * (z - target) * x
    manual_grad_b = 2 * (z - target)

print(f"\n手动计算的梯度:")
print(f"∂loss/∂x = {manual_grad_x}")
print(f"∂loss/∂w = {manual_grad_w}")
print(f"∂loss/∂b = {manual_grad_b}")

print(f"\n梯度计算正确: {torch.allclose(x.grad, manual_grad_x) and torch.allclose(w.grad, manual_grad_w) and torch.allclose(b.grad, manual_grad_b)}")


=== 反向传播和自动求导演示 ===
输入 x: 2.0
权重 w: 3.0
偏置 b: 1.0

前向传播结果 z: 7.0
目标值: 10.0
损失值: 9.0

梯度信息:
∂loss/∂x = -18.0
∂loss/∂w = -12.0
∂loss/∂b = -6.0

手动计算的梯度:
∂loss/∂x = -18.0
∂loss/∂w = -12.0
∂loss/∂b = -6.0

梯度计算正确: True


## 8. 完整的训练循环演示

演示一个完整的深度学习模型训练过程。


In [11]:
print("=== 完整训练循环演示 ===")

# A simple regression model
class SimpleModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        super().__init__()
        # Sub-modules are registered in assignment order, which is also the order
        # in which they appear when the model is printed.
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (..., input_dim) to a tensor of shape (..., output_dim)."""
        hidden = self.relu(self.linear1(x))
        # No activation on the output layer: the model predicts unbounded real values.
        return self.linear2(hidden)

# Instantiate the model (2 inputs -> 10 hidden units -> 1 output)
model = SimpleModel(input_dim=2, hidden_dim=10, output_dim=1)
print(f"模型: {model}")

# Loss function (mean squared error) and optimizer (SGD with learning rate 0.01)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Generate simple training data (y = 2*x1 + 3*x2 + 1 + noise)
# NOTE: this seed is set after the model above was constructed, so it fixes the
# training data but not the model's initial parameters.
torch.manual_seed(42)
X_train = torch.randn(100, 2)
y_train = 2 * X_train[:, 0] + 3 * X_train[:, 1] + 1 + 0.1 * torch.randn(100)
y_train = y_train.unsqueeze(1)  # add a trailing dimension: (100,) -> (100, 1), matching the model output

print(f"训练数据 X shape: {X_train.shape}")
print(f"训练数据 y shape: {y_train.shape}")

# Training loop: every epoch is one full-batch gradient step over all of X_train
num_epochs = 100
losses = []  # per-epoch loss values, recorded as Python floats

for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    # Backward pass
    optimizer.zero_grad()  # reset the gradients left over from the previous step
    loss.backward()        # compute gradients
    optimizer.step()       # update the parameters
    
    losses.append(loss.item())
    
    # Log progress every 20 epochs
    if (epoch + 1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f"\n最终损失: {losses[-1]:.4f}")

# Test the trained model on a single hand-picked input
model.eval()  # switch to evaluation mode
# Gradients are not needed for inference, so disable autograd tracking
with torch.no_grad():
    test_input = torch.tensor([[1.0, 2.0]])
    test_output = model(test_input)
    # Noise-free value of the data-generating function y = 2*x1 + 3*x2 + 1 at (1, 2)
    expected = 2 * 1.0 + 3 * 2.0 + 1  # should be 9
    print(f"\n测试输入: {test_input}")
    print(f"模型预测: {test_output.item():.4f}")
    print(f"期望输出: {expected:.4f}")
    print(f"预测误差: {abs(test_output.item() - expected):.4f}")


=== 完整训练循环演示 ===
模型: SimpleModel(
  (linear1): Linear(in_features=2, out_features=10, bias=True)
  (linear2): Linear(in_features=10, out_features=1, bias=True)
  (relu): ReLU()
)
训练数据 X shape: torch.Size([100, 2])
训练数据 y shape: torch.Size([100, 1])
Epoch [20/100], Loss: 9.0226
Epoch [40/100], Loss: 1.9250
Epoch [60/100], Loss: 0.2798
Epoch [80/100], Loss: 0.1892
Epoch [100/100], Loss: 0.1777

最终损失: 0.1777

测试输入: tensor([[1., 2.]])
模型预测: 7.9833
期望输出: 9.0000
预测误差: 1.0167
