In [7]:
!nvidia-smi

Thu Feb 27 11:46:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             34W /   70W |     170MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

**梯度检查
题目：检查模型中某些层的梯度是否正常更新。**

In [8]:
import torch

# 创建一个简单的模型
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 创建输入和目标
input = torch.randn(1, 10, requires_grad=True)
target = torch.tensor([[1.0]])

# 前向传播
output = model(input)
loss = criterion(output, target)

# 反向传播
loss.backward()

# 检查梯度
for name, param in model.named_parameters():
    print(f"参数 {name} 的梯度：", param.grad)

参数 weight 的梯度： tensor([[-2.7934,  3.3818, -7.3518,  0.8378,  2.2146, -0.0750, -3.9112,  3.3398,
         -0.3331,  2.5052]])
参数 bias 的梯度： tensor([-3.0986])


**动态图操作
题目：使用 torch.autograd 动态计算梯度。**

In [9]:
x = torch.tensor(2.0, requires_grad=True)
y = x ** 2 + 3 * x + 1

# 计算梯度
y.backward()

# 查看梯度
print("x 的梯度：", x.grad)  # 输出 7.0

x 的梯度： tensor(7.)


**模型调试与检查
题目：检查模型的每一层输出的形状。**

In [11]:
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(1, 10, kernel_size=5),  # 输出尺寸：[batch, 10, 24, 24]
    nn.MaxPool2d(2),                  # 输出尺寸：[batch, 10, 12, 12]
    nn.Flatten(),                     # 展平后尺寸：[batch, 10 * 12 * 12] = [batch, 1440]
    nn.Linear(1440, 50),              # 修正输入维度为 1440
    nn.ReLU(),
    nn.Linear(50, 10)
)

# 创建一个虚拟输入
input = torch.randn(1, 1, 28, 28)

# 检查每一层的输出形状
with torch.no_grad():
    for layer in model:
        input = layer(input)
        print(f"层 {layer.__class__.__name__} 的输出形状：", input.shape)

层 Conv2d 的输出形状： torch.Size([1, 10, 24, 24])
层 MaxPool2d 的输出形状： torch.Size([1, 10, 12, 12])
层 Flatten 的输出形状： torch.Size([1, 1440])
层 Linear 的输出形状： torch.Size([1, 50])
层 ReLU 的输出形状： torch.Size([1, 50])
层 Linear 的输出形状： torch.Size([1, 10])


**自定义优化器
题目：实现一个简单的自定义优化器（如梯度下降）。
**

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class CustomOptimizer:
    def __init__(self, params, lr=0.01):
        self.params = list(params)
        self.lr = lr

    def step(self):
        with torch.no_grad():
            for param in self.params:
                param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

# 使用自定义优化器
model = nn.Linear(10, 1)
optimizer = CustomOptimizer(model.parameters(), lr=0.01)

input = torch.randn(1, 10)
target = torch.tensor([[1.0]])

output = model(input)
loss = F.mse_loss(output, target)
loss.backward()

optimizer.step()
optimizer.zero_grad()

**练习 1: 自定义 Autograd Function
目标: 实现一个自定义的激活函数 LeakySwish：f(x) = x * sigmoid(x) + 0.1*x**

In [21]:
import torch
from torch.autograd import Function

class LeakySwish(Function):
    @staticmethod
    def forward(ctx, x):
        sigmoid = 1 / (1 + torch.exp(-x))
        ctx.save_for_backward(x, sigmoid)
        return x * sigmoid + 0.1 * x

    @staticmethod
    def backward(ctx, grad_output):
        x, sigmoid = ctx.saved_tensors
        sigmoid_grad = sigmoid * (1 - sigmoid)
        dx = sigmoid + x * sigmoid_grad + 0.1
        return grad_output * dx

# 测试
x = torch.randn(3, requires_grad=True)
y = LeakySwish.apply(x)
y.backward(torch.ones_like(y))
print(x.grad)  # 应显示自动计算的梯度

tensor([0.0019, 0.9535, 0.8470])


**练习 2: 混合精度训练
目标: 实现自动混合精度训练循环**

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# 定义设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 数据预处理
transform = transforms.Compose([
    transforms.ToTensor(),  # 将图片转换为Tensor
    transforms.Normalize((0.5,), (0.5,))  # 归一化
])

# 加载数据集
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# 定义一个简单的模型
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # 展平
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleNet().to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 混合精度训练
scaler = torch.cuda.amp.GradScaler()

# 训练循环
model.train()
for epoch in range(5):  # 训练 5 个 epoch
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():  # 使用混合精度
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()  # 缩放损失并反向传播
        scaler.unscale_(optimizer)  # 取消缩放
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # 梯度裁剪
        scaler.step(optimizer)  # 更新优化器
        scaler.update()  # 更新缩放器

    print(f"Epoch [{epoch + 1}/5] completed.")

print("训练完成！")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 16.1MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 472kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 4.42MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.19MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():  # 使用混合精度


Epoch [1/5] completed.
Epoch [2/5] completed.
Epoch [3/5] completed.
Epoch [4/5] completed.
Epoch [5/5] completed.
训练完成！


**练习 3: 自定义内存高效的注意力机制
目标: 实现一个分块的注意力计算**

In [1]:
import torch

def memory_efficient_attention(Q, K, V, chunk_size=64):
    batch, heads, seq_len, dim = Q.shape
    out = torch.zeros_like(V)
    for i in range(0, seq_len, chunk_size):
        Q_chunk = Q[:, :, i:i+chunk_size]
        attn = torch.einsum('bhid,bhjd->bhij', Q_chunk, K)
        attn = torch.softmax(attn / dim**0.5, dim=-1)
        out[:, :, i:i+chunk_size] = torch.einsum('bhij,bhjd->bhid', attn, V)
    return out

# 测试
Q = torch.randn(2, 4, 1024, 64).cuda()
K = V = torch.randn_like(Q)
output = memory_efficient_attention(Q, K, V)

**练习 4: 动态计算图操作
目标: 在训练过程中动态修改计算图**

In [3]:
class DynamicGate(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.threshold = 0.5

    def forward(self, x):
        mask = (x > self.threshold).float()
        # 动态修改反向传播行为
        def backward_hook(grad):
            # 梯度超过阈值时放大梯度
            return grad * (1 + mask)
        x.register_hook(backward_hook)
        return x * mask

# 测试
gate = DynamicGate()
x = torch.rand(3, requires_grad=True)
y = gate(x)
loss = y.sum()
loss.backward()
print(x.grad)  # 观察自定义梯度行为

tensor([2., 0., 0.])


**练习 5: 梯度裁剪的高级形式**

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

# 定义一个简单的模型
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=5),  # 输出尺寸：24x24
            nn.ReLU(),
            nn.MaxPool2d(2),  # 输出尺寸：12x12
            nn.Flatten()
        )
        self.classifier = nn.Sequential(
            nn.Linear(1440, 50),  # 修正输入维度为 1440
            nn.ReLU(),
            nn.Linear(50, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# 创建模型
model = CustomModel()

# 定义优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 定义梯度裁剪参数
params = [
    {"params": model.features.parameters(), "max_norm": 1.0},
    {"params": model.classifier.parameters(), "max_norm": 0.5}
]

# 模拟训练过程
input = torch.randn(64, 1, 28, 28)  # 假设输入是 MNIST 数据集
target = torch.randint(0, 10, (64,))  # 随机生成目标标签
criterion = nn.CrossEntropyLoss()

# 前向传播
output = model(input)
loss = criterion(output, target)

# 反向传播
optimizer.zero_grad()
loss.backward()

# 对每个参数组分别进行梯度裁剪
for group in params:
    torch.nn.utils.clip_grad_norm_(group["params"], max_norm=group["max_norm"], norm_type=2)

# 更新优化器
optimizer.step()

print("梯度裁剪完成！")

梯度裁剪完成！


In [2]:
import torch

data = torch.tensor([[1,2],[3,4]], dtype=torch.float32)
data

tensor([[1., 2.],
        [3., 4.]])

In [3]:
import numpy as np

np_array = np.array([[1,2],[3,4]])
data2 = torch.from_numpy(np_array)
data2

tensor([[1, 2],
        [3, 4]])

In [4]:
data2.dtype

torch.int64

In [5]:
# 通过已知张量维度，创建新张量
data3 = torch.rand_like(data2, dtype=torch.float)
data3

tensor([[0.7126, 0.7690],
        [0.1867, 0.5486]])

In [6]:
shape = (2,3,)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Random Tensor: 
 tensor([[0.8193, 0.4476, 0.9758],
        [0.2281, 0.4020, 0.9419]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [7]:
# 基于现有tensor构建，但使用新值填充
m = torch.ones(5,3, dtype=torch.double)
n = torch.rand_like(m, dtype=torch.float)

# 获取tensor的大小
print(m.size()) # torch.Size([5,3])

# 均匀分布
print(torch.rand(5,3))
# 标准正态分布
print(torch.randn(5,3))
# 离散正态分布
print(torch.normal(mean=.0,std=1.0,size=(5,3)))
# 线性间隔向量(返回一个1维张量，包含在区间start和end上均匀间隔的steps个点)
print(torch.linspace(start=1,end=10,steps=21))

torch.Size([5, 3])
tensor([[0.6518, 0.2467, 0.4258],
        [0.6910, 0.5011, 0.0290],
        [0.0081, 0.4381, 0.4377],
        [0.3568, 0.6890, 0.5959],
        [0.1497, 0.2263, 0.6815]])
tensor([[-0.2172, -0.4301,  1.1281],
        [ 0.8829,  1.7818, -0.4615],
        [ 0.3353, -0.8969, -1.9054],
        [-1.6914, -0.5239,  0.4264],
        [-1.6923,  0.9537,  1.3282]])
tensor([[ 0.1141, -1.4953, -1.2681],
        [-1.7484, -0.1983, -0.3544],
        [-1.1363,  0.1545, -1.4512],
        [ 0.1336, -0.4788,  1.0420],
        [ 1.7160, -0.7597,  0.3555]])
tensor([ 1.0000,  1.4500,  1.9000,  2.3500,  2.8000,  3.2500,  3.7000,  4.1500,
         4.6000,  5.0500,  5.5000,  5.9500,  6.4000,  6.8500,  7.3000,  7.7500,
         8.2000,  8.6500,  9.1000,  9.5500, 10.0000])


In [8]:
tensor = torch.rand(3,4)

print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

Shape of tensor: torch.Size([3, 4])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu


In [9]:
# 检查pytorch是否支持GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    tensor = tensor.to(device)

print(tensor)
print(tensor.device)

# mac上没有GPU，使用M系列芯片
if torch.backends.mps.is_available():
    device = torch.device("mps")
    tensor = tensor.to(device)

print(tensor)
print(tensor.device)

tensor([[0.8972, 0.8837, 0.5895, 0.3331],
        [0.3943, 0.7598, 0.6205, 0.2740],
        [0.3034, 0.0081, 0.7375, 0.7037]])
cpu
tensor([[0.8972, 0.8837, 0.5895, 0.3331],
        [0.3943, 0.7598, 0.6205, 0.2740],
        [0.3034, 0.0081, 0.7375, 0.7037]])
cpu


In [10]:
tensor = torch.ones(4, 4)
print('First row: ', tensor[0])
print('First column: ', tensor[:, 0])
print('Last column:', tensor[..., -1])
tensor[:,1] = 0
print(tensor)

First row:  tensor([1., 1., 1., 1.])
First column:  tensor([1., 1., 1., 1.])
Last column: tensor([1., 1., 1., 1.])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])


In [11]:
t1 = torch.cat([tensor, tensor, tensor], dim=1)
print(t1 * 3)
print(t1.shape)

tensor([[3., 0., 3., 3., 3., 0., 3., 3., 3., 0., 3., 3.],
        [3., 0., 3., 3., 3., 0., 3., 3., 3., 0., 3., 3.],
        [3., 0., 3., 3., 3., 0., 3., 3., 3., 0., 3., 3.],
        [3., 0., 3., 3., 3., 0., 3., 3., 3., 0., 3., 3.]])
torch.Size([4, 12])


In [12]:
import torch
tensor = torch.arange(1,10, dtype=torch.float32).reshape(3, 3)

# 计算两个张量之间矩阵乘法的几种方式。 y1, y2, y3 最后的值是一样的 dot
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)

# print(y1)
# print(y2)

y3 = torch.rand_like(tensor)
torch.matmul(tensor, tensor.T, out=y3)
# print(y3)


# 计算张量逐元素相乘的几种方法。 z1, z2, z3 最后的值是一样的。
z1 = tensor * tensor
z2 = tensor.mul(tensor)

z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

print(z1)
print(z3)

tensor([[ 1.,  4.,  9.],
        [16., 25., 36.],
        [49., 64., 81.]])
tensor([[ 1.,  4.,  9.],
        [16., 25., 36.],
        [49., 64., 81.]])


In [13]:
agg = tensor.sum()
agg_item = agg.item()
print(agg_item, type(agg_item))

45.0 <class 'float'>


In [14]:
np_arr = z1.numpy()
np_arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.],
       [49., 64., 81.]], dtype=float32)

In [15]:
print(tensor, "\n")
tensor.add_(5)
# tensor = tensor + 5
# tensor += 5
print(tensor)

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]]) 

tensor([[ 6.,  7.,  8.],
        [ 9., 10., 11.],
        [12., 13., 14.]])


In [16]:
tensor

tensor([[ 6.,  7.,  8.],
        [ 9., 10., 11.],
        [12., 13., 14.]])