[torchvision.transforms 文档](https://pytorch.org/vision/stable/transforms.html)

# Torchvision 图像分类数据增强

支持常见的计算机视觉转换、变换。可用于转换或增强数据，以训练或推理不同的任务（图像分类、检测、分割、视频分类）

In [4]:
import torch
from torchvision.transforms import v2

# 定义图像的高度和宽度
H, W = 32, 32

# 创建一个随机的 RGB 图像张量，大小为 3x32x32，数据类型为 uint8
# 红、绿、蓝通道，每个像素值在 0 到 255 之间随机生成
img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8)

# 定义图像变换组合
# 使用 v2.Compose 将多个变换操作串联在一起
transforms = v2.Compose([
    # 随机裁剪并缩放图像到指定大小 (224x224)
    # RandomResizedCrop 会在随机的位置裁剪图像的一部分，然后将其缩放到指定大小
    # antialias=True 表示在缩放时使用抗锯齿功能，减少图像失真
    v2.RandomResizedCrop(size=(224, 224), antialias=True),

    # 随机水平翻转图像
    # p=0.5 表示翻转的概率为 50%，可以增强数据的多样性
    v2.RandomHorizontalFlip(p=0.5),

    # 将张量的数据类型转换为 float32
    # scale=True 表示对像素值进行线性缩放，将其从 [0, 255] 的范围归一化到 [0, 1] 的范围
    v2.ToDtype(torch.float32, scale=True),

    # 对图像进行标准化（归一化）操作
    # 使用预定义的均值和标准差，通常是针对 ImageNet 数据集预训练模型的值
    # mean=[0.485, 0.456, 0.406] 分别对应 RGB 通道的均值
    # std=[0.229, 0.224, 0.225] 分别对应 RGB 通道的标准差
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# 应用变换组合到输入图像
# 将变换后的图像存储到变量 img 中
img = transforms(img)

In [6]:
# 检测（复用之前的导入和变换）
from torchvision import tv_tensors

# 创建一个随机的 RGB 图像张量，大小为 3x32x32，数据类型为 uint8
# 红、绿、蓝通道，每个像素值在 0 到 255 之间随机生成
img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8)

# 创建随机的边界框坐标
# randint 生成 0 到 H // 2 之间的随机整数，size=(3, 4) 表示生成 3 个边界框，每个边界框有 4 个坐标
boxes = torch.randint(0, H // 2, size=(3, 4))
# 调整边界框坐标，确保它们代表左上角和右下角的坐标 (XYXY 格式)
boxes[:, 2:] += boxes[:, :2]
# 将边界框坐标转换为 tv_tensors.BoundingBoxes 对象
# format="XYXY" 指定边界框格式为 XYXY，canvas_size=(H, W) 指定图像尺寸
boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W))

# 应用相同的变换到图像和边界框
# transforms 是之前定义的图像变换组合
img, boxes = transforms(img, boxes)

# 可以传递任意数据结构，例如字典
# 创建一个字典，包含 "image" 和 "boxes" 键，分别存储图像和边界框
# 将字典传递给 transforms，应用相同的变换
output_dict = transforms({"image": img, "boxes": boxes})

[v2 API](https://pytorch.org/vision/stable/transforms.html#v2-api-ref)

* 上周的MNIST结果为例

使用 `torchvision.transforms` 实现数据增强，并记录效果


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
'''
当前模型是一个简单的全连接网络，
'''
# Load MNIST data from local CSV files
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Prepare the data
train_images = train_data.iloc[:, 1:].values.reshape(-1, 28, 28) / 255.0
train_labels = train_data.iloc[:, 0].values
test_images = test_data.values.reshape(-1, 28, 28) / 255.0

# Convert to PyTorch tensors
train_images = torch.tensor(train_images, dtype=torch.float32).unsqueeze(1)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_images = torch.tensor(test_images, dtype=torch.float32).unsqueeze(1)

train_dataset = torch.utils.data.TensorDataset(train_images, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_images, torch.zeros(test_images.size(0), dtype=torch.long))

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28*28)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = SimpleNN()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

model.eval()
predicted_labels = []
with torch.no_grad():
    for data, _ in test_loader:
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
        predicted_labels.extend(predicted.numpy())



Epoch [1/10], Loss: 0.3956
Epoch [2/10], Loss: 0.1610
Epoch [3/10], Loss: 0.1103
Epoch [4/10], Loss: 0.0843
Epoch [5/10], Loss: 0.0663
Epoch [6/10], Loss: 0.0512
Epoch [7/10], Loss: 0.0423
Epoch [8/10], Loss: 0.0330
Epoch [9/10], Loss: 0.0287
Epoch [10/10], Loss: 0.0218


## 数据增强

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
from PIL import Image

# 定义数据增强的转换
transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor()
])

# Load MNIST data from local CSV files
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Convert to PyTorch tensors and apply transformations
train_images = train_data.iloc[:, 1:].values.reshape(-1, 28, 28)
train_labels = train_data.iloc[:, 0].values
test_images = test_data.values.reshape(-1, 28, 28)

# 将 NumPy 数组转换为 PIL 图像，然后应用数据增强
train_images = [transform(Image.fromarray((image * 255).astype(np.uint8))) for image in train_images]
test_images = [transform(Image.fromarray((image * 255).astype(np.uint8))) for image in test_images]

train_images = torch.stack(train_images)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_images = torch.stack(test_images)

train_dataset = torch.utils.data.TensorDataset(train_images, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_images, torch.zeros(test_images.size(0), dtype=torch.long))

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28*28)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = SimpleNN()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

model.eval()
predicted_labels = []
with torch.no_grad():
    for data, _ in test_loader:
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
        predicted_labels.extend(predicted.numpy())



Epoch [1/10], Loss: 1.0481
Epoch [2/10], Loss: 0.7296
Epoch [3/10], Loss: 0.6267
Epoch [4/10], Loss: 0.5442
Epoch [5/10], Loss: 0.4720
Epoch [6/10], Loss: 0.4039
Epoch [7/10], Loss: 0.3428
Epoch [8/10], Loss: 0.2884
Epoch [9/10], Loss: 0.2385
Epoch [10/10], Loss: 0.1939
