In [1]:
# Automatically report the execution time of every cell
%load_ext autotime

# Render matplotlib figures inline in the notebook output
%matplotlib inline
%config InlineBackend.figure_format='svg' # vector-graphics output so plots render more sharply

# (Re)load the TensorBoard notebook extension
%reload_ext tensorboard

time: 300 ms (started: 2021-09-01 17:45:43 +08:00)


In [None]:
%%bash

# Stage the notebooks in the current directory and one directory level below it
git add *.ipynb */*.ipynb

# Show the configured remotes
git remote -v

# Commit the staged notebooks
git commit -m '更新 #1 Sept 01, 2021'

# Explicit remote/branch form, kept for reference (disabled):
#git push origin master
# Push using git's default remote/branch settings
git push

In [21]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST, CIFAR10
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, LightningDataModule
import torchmetrics

# Root directory for downloaded datasets; override with the PATH_DATASETS env var.
PATH_DATASETS = os.environ.get('PATH_DATASETS', '.')
# Use at most one GPU, and none when CUDA reports no devices.
AVAIL_GPUS = min(1, torch.cuda.device_count())
# Larger batches when a GPU is available.
BATCH_SIZE = 256 if AVAIL_GPUS else 64
# Number of DataLoader worker processes: up to 24, but never more than the
# CPUs this machine reports, so smaller machines are not oversubscribed.
# os.cpu_count() may return None, hence the fallback to 1.
num_workers = min(24, os.cpu_count() or 1)

time: 2.22 ms (started: 2021-09-01 18:05:38 +08:00)


# 定义 LitMNISTModel

下面，我们重用了 hello world 教程中的 LightningModule，它对 MNIST 手写数字进行分类。

不幸的是，我们在模型中硬编码了特定于数据集的项目，永远限制它使用 MNIST 数据。 😢

如果您不打算在不同的数据集上训练/评估您的模型，这很好。 但是，在许多情况下，当您想使用不同的数据集尝试您的架构时，这会变得很麻烦。

In [8]:
class LitMNIST(LightningModule):
    """MLP classifier for MNIST that also owns its data pipeline.

    Dataset-specific details (input dims, number of classes, transforms, the
    download/split logic and the dataloaders) are hard-wired into this module,
    so it can only be trained on MNIST.

    Args:
        data_dir: Directory MNIST is downloaded to and loaded from.
        hidden_size: Width of the two hidden linear layers.
        learning_rate: Learning rate passed to Adam.
        batch_size: Batch size used by the train/val/test dataloaders.
            Defaults to 128, the value that was previously hard-coded.
    """

    def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4, batch_size=128):

        super().__init__()

        # We hardcode dataset specific stuff here.
        self.data_dir = data_dir
        self.num_classes = 10
        self.dims = (1, 28, 28)
        channels, width, height = self.dims
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            # presumably the MNIST per-pixel mean / std -- TODO confirm
            transforms.Normalize((0.1307, ), (0.3081, )),
        ])

        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Build model: flatten -> 2 x (Linear, ReLU, Dropout) -> Linear head
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels * width * height, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, self.num_classes),
        )

    def forward(self, x):
        """Return log-probabilities over the classes (log_softmax along dim 1)."""
        x = self.model(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        """Return the NLL loss for one ``(x, y)`` training batch."""
        x, y = batch
        logits = self(x)  # log-probabilities, despite the name
        loss = F.nll_loss(logits, y)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_acc`` for one ``(x, y)`` batch and return the loss."""
        x, y = batch
        logits = self(x)  # log-probabilities, despite the name
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = torchmetrics.functional.accuracy(preds, y)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.learning_rate``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    ####################
    # DATA RELATED HOOKS
    ####################

    def prepare_data(self):
        """Download the MNIST train and test splits into ``self.data_dir``."""
        # download
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` ('fit', 'test', or None for both)."""

        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def _dataloader(self, dataset):
        """Wrap ``dataset`` in a DataLoader with the configured batch size."""
        # num_workers is the notebook-level setting, read at call time.
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=num_workers)

    def train_dataloader(self):
        """Return the DataLoader over the training split created in ``setup``."""
        return self._dataloader(self.mnist_train)

    def val_dataloader(self):
        """Return the DataLoader over the validation split created in ``setup``."""
        return self._dataloader(self.mnist_val)

    def test_dataloader(self):
        """Return the DataLoader over the test set created in ``setup``."""
        return self._dataloader(self.mnist_test)

time: 1.99 ms (started: 2021-09-01 17:50:14 +08:00)


# 训练 LitMNIST 模型

In [9]:
# Train the self-contained MNIST model for two epochs. No datamodule is
# passed to fit(), so the data hooks defined on LitMNIST itself are used.
model = LitMNIST()
trainer = Trainer(gpus=AVAIL_GPUS, max_epochs=2, progress_bar_refresh_rate=20)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 55.1 K
-------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.220     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

time: 14.7 s (started: 2021-09-01 17:50:15 +08:00)


# 使用数据模块

DataModules 是一种将数据相关挂钩与 LightningModule 分离的方法，因此您可以开发与数据集无关的模型。

# 定义 MNISTDataModule

让我们逐一回顾下面这个类中的每个函数，并讨论它们的作用：

1. `__init__`
    * 接收一个 `data_dir` 参数，该参数指向您已下载/希望下载 MNIST 数据集的位置。
    * 定义将应用于训练、验证和测试数据集拆分的转换。
    * 定义默认的 `self.dims`，它是一个从 `datamodule.size()` 返回的元组，可以帮助您初始化模型。
2. `prepare_data` 
    * 这是我们可以下载数据集的地方。 我们指向我们想要的数据集，如果在那里找不到数据集，就要求 torchvision 的 MNIST 数据集类下载。 
    * 请注意，我们没有在此函数中进行任何状态分配（即 `self.something = ...`）
3. `setup`  
    * 从文件加载数据并为每个分割（训练、验证、测试）准备 PyTorch 张量数据集。
    * `setup` 需要一个 `stage` 参数，用于分离 `fit` 和 `test` 的逻辑。
    * 如果您不介意一次加载所有数据集，您可以设置一个条件，以允许在 None 传递到 stage 时运行“fit”相关设置和“test”相关设置。
    * 请注意，这会在所有 GPU 上运行，并且在此处进行状态分配*是*安全的
4. `x_dataloader`  
    * train_dataloader()、val_dataloader() 和 test_dataloader() 都返回 PyTorch DataLoader 实例，这些实例是通过包装我们在 setup() 中准备的各自数据集创建的

In [16]:
class MNISTDataModule(LightningDataModule):
    """DataModule that downloads, splits and serves the MNIST dataset.

    Args:
        data_dir: Directory MNIST is downloaded to and loaded from.
    """

    def __init__(self, data_dir: str = PATH_DATASETS):
        super().__init__()
        self.data_dir = data_dir

        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize((0.1307, ), (0.3081, ))
        self.transform = transforms.Compose([to_tensor, normalize])

        # self.dims is what dm.size() returns.
        # The default dims are set here because they are known up front;
        # they could alternatively be assigned dynamically in dm.setup().
        self.dims = (1, 28, 28)
        self.num_classes = 10

    def prepare_data(self):
        """Download the train and test splits; no state is assigned here."""
        for is_train in (True, False):
            MNIST(self.data_dir, train=is_train, download=True)

    def setup(self, stage=None):
        """Build the datasets for ``stage`` ('fit', 'test', or None for both)."""
        # Train/val datasets used by the fit-time dataloaders
        if stage == 'fit' or stage is None:
            full_train = MNIST(self.data_dir, train=True, transform=self.transform)
            splits = random_split(full_train, [55000, 5000])
            self.mnist_train, self.mnist_val = splits

        # Test dataset used by the test dataloader
        if stage == 'test' or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def _make_loader(self, dataset):
        """Wrap ``dataset`` with the notebook-level BATCH_SIZE / num_workers."""
        return DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=num_workers)

    def train_dataloader(self):
        """DataLoader over the training split created in ``setup``."""
        return self._make_loader(self.mnist_train)

    def val_dataloader(self):
        """DataLoader over the validation split created in ``setup``."""
        return self._make_loader(self.mnist_val)

    def test_dataloader(self):
        """DataLoader over the test set created in ``setup``."""
        return self._make_loader(self.mnist_test)
time: 1.59 ms (started: 2021-09-01 17:58:48 +08:00)


# 定义与数据集无关的 LitModel

下面，我们定义与我们之前制作的 LitMNIST 模型相同的模型。

但是，这次我们的模型可以自由使用我们想要的任何输入数据🔥。

In [15]:
class LitModel(LightningModule):
    """Dataset-agnostic MLP classifier.

    The input dimensions and number of classes are constructor arguments, so
    the network is sized dynamically instead of being tied to one dataset.

    Args:
        channels, width, height: Input dimensions; their product is the
            number of features after flattening.
        num_classes: Size of the output layer.
        hidden_size: Width of the two hidden linear layers.
        learning_rate: Learning rate passed to Adam.
    """

    def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):
        super().__init__()

        # Keep the constructor arguments on the instance.
        self.channels, self.width, self.height = channels, width, height
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        # flatten -> 2 x (Linear, ReLU, Dropout) -> Linear classification head
        flat_features = channels * width * height
        layers = [
            nn.Flatten(),
            nn.Linear(flat_features, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_classes),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return log-probabilities over the classes (log_softmax along dim 1)."""
        return F.log_softmax(self.model(x), dim=1)

    def training_step(self, batch, batch_idx):
        """Return the NLL loss for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        return F.nll_loss(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` for one batch and return the loss."""
        inputs, targets = batch
        log_probs = self(inputs)
        val_loss = F.nll_loss(log_probs, targets)
        val_acc = torchmetrics.functional.accuracy(log_probs.argmax(dim=1), targets)
        self.log('val_loss', val_loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        return val_loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.learning_rate``."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

time: 1.39 ms (started: 2021-09-01 17:58:24 +08:00)


# 使用 MNISTDataModule 训练 LitModel

现在，我们使用 MNISTDataModule 的配置设置和数据加载器初始化和训练 LitModel。

In [17]:
# Initialise the datamodule
dm = MNISTDataModule()
# Build the model from the datamodule's attributes. `dm.dims` is read directly
# because `dm.size()` was deprecated in Lightning 1.5 and removed in 1.7;
# both yield the same (channels, width, height) tuple.
model = LitModel(*dm.dims, dm.num_classes)
# Init trainer
trainer = Trainer(
    max_epochs=3,
    progress_bar_refresh_rate=20,
    gpus=AVAIL_GPUS,
)
# Pass the datamodule to trainer.fit so it supplies the data hooks :)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 55.1 K
-------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.220     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

time: 15.5 s (started: 2021-09-01 18:02:26 +08:00)


# 定义 CIFAR10 数据模块

让我们通过为 CIFAR10 数据集定义一个新的数据模块来证明我们之前制作的 LitModel 与数据集无关。

In [19]:
class CIFAR10DataModule(LightningDataModule):
    """DataModule that downloads, splits and serves the CIFAR10 dataset.

    Args:
        data_dir: Directory CIFAR10 is downloaded to and loaded from.
            Defaults to ``PATH_DATASETS`` (same default as the MNIST classes),
            so the PATH_DATASETS environment variable is honoured here too.
    """

    def __init__(self, data_dir: str = PATH_DATASETS):
        super().__init__()
        self.data_dir = data_dir
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            # Normalise each of the three channels with mean 0.5 and std 0.5
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ])

        # Input dims and class count, unpacked into LitModel's constructor by callers
        self.dims = (3, 32, 32)
        self.num_classes = 10

    def prepare_data(self):
        """Download the CIFAR10 train and test splits into ``self.data_dir``."""
        # download
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` ('fit', 'test', or None for both)."""

        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
            self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        """Return the DataLoader over the training split created in ``setup``."""
        return DataLoader(self.cifar_train, batch_size=BATCH_SIZE, num_workers=num_workers)

    def val_dataloader(self):
        """Return the DataLoader over the validation split created in ``setup``."""
        return DataLoader(self.cifar_val, batch_size=BATCH_SIZE, num_workers=num_workers)

    def test_dataloader(self):
        """Return the DataLoader over the test set created in ``setup``."""
        return DataLoader(self.cifar_test, batch_size=BATCH_SIZE, num_workers=num_workers)

time: 1.39 ms (started: 2021-09-01 18:05:08 +08:00)


# 使用 CIFAR10DataModule 训练 LitModel

我们的模型不是很好，所以它在 CIFAR10 数据集上的表现会很糟糕。

这里的重点是我们可以看到我们的 LitModel 使用不同的数据模块作为其输入数据没有问题。

In [24]:
# Same LitModel, now fed by the CIFAR10 datamodule. `dm.dims` is read directly
# because `dm.size()` was deprecated in Lightning 1.5 and removed in 1.7;
# both yield the same (channels, width, height) tuple.
dm = CIFAR10DataModule()
model = LitModel(*dm.dims, dm.num_classes, hidden_size=256)
trainer = Trainer(
    max_epochs=5,
    progress_bar_refresh_rate=20,
    gpus=AVAIL_GPUS,
)
trainer.fit(model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Using downloaded and verified file: ./cifar-10-python.tar.gz
Extracting ./cifar-10-python.tar.gz to ./
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 855 K 
-------------------------------------
855 K     Trainable params
0         Non-trainable params
855 K     Total params
3.420     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

time: 29.4 s (started: 2021-09-01 18:14:15 +08:00)


# 恭喜 - 是时候加入社区了！

恭喜您完成本笔记本教程！如果您喜欢本教程并想加入 Lightning 社区，可以通过以下方式参与！