In [1]:
#%pip install pytorch-lightning

In [1]:
import torch
# Report which device this notebook will run on.
if not torch.cuda.is_available():
    print("CUDA is not available. Running on CPU.")
else:
    # Only the first GPU (index 0) is queried.
    device_properties = torch.cuda.get_device_properties(0)
    device_name = device_properties.name
    print(f"GPU Device Name: {device_name}")

GPU Device Name: NVIDIA GeForce RTX 2080 Ti


### 简介
用 VGG-11 训练 Fashion-MNIST 数据集。

对 VGG-11 模型的调整:

1. 5 个卷积块的输出通道数均缩小为原来的 1/4(即 N → N/4)。

In [2]:
import torch
import pytorch_lightning as pl

### 数据集加载

In [3]:
from torchvision.datasets import FashionMNIST
from torchvision import transforms

class DataConfiguration:
    """Plain holder for the DataLoader settings used by the data module.

    Attributes:
        batch_size: forwarded to ``DataLoader(batch_size=...)``.
        num_workers: forwarded to ``DataLoader(num_workers=...)``.
        pin_memory: forwarded to ``DataLoader(pin_memory=...)``; intended to be
            True when a GPU is available.
    """

    def __init__(self, batch_size, num_workers, pin_memory):
        # Store the three settings verbatim; no validation is performed here.
        (self.batch_size,
         self.num_workers,
         self.pin_memory) = (batch_size, num_workers, pin_memory)

class LitLoadData_FashionMNist(pl.LightningDataModule):
    """LightningDataModule that serves FashionMNIST resized to 224x224.

    The official training split is used for training and the official test
    split (``train=False``) is used for validation.

    Args:
        data_config: object exposing ``batch_size``, ``num_workers`` and
            ``pin_memory`` attributes (see ``DataConfiguration``).
    """

    def __init__(self, data_config):
        super().__init__()
        self.data_config = data_config

    def prepare_data(self):
        """Download both FashionMNIST splits into ``./data`` if they are missing."""
        FashionMNIST(root="./data", train=True, download=True)
        FashionMNIST(root="./data", train=False, download=True)

    def setup(self, stage=None):
        """Build the train/validation datasets with the shared preprocessing pipeline."""
        transform = transforms.Compose([
            transforms.Resize((224, 224)),          # upscale images to 224x224
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),   # map [0, 1] pixel values to [-1, 1]
        ])

        self.train_dataset = FashionMNIST(root="./data", train=True, transform=transform)
        self.val_dataset = FashionMNIST(root="./data", train=False, transform=transform)

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader for ``dataset`` from the stored data configuration."""
        num_workers = self.data_config.num_workers
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=self.data_config.batch_size,
            num_workers=num_workers,
            pin_memory=self.data_config.pin_memory,
            # DataLoader raises ValueError when persistent_workers=True is
            # combined with num_workers=0, so only enable it with real workers.
            persistent_workers=num_workers > 0,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Return the shuffled training DataLoader."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation DataLoader (fixed order)."""
        return self._make_loader(self.val_dataset, shuffle=False)


### 模型定义

In [4]:


from typing import Any

class TrainingConfiguration:
    """Optimisation settings read by the model when it builds its optimizer.

    Attributes:
        learning_rate: learning rate handed to the optimizer.
        optimizer: optimizer name; the model in this notebook compares it
            against "Adam" and "SGD".
    """

    def __init__(self, learning_rate, optimizer):
        # Both values are stored as given.
        self.learning_rate, self.optimizer = learning_rate, optimizer

def make_layers(cfg, batch_norm=False, in_channels=1):
    """Build a VGG-style convolutional feature extractor.

    Args:
        cfg: sequence describing the layers in order. An int ``v`` adds a
            3x3 convolution (padding 1) with ``v`` output channels followed by
            ReLU; the string ``'M'`` adds a 2x2 max-pool with stride 2.
        batch_norm: when True, insert ``BatchNorm2d`` between each convolution
            and its ReLU.
        in_channels: number of channels of the input image. Defaults to 1
            (grayscale), which was previously hard-coded.

    Returns:
        ``torch.nn.Sequential`` containing the layers in ``cfg`` order.
    """
    layers = []
    for v in cfg:
        if v == 'M':
            layers.append(torch.nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        layers.append(torch.nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
        if batch_norm:
            layers.append(torch.nn.BatchNorm2d(v))
        layers.append(torch.nn.ReLU(inplace=True))
        # The next convolution consumes this convolution's output channels.
        in_channels = v
    return torch.nn.Sequential(*layers)
    

# VGG-11 classifier: convolutional blocks followed by a fully connected head
class LitVgg11Model(pl.LightningModule):
    """Width-reduced VGG-11 classifier with 10 output classes.

    Every convolution has a quarter of the channels of standard VGG-11.

    Args:
        training_config: object exposing ``learning_rate`` and ``optimizer``
            ("Adam" or "SGD") attributes (see ``TrainingConfiguration``).
    """

    def __init__(self, training_config):
        super().__init__()
        self.training_config = training_config
        # Standard VGG-11 convolutional layout; 'M' marks a max-pool layer.
        vgg_convlayers = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
        # FashionMNIST is far simpler than ImageNet (VGG-11's original training
        # set), so it does not need as many features: quarter every conv width.
        vgg_convlayers = [v // 4 if isinstance(v, int) else v for v in vgg_convlayers]

        self.model = torch.nn.Sequential(
            make_layers(vgg_convlayers),
            torch.nn.Flatten(),
            # 128 channels (512 // 4) on a 7x7 map; assumes 224x224 inputs,
            # which the five 2x2 max-pools reduce to 7x7.
            torch.nn.Linear(128*7*7, 4096),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(),
            torch.nn.Linear(4096, 4096),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(),
            torch.nn.Linear(4096, 10)
        )

    def forward(self, x):
        """Return the class logits for a batch of images."""
        return self.model(x)

    def _loss_and_accuracy(self, batch):
        """Compute cross-entropy loss and accuracy for one ``(x, y)`` batch."""
        x, y = batch
        logits = self(x)
        loss = torch.nn.functional.cross_entropy(logits, y)
        # Kept as a tensor (no .item()) so logging does not force a
        # host/device synchronisation on every step.
        acc = (torch.argmax(logits, dim=1) == y).float().mean()
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log ``train_loss``/``train_acc`` and return the loss to optimise."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc", acc, prog_bar=True, logger=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss``/``val_acc`` for one validation batch and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", acc, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Create the optimizer named in ``training_config``.

        Raises:
            ValueError: if ``training_config.optimizer`` is neither "Adam" nor "SGD".
        """
        name = self.training_config.optimizer
        lr = self.training_config.learning_rate
        if name == "Adam":
            return torch.optim.Adam(self.parameters(), lr=lr)
        if name == "SGD":
            return torch.optim.SGD(self.parameters(), lr=lr)
        # Previously an unknown name fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported optimizer {name!r}; expected 'Adam' or 'SGD'.")
    
    
    

### 工作流程

In [6]:
%%time 
class TrainerConfiguration:
    """Settings for the Lightning ``Trainer`` built in this cell.

    Attributes:
        max_epochs: value passed to ``Trainer(max_epochs=...)``.
    """

    max_epochs: int

    def __init__(self, max_epochs):
        # Stored as given; no range check is applied.
        setattr(self, "max_epochs", max_epochs)

if __name__ == '__main__':
    # Seed the random number generators so repeated runs are comparable.
    pl.seed_everything(42)

    use_gpu = torch.cuda.is_available()

    data_config = DataConfiguration(batch_size=128, num_workers=3, pin_memory=use_gpu)
    training_config = TrainingConfiguration(learning_rate=0.05, optimizer="SGD")
    trainer_config = TrainerConfiguration(max_epochs=10)

    model = LitVgg11Model(training_config)
    data = LitLoadData_FashionMNist(data_config)

    # TensorBoard logger: scalars are written under tensorBoard-logs/classModel_vgg11_v1.
    tb_logger = pl.loggers.TensorBoardLogger(
        'tensorBoard-logs/', name='classModel_vgg11_v1', default_hp_metric=False
    )

    # Checkpointing: keep the checkpoint with the highest validation accuracy.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        monitor='val_acc',
        dirpath='checkPoint-logs/classModel_vgg11_v1',
        filename='classModel_vgg11_v1_{epoch:02d}_{val_acc:.2f}',
        mode='max',
    )

    trainer = pl.Trainer(
        max_epochs=trainer_config.max_epochs,
        logger=tb_logger,
        callbacks=[checkpoint_callback],
        # Fall back to CPU instead of failing when no GPU is present.
        accelerator='gpu' if use_gpu else 'cpu',
        # `devices` replaces the deprecated `gpus` argument.
        devices=1,
        enable_model_summary=True,
    )

    # Passing the data module lets Lightning call prepare_data()/setup() and
    # build the dataloaders itself.
    trainer.fit(model, datamodule=data)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 43.1 M
-------------------------------------
43.1 M    Trainable params
0         Non-trainable params
43.1 M    Total params
172.373   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

CPU times: total: 1min 3s
Wall time: 46.2 s


In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./tensorBoard-logs/classModel_vgg11_v1  --inspect

ERROR: Failed to launch TensorBoard (exited with 0).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.
Contents of stdout:
Processing event files... (this can take a few minutes)

Found event files in:
./tensorBoard-logs/classModel_vgg11_v1\version_0
./tensorBoard-logs/classModel_vgg11_v1\version_1
./tensorBoard-logs/classModel_vgg11_v1\version_2
./tensorBoard-logs/classModel_vgg11_v1\version_3
./tensorBoard-logs/classModel_vgg11_v1\version_4
./tensorBoard-logs/classModel_vgg11_v1\version_5
./tensorBoard-logs/classModel_vgg11_v1\version_6
./tensorBoard-logs/classModel_vgg11_v1\version_7
./tensorBoard-logs/classModel_vgg11_v1\version_8

These tags are in ./tensorBoard-logs/classModel_vgg11_v1\version_0:
audio -
histograms -
images -
scalars
   epoch
   train_acc_epoch
   train_acc_step
   train_loss_epoch
   train_loss_step
   val_acc
   val_loss
tensor -

Event statistics for ./tensorBoard-logs/classModel_vgg11_v1\version_0:
audio -
graph -
histo

In [None]:
%tensorboard --logdir ./tensorBoard-logs

Reusing TensorBoard on port 6006 (pid 118704), started 1 day, 23:18:20 ago. (Use '!kill 118704' to kill it.)

In [None]:
%pip show torch torchvision

Name: torch
Version: 1.8.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: c:\programdata\anaconda3\envs\remotebase_py3820\lib\site-packages
Requires: numpy, typing_extensions
Required-by: pytorch-lightning, torchaudio, torchmetrics, torchvision
---
Name: torchvision
Version: 0.8.2+cu101
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: c:\programdata\anaconda3\envs\remotebase_py3820\lib\site-packages
Requires: numpy, pillow, torch
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple


^C

Note: you may need to restart the kernel to use updated packages.
