### 简介
用不带 ReLU 激活函数的 LeNet 模型训练 FashionMNIST 数据集。

In [7]:
import torch
import pytorch_lightning as pl
from torchvision.datasets import FashionMNIST
from torchvision import transforms

### 数据集加载

In [8]:

class DataConfiguration:
    """Plain container for the settings forwarded to the DataLoaders.

    Attributes:
        batch_size: value passed as ``batch_size`` to each DataLoader.
        num_workers: value passed as ``num_workers`` to each DataLoader.
        pin_memory: value passed as ``pin_memory``; intended to be True
            when a GPU is available.
    """

    def __init__(self, batch_size, num_workers, pin_memory) -> None:
        # Store all three settings verbatim; no validation is performed.
        (
            self.batch_size,
            self.num_workers,
            self.pin_memory,
        ) = (batch_size, num_workers, pin_memory)

class LitLoadData_FashionMNist(pl.LightningDataModule):
    """LightningDataModule serving FashionMNIST training and validation loaders.

    The ``train=True`` split is used for training and the ``train=False``
    split is used for validation.

    Args:
        data_config: object exposing ``batch_size``, ``num_workers`` and
            ``pin_memory`` attributes (see ``DataConfiguration``).
    """

    # Directory where the dataset is downloaded to and read from.
    _DATA_ROOT = "../data"

    def __init__(self, data_config):
        super().__init__()
        self.data_config = data_config

    def prepare_data(self):
        """Download both FashionMNIST splits if they are not on disk yet."""
        FashionMNIST(root=self._DATA_ROOT, train=True, download=True)
        FashionMNIST(root=self._DATA_ROOT, train=False, download=True)

    def setup(self, stage=None):
        """Create the transformed train/validation datasets.

        Args:
            stage: accepted for Lightning's hook signature; not used here.
        """
        # ToTensor followed by (x - 0.5) / 0.5 maps pixel values to [-1, 1].
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])

        self.train_dataset = FashionMNIST(root=self._DATA_ROOT, train=True, transform=transform)
        self.val_dataset = FashionMNIST(root=self._DATA_ROOT, train=False, transform=transform)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader for ``dataset`` from the stored configuration."""
        cfg = self.data_config
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            num_workers=cfg.num_workers,
            pin_memory=cfg.pin_memory,
            # DataLoader raises ValueError for persistent_workers=True when
            # num_workers == 0, so only keep workers alive when there are any.
            persistent_workers=cfg.num_workers > 0,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Return the shuffled DataLoader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled DataLoader over the validation split."""
        return self._make_loader(self.val_dataset, shuffle=False)


### 模型定义

In [24]:
class TrainingConfiguration:
    """Optimizer settings read by the model's ``configure_optimizers``.

    Attributes:
        lr: learning rate handed to the optimizer.
        optimizer: optimizer name; the model accepts "SGD" or "Adam".
    """
    lr: float=0.001
    optimizer: str="SGD"
    def __init__(self, lr=0.001, optimizer="SGD"):
        # The constructor defaults mirror the class-level defaults above, so
        # TrainingConfiguration() is usable without arguments; previously the
        # declared defaults could never take effect.
        self.lr = lr
        self.optimizer = optimizer

class LitLeNetModel(pl.LightningModule):
    """LeNet-style classifier without activation functions.

    Two convolution + max-pool stages are followed by three linear layers
    that emit 10 raw logits. No non-linearity is applied between layers.

    Args:
        training_config: object exposing ``lr`` and ``optimizer``
            ("SGD" or "Adam"), read in ``configure_optimizers``.
    """

    def __init__(self,training_config):
        super().__init__()
        self.training_config = training_config
        self.conv1 = torch.nn.Conv2d(1, 6, kernel_size=5, padding=2)  # size: 1*28*28 -> 6*28*28
        self.conv2 = torch.nn.Conv2d(6, 16, kernel_size=5, padding=0)  # size: 6*14*14 -> 16*10*10
        self.fc1 = torch.nn.Linear(16*5*5, 120)
        self.fc2 = torch.nn.Linear(120, 84)
        self.fc3 = torch.nn.Linear(84, 10)

    def forward(self, x):
        """Map a batch of images to a batch of 10 class logits."""
        x = torch.nn.functional.max_pool2d(self.conv1(x), kernel_size=2, stride=2)
        x = torch.nn.functional.max_pool2d(self.conv2(x), kernel_size=2, stride=2)
        x = x.view(x.size(0), -1) # Flatten everything except the batch dimension
        # TODO: replace with a single linear layer (without activations in
        # between, the three stacked linear layers compose to one affine map).
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x) # No activation here: cross_entropy applies softmax itself
        return x

    def _loss_and_accuracy(self, batch):
        """Return ``(cross-entropy loss, accuracy)`` for one ``(x, y)`` batch.

        Both values are scalar tensors; accuracy is kept as a tensor rather
        than converted with ``.item()`` so no host/device sync is forced on
        every step.
        """
        x, y = batch
        logits = self(x)
        loss = torch.nn.functional.cross_entropy(logits, y)
        accuracy = (torch.argmax(logits, dim=1) == y).float().mean()
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss/accuracy; return the loss."""
        loss, accuracy = self._loss_and_accuracy(batch)
        # on_epoch=True: TensorBoard gets both a per-step and a per-epoch curve.
        self.log("train_loss", loss, prog_bar=True, logger=True, on_epoch=True)
        self.log("train_acc", accuracy, prog_bar=True, logger=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss/accuracy."""
        loss, accuracy = self._loss_and_accuracy(batch)
        # Unlike training_step, only the per-epoch curve shows up in TensorBoard.
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", accuracy, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """Build the optimizer named by ``training_config.optimizer``.

        Raises:
            ValueError: if the name is neither "SGD" nor "Adam".
        """
        if self.training_config.optimizer == "SGD":
            optimizer = torch.optim.SGD(self.parameters(), lr=self.training_config.lr)
        elif self.training_config.optimizer == "Adam":
            optimizer = torch.optim.Adam(self.parameters(), lr=self.training_config.lr)
        else:
            raise ValueError("Undefined optimizer")
        return optimizer


### 工作流程

In [26]:
class TrainerConfiguration:
    """Settings forwarded to the Lightning trainer.

    Attributes:
        max_epochs: value passed as ``max_epochs`` when the trainer is built.
    """

    max_epochs: int

    def __init__(self, max_epochs) -> None:
        # Stored verbatim; no validation is performed.
        self.max_epochs = max_epochs

if __name__ == '__main__':
    # Seed Python, NumPy and torch (and the DataLoader workers) so that weight
    # initialisation and shuffling are repeatable across runs.
    pl.seed_everything(42, workers=True)

    use_cuda = torch.cuda.is_available()

    data_config = DataConfiguration(batch_size=32, num_workers=2, pin_memory=use_cuda)
    # With lr=0.9 the gradients explode and the loss becomes NaN.
    training_config = TrainingConfiguration(lr=0.09, optimizer="SGD")
    trainer_config = TrainerConfiguration(max_epochs=10)

    model = LitLeNetModel(training_config)
    data = LitLoadData_FashionMNist(data_config)

    # TensorBoard logger
    tb_logger=pl.loggers.TensorBoardLogger('tensorBoard-logs/',name='classModel_Lenet_v1',default_hp_metric=False)

    # Model checkpointing: the filename prefix mirrors the current .py file
    # name, the suffix carries the epoch and val_loss.
    checkpoint_callback=pl.callbacks.ModelCheckpoint(
        monitor='val_loss',
        dirpath='checkPoint-logs/classModel_Lenet_v1',
        filename='classModel_Lenet_v1-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3, # keep the 3 checkpoints with the lowest val_loss
        mode='min',
    )

    trainer=pl.Trainer(
        max_epochs=trainer_config.max_epochs,
        logger=tb_logger,
        callbacks=[checkpoint_callback],
        # Pick the accelerator from the same CUDA probe that drives pin_memory,
        # so the cell also runs on machines without a GPU.
        accelerator='gpu' if use_cuda else 'cpu',
        enable_model_summary=True
        )

    trainer.fit(model, data)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | conv1 | Conv2d | 156    | train
1 | conv2 | Conv2d | 2.4 K  | train
2 | fc1   | Linear | 48.1 K | train
3 | fc2   | Linear | 10.2 K | train
4 | fc3   | Linear | 850    | train
-----------------------------------------
61.7 K    Trainable params
0         Non-trainable params
61.7 K    Total params
0.247     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 1875/1875 [00:20<00:00, 93.74it/s, v_num=0, train_loss_step=0.147, train_acc_step=0.906, val_loss=0.341, val_acc=0.879, train_loss_epoch=0.287, train_acc_epoch=0.897]  

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1875/1875 [00:20<00:00, 93.52it/s, v_num=0, train_loss_step=0.147, train_acc_step=0.906, val_loss=0.341, val_acc=0.879, train_loss_epoch=0.287, train_acc_epoch=0.897]


In [None]:
#tensorboard --logdir pytorch/tensorBoard-logs/ --port 6006 #event找不到时先确认log目录在终端的相对路径

In [5]:
# Register the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [6]:
%tensorboard --help

ERROR: Failed to launch TensorBoard (exited with 0).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.
Contents of stdout:
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]
                   [--grpc_data_provider PORT] [--purge_orphaned_data BOOL]
                   [--db URI] [--db_import] [--inspect] [--version_tb]
                   [--tag TAG] [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
                   [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_dat

In [None]:
def get_dataloader(self, train):
    """Build lagged features/labels from ``self.x`` and return a loader over them.

    Each of the ``self.tau`` feature columns is ``self.x`` shifted by one more
    position; the label is the value ``self.tau`` positions after the start of
    each window. Assuming ``self.x`` is 1-D with length ``self.T``, the
    features have shape ``(T - tau, tau)`` and the labels ``(T - tau, 1)``.

    Args:
        train: truthy selects the first ``self.num_train`` rows, falsy selects
            the rows from ``self.num_train`` onwards.

    Returns:
        Whatever ``self.get_tensorloader`` returns for the selected rows.

    Side effects:
        Stores the stacked tensors on ``self.features`` and ``self.labels``.
    """
    window_len = self.T - self.tau
    lagged_columns = []
    for offset in range(self.tau):  # offsets 0 .. tau-1
        lagged_columns.append(self.x[offset:window_len + offset])
    # Stack along dim 1 so each row holds one window of tau consecutive values.
    self.features = torch.stack(lagged_columns, 1)
    self.labels = self.x[self.tau:].reshape((-1, 1))
    if train:
        row_selector = slice(0, self.num_train)
    else:
        row_selector = slice(self.num_train, None)
    return self.get_tensorloader([self.features, self.labels], train, row_selector)

In [None]:
import torch

# Demo: stacking two (3, 4) tensors along a new dim 1 yields shape (3, 2, 4).
x = torch.randn(3, 4)
print(x)
torch.stack([x, x], dim=1)

tensor([[ 0.3133, -0.6973,  0.4610, -1.3551],
        [-0.1408, -0.8418,  0.6508, -1.1864],
        [-0.3327,  0.4809, -0.3594,  2.5382]])


tensor([[[ 0.3133, -0.6973,  0.4610, -1.3551],
         [ 0.3133, -0.6973,  0.4610, -1.3551]],

        [[-0.1408, -0.8418,  0.6508, -1.1864],
         [-0.1408, -0.8418,  0.6508, -1.1864]],

        [[-0.3327,  0.4809, -0.3594,  2.5382],
         [-0.3327,  0.4809, -0.3594,  2.5382]]])

In [None]:
# Stacking along a new trailing dim 2 instead yields shape (3, 4, 2).
torch.stack([x, x], dim=2)

tensor([[[ 0.3133,  0.3133],
         [-0.6973, -0.6973],
         [ 0.4610,  0.4610],
         [-1.3551, -1.3551]],

        [[-0.1408, -0.1408],
         [-0.8418, -0.8418],
         [ 0.6508,  0.6508],
         [-1.1864, -1.1864]],

        [[-0.3327, -0.3327],
         [ 0.4809,  0.4809],
         [-0.3594, -0.3594],
         [ 2.5382,  2.5382]]])