In [5]:
import os
import re
## 本算法使用CPU训练
## 此条命令运行必须放在 import torch 之前，否则不能生效
# os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
import sys
### pytorch 相关
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
### pytorch 自定义损失函数
from torch.autograd import Function
from torch.autograd import Variable

### torchvision
import torchvision.transforms as transforms

import quaternion # qw qx qy qz

### matplotlib
from matplotlib import pyplot as plt

### numpy
import numpy as np

### time，延时函数-秒 time.sleep(1) 延时1秒
import time 

### PIL
from PIL import Image

### SciPy
import scipy.signal as signal

# jupyter 使用 matplotlib 绘图所需，否则会挂掉
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# 数据集相关
from torch.utils.data import Dataset, DataLoader

# 配置文件读取 yaml
import yaml

# 进度条
import tqdm

import datetime

from tinyGC2L import TinyGC2L_Net
from myDataset import MyDataset

In [6]:
data_base_path = '400_samples'
train_data_path = ['400_total']
train_data_path = [f'{data_base_path}/{d}' for d in train_data_path]
train_epoch_cnt = 10
device = torch.device('cuda:0') # cuda:0 cpu

weight_dir = '../weights/weights_'

In [7]:
def evaluate(model, loss_function, data_loader, device, epoch):
    # 在训练开始之前写上 model.trian() ，在测试时写上 model.eval() 
    # 避免 BN层 和 Dropou 的影响
    model.eval()
    # 累积损失
    accu_loss = torch.zeros(1).to(device)
    for step, (data, file_path) in enumerate(data_loader, start=0):
    # for step, data in enumerate(data_loader, start=0):
        timestampns_set = data['timestampns_set'] # [1000, 226]
        gyro_set = data['gyro_set'] # [1000, 226, 3]  [​​batch_size​​​,​​channel​​​,​​height​​​,​​width​​]
        start_quat = data['start_quat'] # [1000, 4]
        label = data['label']

        # 正向传播
        output = model(timestampns_set.to(device), gyro_set.to(device), start_quat.to(device))
        
        # 计算损失
        loss = loss_function(output, label.to(device))

        # 打印统计信息
        accu_loss += loss.detach()

        debug_desc = "[train epoch {}] evaluation loss: {:.6f}".format(epoch, accu_loss.item() / (step + 1))
        # 判断是否出现无效 loss 值
        if not torch.isfinite(loss):
            print('WARNING: non-finite loss, ending training ', loss)
            sys.exit(1)
        #for循环结束后，统计每个 step 的 loss
        # return accu_loss.item() / (step + 1)
        return debug_desc
    

In [8]:

for data_path in train_data_path:
    save_dir = weight_dir + data_path

    # train_dataset = MyDataset(dataset_name =  'Euroc/Len_400_Samples', # 'simulator',
    train_dataset = MyDataset(dataset_name =  data_path, # 'simulator',
                            category = 'train',
                            transform = None)

    # val_dataset = MyDataset(dataset_name = 'Euroc/Len_400_Samples', # 'simulator',
    val_dataset = MyDataset(dataset_name = data_path, # 'simulator',
                            category = 'val',
                            transform = None)

    print('Total train set size: {}'.format(train_dataset.__len__()))
    print('Total val set size: {}'.format(val_dataset.__len__()))

    # 使用DataLoader可以利用多线程，batch,shuffle等
    trainset_dataloader = DataLoader(dataset = train_dataset,
                                    batch_size = 150, # 50
                                    shuffle = True,
                                    num_workers = 5)

    val_dataloader = DataLoader(dataset = val_dataset,
                            batch_size = val_dataset.__len__(), # 30  
                            shuffle = False, # 训练集往往需要进行shuffle，这样可以避免对顺序的依赖，而验证集不需要
                            num_workers = 5)

    ## 设定训练参数
    ddi_net = TinyGC2L_Net(device)
    ddi_net = ddi_net.to(device)

    # 判断 save_dir 路径中是否存在　best-model.pth 文件
    weight_file_path = os.path.join(save_dir, 'best-model.pth')  # 替换为你想要检查的文件路径

    if os.path.exists(weight_file_path):
        ddi_net.load_state_dict(torch.load(weight_file_path))
    else:
        print('Create new best-model.pth')

    # 定义损失函数
    loss_function = torch.nn.MSELoss(size_average=None, reduce=None, reduction='sum')  # reduction='sum'
    # 定义优化器
    optimizer = torch.optim.LBFGS(ddi_net.parameters(), lr=0.05)
    # 定义动态学习率
    lr_step = StepLR(optimizer, step_size=5, gamma=0.7)

    # 训练过程
    # 一个epoch即对整个训练集进行一次训练，循环几次就是几轮
    last_loss = 0
    os.makedirs(save_dir, exist_ok=True)  # 创建多级路径，如果已存在则不会报错
    for epoch in range(train_epoch_cnt):
        # train
        def closure():
            optimizer.zero_grad()
            for step, (data, file_path) in enumerate(trainset_dataloader, start=0):
            # for step, data in enumerate(trainset_dataloader, start=0):
                timestampns_set = data['timestampns_set'] # [1000, 226]
                gyro_set = data['gyro_set'] # [1000, 226, 3]  [​​batch_size​​​,​​channel​​​,​​height​​​,​​width​​]
                start_quat = data['start_quat'] # [1000, 4]
                label = data['label']
                 # 正向传播
                timestampns_set = timestampns_set.to(device)
                gyro_set = gyro_set.to(device)
                start_quat = start_quat.to(device)
                output = ddi_net(timestampns_set, gyro_set, start_quat)
                # 计算损失
                loss = loss_function(output, label.to(device))
                 # 反向传播
                loss.backward()
            # 输出训练损失
            # print(f'Epoch {epoch+1}/{train_epoch_cnt}, Loss: {loss.item()}')
            return loss.item()
        
        optimizer.step(closure)
        # 学习率更新
        lr_step.step()
        eval_debug_desc = evaluate(
                                    model=ddi_net,
                                    loss_function=loss_function,
                                    data_loader=val_dataloader,
                                    device=device,
                                    epoch=epoch)

        print(eval_debug_desc)

        with open(os.path.join(save_dir, 'train_eval_loss.txt'), 'a') as file: 
            file.write('{}\n'.format(eval_debug_desc))

        # 保存训练损失更小的权重参数
        current_eval_loss = float(re.findall(r'\d+\.\d+|\d+', eval_debug_desc)[1])
        if epoch == 0:
            last_loss = current_eval_loss
        elif current_eval_loss < last_loss:
            save_path = os.path.join(save_dir, "best-model.pth".format(epoch))
            torch.save(ddi_net.state_dict(), save_path)
            print("best-model.pth's train loss {}".format(current_eval_loss))
            last_loss = current_eval_loss

Total train set size: 261
Total val set size: 52
Create new best-model.pth
[train epoch 0] evaluation loss: 0.995606
[train epoch 1] evaluation loss: 0.216183
best-model.pth's train loss 0.216183
[train epoch 2] evaluation loss: 0.033010
best-model.pth's train loss 0.03301
[train epoch 3] evaluation loss: 0.005581
best-model.pth's train loss 0.005581
[train epoch 4] evaluation loss: 0.001136
best-model.pth's train loss 0.001136
[train epoch 5] evaluation loss: 0.000485
best-model.pth's train loss 0.000485
[train epoch 6] evaluation loss: 0.000286
best-model.pth's train loss 0.000286
[train epoch 7] evaluation loss: 0.000227
best-model.pth's train loss 0.000227
[train epoch 8] evaluation loss: 0.000206
best-model.pth's train loss 0.000206
[train epoch 9] evaluation loss: 0.000200
best-model.pth's train loss 0.0002
