In [1]:
import os
import argparse
import json
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
import matplotlib.pyplot as plt
from my_dataset import MyDataSet
from model import swin_base_patch4_window7_224_in22k as create_model
from utils import read_split_data, train_one_epoch, evaluate
import torch.optim.lr_scheduler as lr_scheduler
import math
from torch import nn
from apollo import Apollo
# Restrict the CUDA devices visible to this process to GPUs 0 and 1.
# Device indices used later in the notebook (e.g. "cuda:0") are relative to this list.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def one_cycle(y1=0.0, y2=1.0, steps=100):
    """Return a function mapping a step index to a half-cosine ramp value.

    The returned callable evaluates to ``y1`` at ``x == 0`` and to ``y2`` at
    ``x == steps``, following a sinusoidal curve in between.

    Args:
        y1: value at the start of the ramp.
        y2: value at the end of the ramp.
        steps: number of steps over which the ramp runs.

    Returns:
        A one-argument callable ``x -> ramp value``.
    """
    def ramp(x):
        # Blend factor: 0 at x == 0, rising to 1 at x == steps.
        blend = (1 - math.cos(x * math.pi / steps)) / 2
        return blend * (y2 - y1) + y1

    return ramp
def main(args):
    """Train and validate the classifier configured by ``args``.

    Splits the images under ``args.data_path`` into train/val sets, trains for
    ``args.epochs`` epochs with AdamW and a cosine learning-rate schedule,
    logs per-epoch metrics to TensorBoard and to ``databc_nobsrgan.json``,
    and writes checkpoints to ``./weights``:

    * ``best_model.pth`` - weights of the epoch with the highest validation
      accuracy (written as soon as a new best is reached);
    * ``last_model.pth`` - weights after the final epoch.

    Args:
        args: namespace providing ``data_path``, ``batch_size``,
            ``num_classes``, ``lr``, ``lrf`` and ``epochs``.
    """
    # NOTE(review): ``args.device`` is parsed by the caller but is not used
    # here; the device stays hard-coded to the first visible GPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    os.makedirs("./weights", exist_ok=True)

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(args.data_path)

    img_size = 224
    # Pre-processing pipelines.
    data_transform = {
        "train": transforms.Compose([transforms.RandomResizedCrop(img_size),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.ToTensor(),
                                     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
        "val": transforms.Compose([transforms.Resize(int(img_size * 1.143)),
                                   transforms.CenterCrop(img_size),
                                   transforms.ToTensor(),
                                   transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])}

    # Training dataset.
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])

    # Validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # Batches are loaded in the main process. Report the worker count that is
    # actually handed to the loaders instead of an unused computed value.
    num_workers = 0
    print('Using {} dataloader workers every process'.format(num_workers))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               num_workers=num_workers,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=num_workers,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes, input_size=[1024, 7, 7], routings=3)
    # Multi-GPU training is currently disabled:
    #     if torch.cuda.device_count() > 1:
    #         print("Use", torch.cuda.device_count(), 'gpus')
    #         model = nn.DataParallel(model, device_ids=[0, 1])
    model = model.to(device)
    # Loading of pre-trained weights is currently disabled:
    #     if args.weights != "":
    #         assert os.path.exists(args.weights), "weights file: '{}' not exist.".format(args.weights)
    #         weights_dict = torch.load(args.weights, map_location=device)["model"]
    #         # Drop the weights of the classification head.
    #         for k in list(weights_dict.keys()):
    #             if "head" in k:
    #                 del weights_dict[k]
    #         print(model.load_state_dict(weights_dict, strict=False))
    #
    # Layer freezing is currently disabled:
    #     if args.freeze_layers:
    #         for name, para in model.named_parameters():
    #             # Freeze everything except the head.
    #             if "head" not in name:
    #                 para.requires_grad_(False)
    #             else:
    #                 print("training {}".format(name))
    trainable_params = [p for p in model.parameters() if p.requires_grad]

    # The optimizer updates the model weights to reduce the loss.
    optimizer = optim.AdamW(trainable_params, lr=args.lr, weight_decay=5E-6)

    # Cosine decay of the LR multiplier from 1 down to ``args.lrf`` over the run.
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    numbers = {'epoch': [], 'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'best_val': [], 'lr': []}
    history_path = 'databc_nobsrgan.json'
    tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
    best_val_acc = 0
    best_epoch = []

    tb_writer = SummaryWriter()
    try:
        for epoch in range(args.epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=train_loader,
                                                    device=device,
                                                    epoch=epoch)

            scheduler.step()  # adjust the learning rate

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=val_loader,
                                         device=device,
                                         epoch=epoch)
            # No optimizer.step() here: train_one_epoch receives the optimizer
            # and is assumed to apply the per-batch updates itself (TODO
            # confirm in utils.py). An extra step after validation could move
            # the weights away from the ones that were just evaluated.

            current_lr = optimizer.param_groups[0]["lr"]
            numbers['epoch'].append(epoch)
            numbers['train_loss'].append(train_loss)
            numbers['train_acc'].append(train_acc)
            numbers['val_loss'].append(val_loss)
            numbers['val_acc'].append(val_acc)
            numbers['lr'].append(current_lr)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], current_lr, epoch)

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_epoch.append(epoch)
                numbers['best_val'].append(val_acc)
                # Save immediately: state_dict() returns references to the
                # live parameters, so keeping it in a list and saving after
                # training would store the final weights, not the best ones.
                torch.save(model.state_dict(), "./weights/best_model.pth")

            if epoch == args.epochs - 1:
                torch.save(model.state_dict(), "./weights/last_model.pth")

            # Rewrite the history file every epoch so progress survives a crash.
            with open(history_path, 'w') as f:
                json.dump(numbers, f)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tb_writer.close()

    plt.plot(numbers['epoch'], numbers['val_acc'])
    if best_epoch:
        print(best_epoch[-1])
        print("best_epoch:", best_epoch[-1])
    else:
        # No epoch improved on the initial accuracy (or no epoch was run).
        print("best_epoch:", None)

        #torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))


In [3]:
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a boolean command-line value.

        ``type=bool`` would treat every non-empty string, including
        "False", as True, so the accepted spellings are matched explicitly.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('0', 'false', 'f', 'no', 'n', ''):
            return False
        raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))

    parser = argparse.ArgumentParser()
    parser.add_argument('--num_classes', type=int, default=2)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=0.00001)
    parser.add_argument('--rebound', choices=['constant', 'belief'], default='constant', help='type of recified bound of diagonal hessian')
    parser.add_argument('--lrf', type=float, default=0.001)
    parser.add_argument('--init_lr', type=float, default=0, help='initial learning rate')
    parser.add_argument('--warmup_updates', type=int, default=0, metavar='N', help='number of updates to warm up (default: 0)')
    # Root directory of the dataset.
    # http://download.tensorflow.org/example_images/flower_photos.tgz
    parser.add_argument('--data-path', type=str,
                        default="../data7")

    # Path to the pre-trained weights; set to an empty string to skip loading.
    parser.add_argument('--weights', type=str, default='./swin_base_patch4_window7_224_22k.pth',
                        help='initial weights path')
    # Whether to freeze the weights.
    parser.add_argument('--freeze-layers', type=_str2bool, default=False)
    parser.add_argument('--device', default='cuda:1', help='device id (i.e. 0 or 0,1 or cpu)')

    # Parse an empty argument list so the notebook kernel's own argv is ignored.
    opt = parser.parse_args(args=[])

    main(opt)


893 images were found in the dataset.
715 images for training.
178 images for validation.
Using 8 dataloader workers every process
  0%|          | 0/45 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 10.76 GiB total capacity; 2.95 GiB already allocated; 15.12 MiB free; 3.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF