In [1]:
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tensorboardX import SummaryWriter

import time
import datetime
import json
import logging
import importlib
import shutil
from collections import defaultdict

#MY_DIRNAME = os.path.dirname(os.path.abspath(states))
#sys.path.insert(0, os.path.join(MY_DIRNAME, '..'))

# TODO: import net 
from yolo_model import yoloModel
from PASCAL_Dataloader import create_split_loaders
from YOLO_Loss import YoloLoss

total_loss = defaultdict(dict)
avg_minibatch_loss = defaultdict(dict)

In [2]:
def train(config):
    config['global_step'] = config.get('start_step', 0)
    is_training = False if config.get('export_onnx') else True

    # TODO: Load and initialize network
    net = yoloModel(config)

    # Define the optimizer and learning rate
    optimizer = obtain_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
        step_size=config['decay_step'],
        gamma=config['decay_gamma'])

    # Use pretrained model
    if config['pretrain_snapshot']:
        print('Hello')
        logging.info('Load pretrained weights from {}'.format(config['pretrain_snapshot']))
        state_dict = torch.load(config['pretrain_snapshot'])
        net.load_state_dict(state_dict)

    # Use all 3 scales for computing YOLO loss
    YOLO_losses = []
    for i in range(3):
        YOLO_losses.append(YoloLoss(config['classes'], (config['img_w'], config['img_h']), config['anchors'][i]))

    # Check if your system supports CUDA
    use_cuda = torch.cuda.is_available()

    # Setup GPU optimization if CUDA is supported
    if use_cuda:
        computing_device = torch.device("cuda")
        extras = {"num_workers": 3, "pin_memory": True}
        print("CUDA is supported")
    else: # Otherwise, train on the CPU
        computing_device = torch.device("cpu")
        extras = False
        print("CUDA NOT supported")
    
    # Load in data 
    root_dir = os.getcwd()
    train_loader, val_loader, test_loader = create_split_loaders(root_dir, config['batch_size'])
    
    # Instantiate model to run on the GPU or CPU based on CUDA support
    net = net.to(computing_device)
    print("Model on CUDA?", next(net.parameters()).is_cuda)
    
    # Begin training loop
    print("Start training:")
    for epoch in range(config['epochs']):
        N_minibatch_loss = 0.0
        for minibatch, samples in enumerate(train_loader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            config['global_step'] += 1
                
            #images = images.unsqueeze(0)

            # Put the minibatch data in CUDA Tensors and run on the GPU if supported
            images = images.to(computing_device)
            #labels = labels.to(computing_device)

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)
            loss_names = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = []
            for z in range(len(loss_names)):
                losses.append([])
            for i in range(3):
                loss_item = YOLO_losses[i](outputs[i], labels)
                for j, l in enumerate(loss_item):
                    losses[j].append(l)
            losses = [sum(l) for l in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()
            
            # Add this iteration's loss to the total_loss
            total_loss[epoch][minibatch] = loss.item()
            N_minibatch_loss += loss

            if minibatch > 0 and minibatch % 10 == 0:
                _loss = loss.item()
                N_minibatch_loss /= 10
                lr = optimizer.param_groups[0]['lr']
                print('Epoch [%.3d] Minibatch = %d Loss = %.2f lr = %.5f '%
                    (epoch, minibatch, N_minibatch_loss, lr))
                
                # Add the averaged loss over N minibatches and reset the counter
                avg_minibatch_loss[epoch][minibatch] = N_minibatch_loss.item()
                N_minibatch_loss = 0.0
                
                config['tensorboard_writer'].add_scalar("lr",
                                                        lr,
                                                        config['global_step'])
                for i, name in enumerate(loss_names):
                    value = _loss if i == 0 else losses[i]
                    config['tensorboard_writer'].add_scalar(name,
                                                            value,
                                                            config['global_step'])

            if minibatch > 0 and minibatch % 100 == 0:
                save_checkpoint(net.state_dict(), config)
                

        lr_scheduler.step()

    save_checkpoint(net.state_dict(), config)
    print('Training Complete')

    
def save_checkpoint(state_dict, config, evaluate_func=None):
        
    checkpoint_path = os.path.join(config["sub_working_dir"], "model.pth")
    torch.save(state_dict, checkpoint_path)
    print("Model checkpoint saved to %s" % checkpoint_path)


def obtain_optimizer(config, net):
    optimizer = None

    # Assign different learning rate for each layer
    params = None
    base_parameters = list(
        map(id, net.backbone.parameters())
    )
    logits_parameters = filter(lambda p: id(p) not in base_parameters, net.parameters())

    if not config['freeze_backbone']:
        parameters = [
            {"params": logits_parameters, "lr": config['other_lr']},
            {"params": net.backbone.parameters(), "lr": config['backbone_lr']},
        ]
    else:
        print("Freezing backbone parameters")
        for p in net.backbone.parameters():
            p.requires_grad = False
        parameters = [
            {"params": logits_parameters, "lr": config['other_lr']},
        ]

    # Initialize optimizer class
    if config['optimizer_type'] == "adam":
        optimizer = optim.Adam(params, weight_decay=config['optimizer_weight_decay'])
    elif config['optimizer_type'] == "amsgrad":
        optimizer = optim.Adam(params, weight_decay=config['optimizer_weight_decay'], amsgrad=True)
    elif config['optimizer_type'] == "rmsprop":
        optimizer = optim.RMSprop(params, weight_decay=config['optimizer_weight_decay'])
    else:
        optimizer = optim.SGD(parameters, momentum=0.9, weight_decay=config['optimizer_weight_decay'],
                              nesterov=(config['optimizer_type'] == "nesterov"))

    return optimizer

In [3]:
def main():
    
    # Initialize hyperparameters/variables
    config = {}
    config['backbone_name'] = "darknet_53"
    config['backbone_pretrained'] = "./darknet53_weights_pytorch.pth" # set empty to disable
    
    config['anchors'] = [[[116, 90], [156, 198], [373, 326]],
                                [[30, 61], [62, 45], [59, 119]],
                                [[10, 13], [16, 30], [33, 23]]]
    config['classes'] = 20
    
    config['backbone_lr'] = 0.001
    config['other_lr'] = 0.01
    config['freeze_backbone'] = False   #  freeze backbone wegiths to finetune
    config['decay_gamma'] = 0.1
    config['decay_step'] = 10         #  decay lr in every ? epochs
    
    config['optimizer_type'] = "sgd"
    config['optimizer_weight_decay'] = 4e-05
    
    config['batch_size'] = 16  # Number of training samples per batch to be passed to network
    config['epochs'] = 20  # Number of epochs to train the model
    config['img_h'] = config['img_w'] = 416
    config['seed'] = np.random.seed()
    config['working_dir'] = "./states"     #  replace with your working dir
    
    def get_latest_states(dirpath):
        """
        Get the latest image file in the given directory
        """
        # get filepaths of all files and dirs in the given dir
        valid_files = [os.path.join(dirpath, filename) for filename in os.listdir(dirpath)]

        return max(valid_files, key=os.path.getmtime)
   
    
    # Create sub_working_dir
    sub_working_dir = '{}/{}'.format(
        config['working_dir'], time.strftime("%Y%m%d%H%M%S", time.localtime()))
    #if not os.path.exists(sub_working_dir):
        #os.makedirs(sub_working_dir)
    config["sub_working_dir"] = sub_working_dir
    logging.info("sub working dir: %s" % sub_working_dir)
    
    if not os.path.exists(os.path.join(sub_working_dir, "model.pth")):
        config['pretrain_snapshot'] = ""
    else:
        config['pretrain_snapshot'] = os.path.join(get_latest_states(config['working_dir']), "model.pth")       #  load checkpoint
        
    # Create tf_summary writer
    config["tensorboard_writer"] = SummaryWriter(sub_working_dir)

    # Start training
    train(config)

In [None]:
if __name__ == "__main__":
    main()

CUDA is supported
Model on CUDA? True
Start training:
Epoch [000] Minibatch = 10 Loss = 3.16 lr = 0.01000 
Epoch [000] Minibatch = 20 Loss = 1.81 lr = 0.01000 
Epoch [000] Minibatch = 30 Loss = 1.27 lr = 0.01000 
Epoch [000] Minibatch = 40 Loss = 1.03 lr = 0.01000 
Epoch [000] Minibatch = 50 Loss = 0.90 lr = 0.01000 
Epoch [000] Minibatch = 60 Loss = 0.85 lr = 0.01000 
Epoch [000] Minibatch = 70 Loss = 0.76 lr = 0.01000 
Epoch [000] Minibatch = 80 Loss = 0.74 lr = 0.01000 
Epoch [000] Minibatch = 90 Loss = 0.68 lr = 0.01000 
Epoch [000] Minibatch = 100 Loss = 0.67 lr = 0.01000 
Epoch [000] Minibatch = 190 Loss = 0.54 lr = 0.01000 
Epoch [000] Minibatch = 200 Loss = 0.52 lr = 0.01000 
Model checkpoint saved to ./states/20190523200450/model.pth
Epoch [000] Minibatch = 210 Loss = 0.55 lr = 0.01000 
Epoch [000] Minibatch = 220 Loss = 0.52 lr = 0.01000 
Epoch [000] Minibatch = 230 Loss = 0.52 lr = 0.01000 
Epoch [000] Minibatch = 240 Loss = 0.51 lr = 0.01000 
Epoch [000] Minibatch = 250 Los

Epoch [002] Minibatch = 440 Loss = 0.32 lr = 0.01000 
Epoch [002] Minibatch = 450 Loss = 0.35 lr = 0.01000 
Epoch [002] Minibatch = 460 Loss = 0.35 lr = 0.01000 
Epoch [002] Minibatch = 470 Loss = 0.38 lr = 0.01000 
Epoch [002] Minibatch = 480 Loss = 0.33 lr = 0.01000 
Epoch [002] Minibatch = 490 Loss = 0.38 lr = 0.01000 
Epoch [002] Minibatch = 500 Loss = 0.38 lr = 0.01000 
Model checkpoint saved to ./states/20190523200450/model.pth
Epoch [002] Minibatch = 510 Loss = 0.38 lr = 0.01000 
Epoch [003] Minibatch = 10 Loss = 0.36 lr = 0.01000 
Epoch [003] Minibatch = 20 Loss = 0.33 lr = 0.01000 
Epoch [003] Minibatch = 30 Loss = 0.34 lr = 0.01000 
Epoch [003] Minibatch = 40 Loss = 0.33 lr = 0.01000 
Epoch [003] Minibatch = 50 Loss = 0.33 lr = 0.01000 
Epoch [003] Minibatch = 60 Loss = 0.33 lr = 0.01000 
Epoch [003] Minibatch = 70 Loss = 0.34 lr = 0.01000 
Epoch [003] Minibatch = 80 Loss = 0.33 lr = 0.01000 
Epoch [003] Minibatch = 90 Loss = 0.35 lr = 0.01000 
Epoch [003] Minibatch = 100 Los

Epoch [005] Minibatch = 290 Loss = 0.27 lr = 0.01000 
Epoch [005] Minibatch = 300 Loss = 0.31 lr = 0.01000 
Model checkpoint saved to ./states/20190523200450/model.pth
Epoch [005] Minibatch = 310 Loss = 0.32 lr = 0.01000 
Epoch [005] Minibatch = 320 Loss = 0.32 lr = 0.01000 
Epoch [005] Minibatch = 330 Loss = 0.30 lr = 0.01000 
Epoch [005] Minibatch = 340 Loss = 0.30 lr = 0.01000 
Epoch [005] Minibatch = 350 Loss = 0.31 lr = 0.01000 
Epoch [005] Minibatch = 360 Loss = 0.31 lr = 0.01000 
Epoch [005] Minibatch = 370 Loss = 0.30 lr = 0.01000 
Epoch [005] Minibatch = 380 Loss = 0.35 lr = 0.01000 
Epoch [005] Minibatch = 390 Loss = 0.31 lr = 0.01000 
Epoch [005] Minibatch = 400 Loss = 0.30 lr = 0.01000 
Model checkpoint saved to ./states/20190523200450/model.pth
Epoch [005] Minibatch = 410 Loss = 0.33 lr = 0.01000 
Epoch [005] Minibatch = 420 Loss = 0.28 lr = 0.01000 
Epoch [005] Minibatch = 430 Loss = 0.30 lr = 0.01000 
Epoch [005] Minibatch = 440 Loss = 0.32 lr = 0.01000 
Epoch [005] Mini