In [1]:
from __future__ import print_function, division

import os
import shutil
from io import BytesIO

import bson
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from skimage import io, transform
from torch.utils.data import DataLoader
from torchvision import transforms, utils

In [2]:
# Install a blanket "ignore" filter so no Python warnings are shown in the
# outputs of the cells that follow.
import warnings
warnings.filterwarnings("ignore")

# Switch matplotlib to interactive mode.
plt.ion()

# Number of target classes for the classification head; the model-construction
# cell further down uses this same value (5270).
num_classes = 5270 # ToDo: Automatically detect num_class in CdiscountDataset

In [4]:
def _make_cdiscount_dataset(images_csv):
    """Build a labelled CdiscountDataset for one image-index CSV.

    The training and validation splits read the same BSON file and the same
    offsets table; the only thing that differs between them is the image-index
    CSV, so that is the single parameter. A fresh transform pipeline
    (rescale to 256, then tensor conversion) is created on every call.

    NOTE(review): CdiscountDataset, Rescale and ToTensor are not defined in the
    cells visible here (the execution counter jumps from 2 to 4) -- confirm the
    defining cell is still part of the notebook so "Restart & Run All" works.
    """
    pipeline = transforms.Compose([
        Rescale(256),
        ToTensor()
    ])
    return CdiscountDataset(
        offsets_csv="train_offsets.csv",
        images_csv=images_csv,
        bson_file_path="/mnt/data/cdiscount/train.bson",
        with_label=True,
        transform=pipeline
    )


train_dataset = _make_cdiscount_dataset("train_images.csv")
val_dataset = _make_cdiscount_dataset("val_images.csv")

In [5]:
# for i in range(len(val_dataset)):
#     sample = val_dataset[i]
#     img, label = sample['img'], sample['label']
#     print(label)
#     if i == 30:
#         break

In [6]:
# parameters
# Batch sizes handed to the training / validation DataLoaders.
train_batch_size = 256
val_batch_size = 256
# Key used to look up the backbone constructor in torchvision.models.
arch = 'resnet34'
# Initial learning rate and weight decay passed to the Adam optimizer; the
# learning rate is re-computed at the start of every epoch by
# adjust_learning_rate.
learning_rate = 1e-4
weight_decay = 5e-4
# Path of a checkpoint file to resume from; a falsy value (None) skips resuming.
resume = None
# First epoch index of the training loop; overwritten from the checkpoint
# when resuming.
start_epoch = 0
# Exclusive upper bound of the epoch loop (range(start_epoch, epochs)).
epochs = 10
# A progress line is printed every `print_freq` batches in train/validate.
print_freq = 10
# Best validation Prec@1 seen so far; decides which checkpoint is copied as
# the "best" model.
best_prec1 = 0

In [7]:
# Training batches are reshuffled every epoch. Validation is iterated in a
# fixed order: the averaged metrics do not depend on batch order, and a fixed
# order keeps the per-batch log lines comparable from one epoch to the next.
# pin_memory=True places batches in page-locked host memory, which is what
# allows the asynchronous host-to-GPU copies requested in the training and
# validation loops to actually overlap with computation.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=6,
    pin_memory=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=val_batch_size,
    shuffle=False,
    num_workers=6,
    pin_memory=True,
)

In [8]:
import time
# end = time.clock()
# for i_batch, (img, target) in enumerate(train_dataloader):
#     print(type(img))
#     print(type(target))
#     print(target.sum())
#     if (i_batch == 5):
#         break
# print(time.clock() - end)

In [9]:
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch over ``train_loader`` and log running statistics.

    Args:
        train_loader: iterable yielding ``(img, target)`` batches.
        model: network to optimise; it is switched to train mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        epoch: epoch index; only used in the progress line.

    A progress line is printed every ``print_freq`` batches (module-level
    setting). Nothing is returned.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (img, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` became a reserved word in Python 3.7, so the old
        # `cuda(async=True)` spelling no longer parses; `non_blocking` is the
        # replacement keyword.
        target = target.cuda(non_blocking=True)

        # Tensors carry autograd state themselves, so no Variable wrapper is
        # needed. The image batch is passed on as loaded (it is not moved to
        # the GPU here), exactly as before.
        output = model(img)
        loss = criterion(output, target)

        # measure accuracy and record loss; .item() yields plain Python floats
        # (indexing a 0-dim loss with [0] is an error on current PyTorch).
        batch_size = img.size(0)
        prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
        losses.update(loss.item(), batch_size)
        top1.update(prec1.item(), batch_size)
        top5.update(prec5.item(), batch_size)

        # compute gradient and do one optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average Prec@1.

    Args:
        val_loader: iterable yielding ``(img, target)`` batches.
        model: network to evaluate; it is switched to eval mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.

    Returns:
        The sample-weighted average top-1 precision (in percent) over all
        batches, as accumulated by an ``AverageMeter``.

    A progress line is printed every ``print_freq`` batches (module-level
    setting), followed by a summary line at the end.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # torch.no_grad() replaces the removed `volatile=True` flag: no autograd
    # graph is recorded for the forward passes below.
    with torch.no_grad():
        for i, (img, target) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; `non_blocking` is
            # the replacement keyword.
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(img)
            loss = criterion(output, target)

            # measure accuracy and record loss; .item() yields Python floats
            batch_size = img.size(0)
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time, loss=losses,
                       top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar',
                    best_filename='model_best.pth.tar'):
    """Serialize ``state`` to ``filename`` and optionally keep a "best" copy.

    Args:
        state: object to serialize with ``torch.save`` (the training loop
            passes a dict of epoch, arch, model/optimizer state and best
            precision).
        is_best: when true, the file just written is also copied to
            ``best_filename``.
        filename: destination of the checkpoint.
        best_filename: destination of the best-model copy; defaults to the
            previously hard-coded ``'model_best.pth.tar'``.
    """
    torch.save(state, filename)
    if is_best:
        # shutil is imported at the top of the notebook; it was previously
        # missing, which made this branch fail with a NameError.
        shutil.copyfile(filename, best_filename)


class AverageMeter(object):
    """Tracks the latest value plus a weighted running mean of a metric.

    Attributes:
        val: the most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight (e.g. number of samples) seen since ``reset``.
        avg: ``sum / count``, refreshed on every ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the newest observation, counted ``n`` times."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

        
def adjust_learning_rate(learning_rate, optimizer, epoch):
    """Apply the step schedule: initial LR divided by 10 after every epoch.

    The rate for ``epoch`` is ``learning_rate * 0.1 ** epoch`` and is written
    into the ``'lr'`` entry of every parameter group of ``optimizer``.
    """
    completed_steps = epoch // 1
    decay_factor = 0.1 ** completed_steps
    new_learning_rate = learning_rate * decay_factor
    for group in optimizer.param_groups:
        group['lr'] = new_learning_rate
        
        
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: 2-D score tensor indexed ``[sample, class]`` (``topk`` is taken
            along dimension 1).
        target: 1-D tensor of true class indices, one per sample.
        topk: iterable of ``k`` values to evaluate.

    Returns:
        A list with one 1-element float tensor per ``k``, holding the
        percentage of samples whose target is among the ``k`` highest scores.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk best classes per sample, transposed to [maxk, batch]
    # so that row r holds every sample's (r+1)-th best guess.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` comes from a transposed tensor and a row slice of it is
        # not guaranteed to be contiguous; view(-1) raises on such a slice in
        # current PyTorch, whereas reshape(-1) copies only when it has to.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [10]:
class AssembledModel(torch.nn.Module):
    """A feature extractor followed by a classifier head.

    ``forward`` runs the input through ``model``, flattens everything except
    the leading (batch) dimension, and feeds the result to ``classifier``.
    """

    def __init__(self, model, classifier):
        super().__init__()
        # Pin the class name that shows up when the module is printed.
        type(self).__name__ = "AssembledModel"
        self.model, self.classifier = model, classifier

    def forward(self, x):
        features = self.model(x)
        flattened = features.view(features.size(0), -1)
        return self.classifier(flattened)

def assemble_model(model, cut, fin, num_classes, activation):
    """Replace the tail of ``model`` with a fresh linear classifier.

    Args:
        model: network whose direct children are re-used as the backbone.
        cut: slice end applied to ``list(model.children())``; children from
            this position on are dropped (``-1`` drops only the last one).
        fin: number of input features of the new linear layer.
        num_classes: number of output features of the new linear layer.
        activation: accepted for interface compatibility; not used.

    Returns:
        An ``AssembledModel`` wrapping the truncated backbone and the new
        single-``Linear`` classifier.
    """
    # keep only the leading children of the original network
    kept_children = list(model.children())[:cut]
    backbone = torch.nn.Sequential(*kept_children)

    # new classification head: one linear layer, wrapped in a Sequential
    head = torch.nn.Sequential(
        torch.nn.Linear(in_features=fin, out_features=num_classes)
    )

    return AssembledModel(backbone, head)

In [None]:
import torchvision.models as models

print("=> using pre-trained model '{}'".format(arch))
backbone = models.__dict__[arch](pretrained=True)

# Read the head's input width from the backbone's own final `fc` layer rather
# than hard-coding 512, and take the class count from `num_classes` instead of
# repeating the literal 5270.
feature_dim = backbone.fc.in_features

# The `activation` argument is not used by assemble_model, so None is passed.
# No softmax is wanted in the model anyway: the CrossEntropyLoss criterion
# used for training is applied to raw scores.
model = assemble_model(backbone, -1, feature_dim, num_classes, None)
model = torch.nn.DataParallel(model).cuda()

print(model)
# model.parameters() is a generator, so printing it shows only its repr;
# report the total number of parameters instead.
print("total parameters: {}".format(sum(p.numel() for p in model.parameters())))

# for child in model.children():
#     print(child)
#     for param in child.parameters():
#         print('require_grad = ' + str(param.requires_grad))

=> using pre-trained model 'resnet34'
DataParallel (
  (module): AssembledModel (
    (model): Sequential (
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (2): ReLU (inplace)
      (3): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
      (4): Sequential (
        (0): BasicBlock (
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
          (relu): ReLU (inplace)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        )
        (1): BasicBlock (
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
       

In [None]:
# define loss function (criterion) and optimizer
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
)

# optionally resume from a checkpoint
if resume:
    if os.path.isfile(resume):
        print("=> loading checkpoint '{}'".format(resume))
        # NOTE(review): torch.load unpickles the file; only point `resume`
        # at checkpoints you produced yourself.
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})"
              .format(resume, checkpoint['epoch']))
    else:
        # This notebook has no `args` object; the path lives in `resume`.
        print("=> no checkpoint found at '{}'".format(resume))

torch.backends.cudnn.benchmark = True # uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms.
                       # If this is set to false, uses some in-built heuristics that might not always be fastest.

# train
print("start training")
for epoch in range(start_epoch, epochs):
    # Re-derive the learning rate from the initial value for this epoch.
    adjust_learning_rate(learning_rate, optimizer, epoch)

    # train for one epoch
    train(train_dataloader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    prec1 = validate(val_dataloader, model, criterion)

    # remember best prec@1 and save checkpoint
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)
    save_checkpoint({
        'epoch': epoch + 1,  # epoch to start from when resuming
        'arch': arch,
        'state_dict': model.state_dict(),
        'best_prec1': best_prec1,
        'optimizer' : optimizer.state_dict(),
    }, is_best)

start training
Epoch: [0][0/38676]	Time 25.902 (25.902)	Data 9.863 (9.863)	Loss 8.7853 (8.7853)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][10/38676]	Time 3.825 (5.726)	Data 0.001 (0.898)	Loss 8.1375 (8.4579)	Prec@1 3.906 (1.243)	Prec@5 5.078 (2.166)
Epoch: [0][20/38676]	Time 3.786 (4.817)	Data 0.003 (0.471)	Loss 7.2237 (8.0891)	Prec@1 11.719 (3.906)	Prec@5 17.578 (7.050)
Epoch: [0][30/38676]	Time 3.852 (4.497)	Data 0.002 (0.320)	Loss 6.7817 (7.7525)	Prec@1 15.625 (6.376)	Prec@5 23.438 (11.114)
Epoch: [0][40/38676]	Time 3.817 (4.336)	Data 0.001 (0.242)	Loss 6.4757 (7.4656)	Prec@1 14.453 (8.013)	Prec@5 22.266 (13.815)
Epoch: [0][50/38676]	Time 3.886 (4.238)	Data 0.002 (0.195)	Loss 6.0119 (7.2245)	Prec@1 17.969 (9.574)	Prec@5 25.781 (16.108)
Epoch: [0][60/38676]	Time 3.835 (4.172)	Data 0.001 (0.163)	Loss 5.9001 (7.0240)	Prec@1 18.750 (10.854)	Prec@5 29.688 (18.110)
Epoch: [0][70/38676]	Time 3.871 (4.125)	Data 0.001 (0.141)	Loss 5.6352 (6.8609)	Prec@1 18.359 (11.928)	Prec@5 36.71

Epoch: [0][650/38676]	Time 3.829 (3.861)	Data 0.002 (0.017)	Loss 3.9765 (4.7656)	Prec@1 33.984 (29.664)	Prec@5 52.734 (44.294)
Epoch: [0][660/38676]	Time 3.810 (3.861)	Data 0.003 (0.017)	Loss 3.9605 (4.7505)	Prec@1 36.328 (29.803)	Prec@5 51.562 (44.466)
Epoch: [0][670/38676]	Time 3.792 (3.860)	Data 0.002 (0.017)	Loss 3.6625 (4.7382)	Prec@1 41.406 (29.918)	Prec@5 56.641 (44.606)
Epoch: [0][680/38676]	Time 3.783 (3.860)	Data 0.002 (0.016)	Loss 3.6963 (4.7258)	Prec@1 41.406 (30.030)	Prec@5 55.469 (44.733)
Epoch: [0][690/38676]	Time 3.803 (3.859)	Data 0.002 (0.016)	Loss 3.5781 (4.7126)	Prec@1 40.625 (30.148)	Prec@5 57.812 (44.882)
Epoch: [0][700/38676]	Time 3.771 (3.859)	Data 0.002 (0.016)	Loss 4.2409 (4.7021)	Prec@1 31.250 (30.230)	Prec@5 50.781 (45.001)
Epoch: [0][710/38676]	Time 3.833 (3.859)	Data 0.003 (0.016)	Loss 3.7151 (4.6907)	Prec@1 41.797 (30.325)	Prec@5 57.031 (45.120)
Epoch: [0][720/38676]	Time 3.823 (3.858)	Data 0.001 (0.016)	Loss 3.6800 (4.6778)	Prec@1 41.406 (30.448)	Prec@5 