**Alternate training between loss weights [1, 0, 0] and [0, 0, 1] ("100" and "001"). Unfreeze the last two res layers (layer3, layer4), lr = 1e-4, 1 epoch: val acc 67.8 --> 69.2; a 2nd epoch improves training acc but not val acc.**

In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from PIL import Image
import time
import shutil

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import torchvision
import torchvision.transforms as T
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from torch.utils.data import Dataset

In [3]:
# Load the lookup tables. The offsets table and the two image tables use their
# first CSV column as the DataFrame index; the category table keeps the default
# integer index.
train_offsets_df, train_images_df, val_images_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_offsets.csv", "train_images_all.csv", "val_images_all.csv")
]
categories_df = pd.read_csv('categories_name_to_id.csv')

  mask |= (ar1 == a)


In [4]:
# Entry i of each list is the level-1 / level-2 value stored in row i of
# categories_df; BSONIterator indexes these lists with an image's category_idx.
idx2l1, idx2l2 = (
    list(categories_df[column])
    for column in ('category_level1', 'category_level2')
)

In [5]:
# Location of the raw training data.
data_dir = "./input/"
train_bson_path = os.path.join(data_dir, "train.bson")
# Opened in binary mode and intentionally left open: the dataset objects created
# later in the notebook receive this handle and seek/read from it for every item.
train_bson_file = open(train_bson_path, "rb")

In [6]:
class BSONIterator(Dataset):
    """Dataset that decodes one product image per index from an open BSON file.

    Every row of ``images_df`` names a product (``product_id``), which of that
    product's pictures to use (``img_idx``) and its ``category_idx``.
    ``offsets_df`` is looked up by product id and provides the byte ``offset``
    and ``length`` of that product's record inside ``bson_file``.
    """

    def __init__(self, bson_file, images_df, offsets_df, transform, mode = 'train'):
        super(BSONIterator, self).__init__()
        self.mode = mode
        self.transform = transform
        self.offsets_df = offsets_df
        self.images_df = images_df
        self.file = bson_file

    def __len__(self):
        """Return the number of images, i.e. the number of rows in ``images_df``."""
        return len(self.images_df)

    def __getitem__(self, idx):
        """Return the transformed image at row ``idx``.

        In ``'train'`` mode the result is ``(x, target1, target2, target3)``
        where the targets are one-element ``LongTensor``s holding the level-1
        value, the level-2 value and the category index. In any other mode only
        the transformed image ``x`` is returned.
        """
        row = self.images_df.iloc[idx]
        location = self.offsets_df.loc[row["product_id"]]

        # Jump straight to this product's record and read exactly its bytes.
        self.file.seek(location["offset"])
        record = bson.BSON.decode(self.file.read(location["length"]))

        # Select the requested picture, decode it and apply the transform.
        picture_bytes = record["imgs"][row["img_idx"]]["picture"]
        x = self.transform(Image.open(io.BytesIO(picture_bytes)))

        if self.mode != 'train':
            return x

        # Labels: the two coarser levels are looked up from the category index.
        category = int(row["category_idx"])
        labels = (int(idx2l1[category]), int(idx2l2[category]), category)
        target1, target2, target3 = (torch.LongTensor([label]) for label in labels)
        # Returned as separate tensors for the sake of pin_memory and async copies.
        return x, target1, target2, target3

In [8]:
# Per-channel normalisation constants.
# NOTE(review): these look like the usual ImageNet statistics quoted in the
# torchvision docs — confirm they are the values the checkpoint was trained with.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Training adds a random horizontal flip; validation only converts and normalises.
transform_train = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])
transform_val = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

Create a dataset for training and a dataset for validation (both read from the same `train.bson` file).

In [9]:
# Both datasets read from the same open BSON handle and the same offsets table.
# The validation dataset also uses mode='train' so that it yields labels.
train_gen = BSONIterator(
    bson_file=train_bson_file,
    images_df=train_images_df,
    offsets_df=train_offsets_df,
    transform=transform_train,
    mode='train',
)
val_gen = BSONIterator(
    bson_file=train_bson_file,
    images_df=val_images_df,
    offsets_df=train_offsets_df,
    transform=transform_val,
    mode='train',
)

In [10]:
# Sanity check: number of images in the training and validation datasets.
print(len(train_gen), len(val_gen))

12129141 242152


In [11]:
batch_size = 256

# Training batches are drawn in random order, validation batches in dataset
# order. Each loader uses a single worker process and pinned host memory.
loader_train = DataLoader(
    train_gen,
    batch_size=batch_size,
    sampler=sampler.RandomSampler(train_gen),
    num_workers=1,
    pin_memory=True,
)
loader_val = DataLoader(
    val_gen,
    batch_size=batch_size,
    sampler=sampler.SequentialSampler(val_gen),
    num_workers=1,
    pin_memory=True,
)

In [12]:
# Sanity check: number of batches per pass over each loader.
print(len(loader_train), len(loader_val))

47380 946


# Training

**ResNet50**

In [14]:
# Build an uninitialised torchvision ResNet-50; its weights are restored from a
# checkpoint file in a later cell.
model = torchvision.models.resnet50(pretrained=False)
# NOTE(review): kernel_size=6 presumably matches the final feature-map size for
# this dataset's image resolution — confirm.
model.avgpool = nn.AvgPool2d(kernel_size = 6)
# One output layer holding three label groups side by side. The training code
# slices it as [:49], [49:532] and [532:], i.e. 49 + 483 + 5270 outputs.
model.fc = nn.Linear(in_features=2048, out_features=49 + 483 + 5270)

In [15]:
trained_model = 'model_best.pth.tar'

def load_model(model, trained_model):
    """Load weights from a checkpoint file into ``model``.

    Args:
        model: module whose ``load_state_dict`` receives the stored weights.
        trained_model: path of a checkpoint written with ``torch.save``; the
            checkpoint must contain a ``'state_dict'`` entry.

    Returns:
        ``model`` itself. If the checkpoint file does not exist, a message is
        printed and the model is returned unchanged, so that
        ``model = load_model(model, path)`` never replaces the model with
        ``None``.
    """
    if not os.path.isfile(trained_model):
        # Report the path that was actually requested (the previous message
        # referenced an undefined name and raised NameError instead).
        print("=> no checkpoint found at '{}'".format(trained_model))
        return model

    print("=> loading checkpoint '{}'".format(trained_model))
    # torch.load unpickles the file: only load checkpoints from a trusted source.
    checkpoint = torch.load(trained_model)
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}'".format(trained_model))
    return model

In [16]:
# Restore the weights stored at `trained_model` into the ResNet-50 built above.
model = load_model(model, trained_model)

=> loading checkpoint 'model_best.pth.tar'
=> loaded checkpoint 'model_best.pth.tar'


In [17]:
# Freeze the stem and the first two residual stages: their parameters get
# requires_grad = False, and the optimizer later only receives parameters that
# still require gradients (the rest of the network).
for layer in [model.conv1, model.bn1, model.relu, model.maxpool, model.layer1, model.layer2]:
    for param in layer.parameters():
        param.requires_grad = False
# Move the model to the GPU; as the cell's last expression this also displays
# the module tree.
model.cuda()

ResNet (
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (relu): ReLU (inplace)
  (maxpool): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
  (layer1): Sequential (
    (0): Bottleneck (
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU (inplace)
      (downsample): Sequential (
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      )
    )
    (1): Bott

In [18]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and, when ``is_best``, copy it to 'model_best.pth.tar'.

    Args:
        state: object handed to ``torch.save``.
        is_best: whether the saved file should also replace the best-model copy.
        filename: destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    # Keep a separate copy of the best model so later checkpoints do not overwrite it.
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the most recent value of a metric together with its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, the mean, the weighted sum and the count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as having been observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def adjust_learning_rate(lr, optimizer, epoch, denominator = 2):
    """Apply a step decay: the rate is divided by 10 once every `denominator` epochs.

    Args:
        lr: initial learning rate.
        optimizer: object exposing ``param_groups``; every group's ``'lr'`` is overwritten.
        epoch: current epoch number.
        denominator: number of epochs between successive tenfold reductions.
    """
    decayed = lr * (0.1 ** (epoch // denominator))
    for group in optimizer.param_groups:
        group['lr'] = decayed

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    A sample counts as correct for a given k when its target index is among the
    k highest-scoring columns of its row. (Previously every requested k was
    scored as top-1.)

    Args:
        output: 2-D score tensor with one row per sample and one column per class.
        target: 1-D tensor of class indices, one per row of ``output``.
        topk: the k values to evaluate.

    Returns:
        A list with one one-element float tensor per entry of ``topk`` (in the
        same order), each holding the percentage of correct samples.
    """
    if not topk:
        return []
    batch_size = target.size(0)
    # Never request more candidates than there are classes.
    maxk = min(max(topk), output.size(1))
    _, pred = output.topk(maxk, 1, True, True)
    # Transpose so that row r holds every sample's (r + 1)-th best guess.
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # A sample is a hit if any of its first k guesses equals the target.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [19]:
def train(train_loader, model, criterion, optimizer, weights, epoch, print_freq = 500):
    """Train ``model`` for one pass over ``train_loader`` on a weighted sum of three losses.

    The model output is split column-wise into three heads, ``[:49]``,
    ``[49:532]`` and ``[532:]``, matched with ``target1``, ``target2`` and
    ``target3`` respectively. Requires a CUDA device, Python >= 3.7 and
    PyTorch >= 0.4.

    Args:
        train_loader: iterable yielding ``(img, target1, target2, target3)`` batches.
        model: network to optimise; switched to training mode.
        criterion: loss callable applied to each head separately.
        optimizer: optimizer stepped once per batch.
        weights: three numbers scaling the losses of the three heads. They also
            choose the reported accuracy: the third head if ``weights[2] > 0``,
            else the second if ``weights[1] > 0``, else the first.
        epoch: epoch number, used only in the progress output.
        print_freq: print a progress line every ``print_freq`` batches.

    Returns:
        ``(loss_log, acc_log)``: per-batch loss values and per-batch accuracies.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    loss_log = []
    acc_log = []

    # switch to train mode
    model.train()

    end = time.time()
    for i, (img, target1, target2, target3) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` is a reserved word since Python 3.7; the argument has been
        # called `non_blocking` since PyTorch 0.4.
        img = img.cuda(non_blocking=True)
        target1 = target1.view(-1).cuda(non_blocking=True)
        target2 = target2.view(-1).cuda(non_blocking=True)
        target3 = target3.view(-1).cuda(non_blocking=True)

        # compute output and the weighted loss over the three heads
        output = model(img)
        loss1 = criterion(output[:, :49], target1)
        loss2 = criterion(output[:, 49:532], target2)
        loss3 = criterion(output[:, 532:], target3)
        loss = loss1*weights[0] + loss2*weights[1] + loss3*weights[2]

        # measure accuracy on the head selected by `weights` (only top-1 is needed)
        scores = output.detach()
        if weights[2] > 0:
            prec1 = accuracy(scores[:, 532:], target3, topk=(1, ))[0]
        elif weights[1] > 0:
            prec1 = accuracy(scores[:, 49:532], target2, topk=(1, ))[0]
        else:
            prec1 = accuracy(scores[:, :49], target1, topk=(1, ))[0]

        # Store plain Python floats so the meters and logs hold no GPU tensors.
        losses.update(loss.item(), img.size(0))
        top1.update(prec1.item(), img.size(0))
        loss_log.append(losses.val)
        acc_log.append(top1.val)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1))
    return loss_log, acc_log

In [20]:
def validate(val_loader, model, weights, print_freq=50):
    """Measure top-1 accuracy of ``model`` over ``val_loader`` without computing gradients.

    The model output is split column-wise into three heads, ``[:49]``,
    ``[49:532]`` and ``[532:]``. Requires a CUDA device, Python >= 3.7 and
    PyTorch >= 0.4.

    Args:
        val_loader: iterable yielding ``(img, target1, target2, target3)`` batches.
        model: network to evaluate; switched to evaluation mode.
        weights: three numbers selecting the evaluated head: the third if
            ``weights[2] > 0``, else the second if ``weights[1] > 0``, else the first.
        print_freq: print a progress line every ``print_freq`` batches.

    Returns:
        The sample-weighted average top-1 accuracy (in percent) over all batches.
    """
    batch_time = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # torch.no_grad() replaces Variable(..., volatile=True), which no longer
    # disables autograd since PyTorch 0.4.
    with torch.no_grad():
        for i, (img, target1, target2, target3) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; the argument has been
            # called `non_blocking` since PyTorch 0.4.
            img = img.cuda(non_blocking=True)
            target1 = target1.view(-1).cuda(non_blocking=True)
            target2 = target2.view(-1).cuda(non_blocking=True)
            target3 = target3.view(-1).cuda(non_blocking=True)

            # compute output
            output = model(img)

            # measure accuracy on the head selected by `weights` (only top-1 is needed)
            if weights[2] > 0:
                prec1 = accuracy(output[:, 532:], target3, topk=(1, ))[0]
            elif weights[1] > 0:
                prec1 = accuracy(output[:, 49:532], target2, topk=(1, ))[0]
            else:
                prec1 = accuracy(output[:, :49], target1, topk=(1, ))[0]
            # Store a plain float so the returned average is not a GPU tensor.
            top1.update(prec1.item(), img.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time, top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg

**lr = 1e-4, 1 epoch, val acc: 67.8 --> 69.2**

In [21]:
# Fine-tuning driver: for every epoch, train in two phases (loss weights
# [1, 0, 0] then [0, 0, 1]), validate after each phase, then write a checkpoint.
if __name__ == '__main__':
    # Score to beat: the checkpoint is only copied to 'model_best.pth.tar' when
    # the validation Prec@1 exceeds this value.
    best_prec1 = 69.2
    criterion = nn.CrossEntropyLoss().cuda()
    lr = 1e-4
    # Only parameters that still require gradients are handed to the optimizer.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr = lr, momentum=0.9, 
                          weight_decay=0)
    # NOTE(review): `resume` is assigned but never read in this cell.
    resume = None
    start_epoch = 0
    epochs = 1
    arch = 'resnet50_levelID'

    for epoch in range(start_epoch, epochs):
        # Decays lr tenfold every 2 epochs; with epochs = 1 the rate stays at `lr`.
        adjust_learning_rate(lr=lr, optimizer=optimizer, epoch=epoch, denominator=2)

        # Each phase is one full pass over loader_train with the given loss weights.
        for weights in [[1, 0, 0], [0, 0, 1]]:
            loss_log, acc_log = train(train_loader=loader_train, model=model, criterion=criterion,
                                      weights = weights, optimizer=optimizer, epoch=epoch)

            # Validation runs inside the phase loop, i.e. once after every phase,
            # on the head selected by that phase's weights.
            prec1 = validate(val_loader=loader_val, model=model, weights=weights)

        # remember best prec@1 and save checkpoint
        # (prec1, loss_log and acc_log now hold the values of the last phase, [0, 0, 1])
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)

        # Disabled: plot the loss / accuracy logs and dump them to a text file.
#         fig = plt.figure(figsize = (6,3), dpi = 600)
#         loss_log = np.array(loss_log)
#         ax1 = plt.subplot(121)
#         ax1.plot(loss_log)
#         ax1.set_ylabel('Loss', weight = 'bold')
#         acc_log = np.array(acc_log)
#         ax2 = plt.subplot(111)
#         ax2.plot(acc_log)
#         ax2.set_ylabel('Train_accuracy', weight = 'bold')
#         np.savetxt(X=np.vstack((loss_log, acc_log)), fname='loss_acc_log.txt', fmt='%.3f')

Epoch: [0][0/47380]	Time 2.140 (2.140)	Data 1.250 (1.250)	Loss 0.7943 (0.7943)	Prec@1 80.859 (80.859)
Epoch: [0][500/47380]	Time 0.638 (0.641)	Data 0.000 (0.003)	Loss 0.9671 (0.8626)	Prec@1 78.906 (78.674)
Epoch: [0][1000/47380]	Time 0.626 (0.639)	Data 0.000 (0.002)	Loss 0.9516 (0.8638)	Prec@1 76.562 (78.491)
Epoch: [0][1500/47380]	Time 0.635 (0.638)	Data 0.000 (0.001)	Loss 0.9268 (0.8647)	Prec@1 76.953 (78.413)
Epoch: [0][2000/47380]	Time 0.636 (0.637)	Data 0.000 (0.001)	Loss 0.7738 (0.8624)	Prec@1 80.469 (78.443)
Epoch: [0][2500/47380]	Time 0.635 (0.637)	Data 0.000 (0.001)	Loss 0.9848 (0.8614)	Prec@1 78.516 (78.461)
Epoch: [0][3000/47380]	Time 0.635 (0.636)	Data 0.000 (0.001)	Loss 0.7753 (0.8598)	Prec@1 80.859 (78.481)
Epoch: [0][3500/47380]	Time 0.634 (0.636)	Data 0.000 (0.001)	Loss 0.8473 (0.8588)	Prec@1 77.734 (78.506)
Epoch: [0][4000/47380]	Time 0.631 (0.636)	Data 0.000 (0.001)	Loss 0.6653 (0.8578)	Prec@1 81.250 (78.526)
Epoch: [0][4500/47380]	Time 0.635 (0.636)	Data 0.000 (0.001

Epoch: [0][39000/47380]	Time 0.639 (0.636)	Data 0.000 (0.000)	Loss 0.8154 (0.8385)	Prec@1 80.078 (78.960)
Epoch: [0][39500/47380]	Time 0.635 (0.636)	Data 0.000 (0.000)	Loss 0.6936 (0.8384)	Prec@1 80.859 (78.964)
Epoch: [0][40000/47380]	Time 0.633 (0.636)	Data 0.000 (0.000)	Loss 0.9435 (0.8382)	Prec@1 77.344 (78.968)
Epoch: [0][40500/47380]	Time 0.634 (0.636)	Data 0.000 (0.000)	Loss 0.7899 (0.8381)	Prec@1 78.906 (78.971)
Epoch: [0][41000/47380]	Time 0.636 (0.636)	Data 0.000 (0.000)	Loss 0.8504 (0.8378)	Prec@1 79.688 (78.977)
Epoch: [0][41500/47380]	Time 0.634 (0.636)	Data 0.000 (0.000)	Loss 0.8588 (0.8376)	Prec@1 76.953 (78.982)
Epoch: [0][42000/47380]	Time 0.637 (0.636)	Data 0.000 (0.000)	Loss 0.9075 (0.8374)	Prec@1 77.344 (78.987)
Epoch: [0][42500/47380]	Time 0.634 (0.636)	Data 0.000 (0.000)	Loss 0.7817 (0.8373)	Prec@1 78.125 (78.990)
Epoch: [0][43000/47380]	Time 0.628 (0.636)	Data 0.000 (0.000)	Loss 0.8656 (0.8372)	Prec@1 78.516 (78.993)
Epoch: [0][43500/47380]	Time 0.637 (0.636)	Dat

Epoch: [0][25000/47380]	Time 0.638 (0.635)	Data 0.000 (0.000)	Loss 1.2965 (1.2679)	Prec@1 67.969 (70.471)
Epoch: [0][25500/47380]	Time 0.639 (0.635)	Data 0.000 (0.000)	Loss 1.2812 (1.2681)	Prec@1 65.625 (70.466)
Epoch: [0][26000/47380]	Time 0.638 (0.635)	Data 0.000 (0.000)	Loss 1.0543 (1.2679)	Prec@1 73.828 (70.472)
Epoch: [0][26500/47380]	Time 0.637 (0.635)	Data 0.000 (0.000)	Loss 1.3005 (1.2678)	Prec@1 70.703 (70.475)
Epoch: [0][27000/47380]	Time 0.640 (0.635)	Data 0.000 (0.000)	Loss 1.1923 (1.2677)	Prec@1 72.656 (70.476)
Epoch: [0][27500/47380]	Time 0.634 (0.635)	Data 0.000 (0.000)	Loss 1.2457 (1.2677)	Prec@1 70.312 (70.478)
Epoch: [0][28000/47380]	Time 0.634 (0.635)	Data 0.001 (0.000)	Loss 1.5469 (1.2678)	Prec@1 64.844 (70.477)
Epoch: [0][28500/47380]	Time 0.637 (0.635)	Data 0.000 (0.000)	Loss 1.2010 (1.2678)	Prec@1 73.047 (70.479)
Epoch: [0][29000/47380]	Time 0.637 (0.635)	Data 0.000 (0.000)	Loss 1.3981 (1.2677)	Prec@1 66.797 (70.481)
Epoch: [0][29500/47380]	Time 0.634 (0.635)	Dat

Epoch: [1][11000/47380]	Time 0.617 (0.635)	Data 0.000 (0.001)	Loss 0.7302 (0.7741)	Prec@1 80.469 (80.338)
Epoch: [1][11500/47380]	Time 0.635 (0.635)	Data 0.000 (0.001)	Loss 0.7805 (0.7742)	Prec@1 80.469 (80.335)
Epoch: [1][12000/47380]	Time 0.636 (0.635)	Data 0.000 (0.001)	Loss 0.9153 (0.7740)	Prec@1 76.953 (80.335)
Epoch: [1][12500/47380]	Time 0.634 (0.635)	Data 0.000 (0.001)	Loss 0.7407 (0.7736)	Prec@1 81.250 (80.346)
Epoch: [1][13000/47380]	Time 0.635 (0.635)	Data 0.000 (0.001)	Loss 0.8064 (0.7733)	Prec@1 80.469 (80.355)
Epoch: [1][13500/47380]	Time 0.635 (0.635)	Data 0.000 (0.001)	Loss 0.6673 (0.7731)	Prec@1 81.641 (80.363)
Epoch: [1][14000/47380]	Time 0.634 (0.635)	Data 0.000 (0.001)	Loss 0.7690 (0.7732)	Prec@1 81.250 (80.362)
Epoch: [1][14500/47380]	Time 0.636 (0.635)	Data 0.000 (0.001)	Loss 0.6587 (0.7732)	Prec@1 82.812 (80.362)
Epoch: [1][15000/47380]	Time 0.625 (0.635)	Data 0.000 (0.000)	Loss 0.7562 (0.7731)	Prec@1 79.688 (80.363)
Epoch: [1][15500/47380]	Time 0.637 (0.635)	Dat

Test: [400/946]	Time 0.302 (0.316)	Prec@1 88.281 (90.765)
Test: [450/946]	Time 0.299 (0.316)	Prec@1 93.359 (89.331)
Test: [500/946]	Time 0.297 (0.315)	Prec@1 84.766 (88.075)
Test: [550/946]	Time 0.327 (0.315)	Prec@1 40.625 (86.900)
Test: [600/946]	Time 0.318 (0.314)	Prec@1 59.766 (85.859)
Test: [650/946]	Time 0.306 (0.314)	Prec@1 62.500 (85.135)
Test: [700/946]	Time 0.322 (0.314)	Prec@1 46.875 (84.362)
Test: [750/946]	Time 0.312 (0.314)	Prec@1 90.234 (83.329)
Test: [800/946]	Time 0.309 (0.314)	Prec@1 77.344 (82.047)
Test: [850/946]	Time 0.301 (0.313)	Prec@1 48.438 (80.799)
Test: [900/946]	Time 0.317 (0.313)	Prec@1 51.172 (79.470)
 * Prec@1 78.068
Epoch: [1][0/47380]	Time 1.452 (1.452)	Data 1.087 (1.087)	Loss 1.2192 (1.2192)	Prec@1 72.656 (72.656)
Epoch: [1][500/47380]	Time 0.636 (0.637)	Data 0.000 (0.003)	Loss 1.0514 (1.2207)	Prec@1 73.438 (71.438)
Epoch: [1][1000/47380]	Time 0.638 (0.640)	Data 0.000 (0.002)	Loss 1.0834 (1.2201)	Prec@1 72.266 (71.329)
Epoch: [1][1500/47380]	Time 0.640 

Epoch: [1][36000/47380]	Time 0.633 (0.634)	Data 0.000 (0.000)	Loss 1.1655 (1.2097)	Prec@1 69.141 (71.518)
Epoch: [1][36500/47380]	Time 0.633 (0.634)	Data 0.000 (0.000)	Loss 1.1154 (1.2097)	Prec@1 73.047 (71.522)
Epoch: [1][37000/47380]	Time 0.633 (0.634)	Data 0.000 (0.000)	Loss 1.2786 (1.2097)	Prec@1 73.047 (71.521)
Epoch: [1][37500/47380]	Time 0.636 (0.634)	Data 0.000 (0.000)	Loss 1.0372 (1.2096)	Prec@1 76.953 (71.524)
Epoch: [1][38000/47380]	Time 0.622 (0.634)	Data 0.000 (0.000)	Loss 1.0300 (1.2097)	Prec@1 73.828 (71.524)
Epoch: [1][38500/47380]	Time 0.632 (0.634)	Data 0.000 (0.000)	Loss 1.0456 (1.2097)	Prec@1 73.438 (71.523)
Epoch: [1][39000/47380]	Time 0.635 (0.634)	Data 0.000 (0.000)	Loss 1.2099 (1.2097)	Prec@1 73.047 (71.524)
Epoch: [1][39500/47380]	Time 0.632 (0.634)	Data 0.000 (0.000)	Loss 1.3283 (1.2097)	Prec@1 71.094 (71.524)
Epoch: [1][40000/47380]	Time 0.636 (0.634)	Data 0.000 (0.000)	Loss 1.1106 (1.2097)	Prec@1 73.828 (71.522)
Epoch: [1][40500/47380]	Time 0.633 (0.634)	Dat

Epoch: [2][22000/47380]	Time 0.636 (0.634)	Data 0.000 (0.000)	Loss 0.5562 (0.6976)	Prec@1 84.766 (82.151)
Epoch: [2][22500/47380]	Time 0.635 (0.634)	Data 0.000 (0.000)	Loss 0.6717 (0.6973)	Prec@1 80.078 (82.156)
Epoch: [2][23000/47380]	Time 0.635 (0.634)	Data 0.000 (0.000)	Loss 0.6410 (0.6972)	Prec@1 82.812 (82.160)
Epoch: [2][23500/47380]	Time 0.639 (0.634)	Data 0.000 (0.000)	Loss 0.6614 (0.6969)	Prec@1 82.031 (82.166)
Epoch: [2][24000/47380]	Time 0.635 (0.634)	Data 0.000 (0.000)	Loss 0.7508 (0.6966)	Prec@1 80.469 (82.172)
Epoch: [2][24500/47380]	Time 0.638 (0.634)	Data 0.000 (0.000)	Loss 0.6243 (0.6964)	Prec@1 84.766 (82.176)
Epoch: [2][25000/47380]	Time 0.636 (0.634)	Data 0.002 (0.000)	Loss 0.6622 (0.6960)	Prec@1 83.594 (82.185)
Epoch: [2][25500/47380]	Time 0.633 (0.634)	Data 0.000 (0.000)	Loss 0.6827 (0.6958)	Prec@1 78.125 (82.190)
Epoch: [2][26000/47380]	Time 0.635 (0.634)	Data 0.000 (0.000)	Loss 0.6447 (0.6955)	Prec@1 85.156 (82.197)
Epoch: [2][26500/47380]	Time 0.639 (0.634)	Dat

Epoch: [2][8000/47380]	Time 0.630 (0.635)	Data 0.000 (0.001)	Loss 0.9943 (1.1047)	Prec@1 77.734 (73.702)
Epoch: [2][8500/47380]	Time 0.634 (0.635)	Data 0.000 (0.001)	Loss 1.0284 (1.1042)	Prec@1 75.781 (73.703)
Epoch: [2][9000/47380]	Time 0.636 (0.635)	Data 0.002 (0.001)	Loss 1.1009 (1.1042)	Prec@1 74.609 (73.697)
Epoch: [2][9500/47380]	Time 0.636 (0.635)	Data 0.000 (0.001)	Loss 1.0624 (1.1038)	Prec@1 73.828 (73.710)
Epoch: [2][10000/47380]	Time 0.636 (0.635)	Data 0.000 (0.001)	Loss 1.2782 (1.1032)	Prec@1 70.703 (73.724)
Epoch: [2][10500/47380]	Time 0.637 (0.635)	Data 0.000 (0.001)	Loss 1.0180 (1.1032)	Prec@1 77.734 (73.720)
Epoch: [2][11000/47380]	Time 0.635 (0.635)	Data 0.001 (0.001)	Loss 1.0372 (1.1027)	Prec@1 75.391 (73.724)
Epoch: [2][11500/47380]	Time 0.637 (0.635)	Data 0.000 (0.001)	Loss 0.8599 (1.1024)	Prec@1 77.734 (73.727)
Epoch: [2][12000/47380]	Time 0.632 (0.635)	Data 0.000 (0.001)	Loss 0.9289 (1.1022)	Prec@1 75.781 (73.727)
Epoch: [2][12500/47380]	Time 0.635 (0.635)	Data 0.

Epoch: [2][47000/47380]	Time 0.637 (0.635)	Data 0.000 (0.000)	Loss 1.2008 (1.0971)	Prec@1 69.141 (73.829)
Test: [0/946]	Time 1.165 (1.165)	Prec@1 79.297 (79.297)
Test: [50/946]	Time 0.297 (0.333)	Prec@1 87.500 (70.642)
Test: [100/946]	Time 0.299 (0.319)	Prec@1 83.594 (79.738)
Test: [150/946]	Time 0.315 (0.314)	Prec@1 66.016 (82.957)
Test: [200/946]	Time 0.309 (0.313)	Prec@1 73.828 (81.841)
Test: [250/946]	Time 0.335 (0.313)	Prec@1 61.328 (81.759)
Test: [300/946]	Time 0.326 (0.312)	Prec@1 95.312 (82.249)
Test: [350/946]	Time 0.303 (0.312)	Prec@1 97.266 (82.856)
Test: [400/946]	Time 0.298 (0.312)	Prec@1 79.688 (82.048)
Test: [450/946]	Time 0.299 (0.311)	Prec@1 89.453 (81.172)
Test: [500/946]	Time 0.301 (0.311)	Prec@1 81.250 (80.036)
Test: [550/946]	Time 0.313 (0.310)	Prec@1 39.062 (78.788)
Test: [600/946]	Time 0.316 (0.310)	Prec@1 47.266 (77.714)
Test: [650/946]	Time 0.306 (0.310)	Prec@1 58.203 (76.703)
Test: [700/946]	Time 0.315 (0.310)	Prec@1 47.656 (75.805)
Test: [750/946]	Time 0.300 

**lr = 1e-4, 1 epoch, val acc doesn't change, train acc: 73.8 --> 74.1**

In [22]:
# Second fine-tuning run with the same settings as the previous training cell:
# for every epoch, train in two phases (loss weights [1, 0, 0] then [0, 0, 1]),
# validate after each phase, then write a checkpoint.
if __name__ == '__main__':
    # Score to beat: the checkpoint is only copied to 'model_best.pth.tar' when
    # the validation Prec@1 exceeds this value.
    best_prec1 = 69.2
    criterion = nn.CrossEntropyLoss().cuda()
    lr = 1e-4
    # A fresh optimizer is created here, so no optimizer state carries over from
    # an earlier run. Only parameters that still require gradients are optimised.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr = lr, momentum=0.9, 
                          weight_decay=0)
    # NOTE(review): `resume` is assigned but never read in this cell.
    resume = None
    start_epoch = 0
    epochs = 1
    arch = 'resnet50_levelID'

    for epoch in range(start_epoch, epochs):
        # Decays lr tenfold every 2 epochs; with epochs = 1 the rate stays at `lr`.
        adjust_learning_rate(lr=lr, optimizer=optimizer, epoch=epoch, denominator=2)

        # Each phase is one full pass over loader_train with the given loss weights.
        for weights in [[1, 0, 0], [0, 0, 1]]:
            loss_log, acc_log = train(train_loader=loader_train, model=model, criterion=criterion,
                                      weights = weights, optimizer=optimizer, epoch=epoch)

            # Validation runs inside the phase loop, i.e. once after every phase,
            # on the head selected by that phase's weights.
            prec1 = validate(val_loader=loader_val, model=model, weights=weights)

        # remember best prec@1 and save checkpoint
        # (prec1 now holds the result of the last phase, [0, 0, 1])
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)

Epoch: [0][0/47380]	Time 1.563 (1.563)	Data 1.199 (1.199)	Loss 0.4758 (0.4758)	Prec@1 85.938 (85.938)
Epoch: [0][500/47380]	Time 0.641 (0.638)	Data 0.000 (0.003)	Loss 0.3613 (0.4039)	Prec@1 85.547 (88.607)
Epoch: [0][1000/47380]	Time 0.636 (0.637)	Data 0.000 (0.002)	Loss 0.4899 (0.3992)	Prec@1 89.062 (88.687)
Epoch: [0][1500/47380]	Time 0.637 (0.636)	Data 0.000 (0.001)	Loss 0.4479 (0.3965)	Prec@1 86.719 (88.716)
Epoch: [0][2000/47380]	Time 0.631 (0.636)	Data 0.000 (0.001)	Loss 0.4520 (0.3935)	Prec@1 87.500 (88.783)
Epoch: [0][2500/47380]	Time 0.634 (0.635)	Data 0.000 (0.001)	Loss 0.4249 (0.3918)	Prec@1 87.891 (88.839)
Epoch: [0][3000/47380]	Time 0.634 (0.635)	Data 0.001 (0.001)	Loss 0.3676 (0.3904)	Prec@1 89.062 (88.875)
Epoch: [0][3500/47380]	Time 0.636 (0.635)	Data 0.000 (0.001)	Loss 0.3578 (0.3895)	Prec@1 87.500 (88.890)
Epoch: [0][4000/47380]	Time 0.633 (0.635)	Data 0.001 (0.001)	Loss 0.4391 (0.3887)	Prec@1 86.328 (88.915)
Epoch: [0][4500/47380]	Time 0.636 (0.635)	Data 0.000 (0.001

Epoch: [0][39000/47380]	Time 0.634 (0.635)	Data 0.000 (0.000)	Loss 0.3404 (0.3679)	Prec@1 91.016 (89.412)
Epoch: [0][39500/47380]	Time 0.635 (0.635)	Data 0.000 (0.000)	Loss 0.2864 (0.3678)	Prec@1 91.406 (89.416)
Epoch: [0][40000/47380]	Time 0.629 (0.635)	Data 0.000 (0.000)	Loss 0.4599 (0.3677)	Prec@1 89.062 (89.419)
Epoch: [0][40500/47380]	Time 0.633 (0.635)	Data 0.000 (0.000)	Loss 0.4197 (0.3676)	Prec@1 89.453 (89.421)
Epoch: [0][41000/47380]	Time 0.637 (0.635)	Data 0.000 (0.000)	Loss 0.3657 (0.3674)	Prec@1 88.672 (89.425)
Epoch: [0][41500/47380]	Time 0.633 (0.635)	Data 0.000 (0.000)	Loss 0.3563 (0.3674)	Prec@1 87.109 (89.427)
Epoch: [0][42000/47380]	Time 0.631 (0.635)	Data 0.000 (0.000)	Loss 0.3329 (0.3673)	Prec@1 90.625 (89.429)
Epoch: [0][42500/47380]	Time 0.635 (0.635)	Data 0.000 (0.000)	Loss 0.4238 (0.3671)	Prec@1 87.891 (89.435)
Epoch: [0][43000/47380]	Time 0.635 (0.635)	Data 0.001 (0.000)	Loss 0.4575 (0.3670)	Prec@1 87.109 (89.437)
Epoch: [0][43500/47380]	Time 0.637 (0.635)	Dat

Epoch: [0][25000/47380]	Time 0.631 (0.635)	Data 0.000 (0.000)	Loss 1.1523 (1.0842)	Prec@1 69.141 (74.082)
Epoch: [0][25500/47380]	Time 0.636 (0.635)	Data 0.000 (0.000)	Loss 1.0609 (1.0841)	Prec@1 76.562 (74.084)
Epoch: [0][26000/47380]	Time 0.635 (0.635)	Data 0.001 (0.000)	Loss 0.9122 (1.0840)	Prec@1 78.125 (74.084)
Epoch: [0][26500/47380]	Time 0.628 (0.635)	Data 0.000 (0.000)	Loss 0.9456 (1.0839)	Prec@1 77.734 (74.086)
Epoch: [0][27000/47380]	Time 0.630 (0.635)	Data 0.000 (0.000)	Loss 1.1516 (1.0838)	Prec@1 71.484 (74.088)
Epoch: [0][27500/47380]	Time 0.634 (0.635)	Data 0.001 (0.000)	Loss 1.0322 (1.0836)	Prec@1 74.609 (74.092)
Epoch: [0][28000/47380]	Time 0.635 (0.635)	Data 0.000 (0.000)	Loss 1.1727 (1.0835)	Prec@1 74.219 (74.094)
Epoch: [0][28500/47380]	Time 0.632 (0.635)	Data 0.000 (0.000)	Loss 1.0945 (1.0833)	Prec@1 73.438 (74.099)
Epoch: [0][29000/47380]	Time 0.634 (0.635)	Data 0.000 (0.000)	Loss 1.0875 (1.0834)	Prec@1 76.562 (74.099)
Epoch: [0][29500/47380]	Time 0.636 (0.635)	Dat