This notebook contains a PyTorch `Dataset` class called `BSONIterator` that can read directly from the BSON data. You can use it in combination with `torchvision.transforms` for doing data augmentation and with `DataLoader` for batching.

In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from PIL import Image
import time
import shutil

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


# Part 2: The generator

First load the lookup tables from the CSV files (you don't need to do this if you just did all the steps from part 1).

In [None]:
# Load the three lookup tables; each CSV's first column becomes the index.
train_offsets_df, train_images_df, val_images_df = [
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train_offsets.csv", "train_images.csv", "val_images.csv")
]

The dataset is implemented by the `BSONIterator` class. It decodes one image at a time (and, in training mode, its integer category index) directly from the BSON file; batching is done by a PyTorch `DataLoader`, which can use multiple workers.

**Note:** For fastest results, put the train.bson and test.bson files on a fast drive (SSD).

See also the code in: https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py

In [None]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import torchvision
import torchvision.transforms as T
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from torch.utils.data import Dataset

In [None]:
class BSONIterator(Dataset):
    """Map-style dataset that decodes single images straight from a BSON file.

    Every sample corresponds to one row of ``images_df``. The row's
    ``product_id`` selects a row of ``offsets_df`` whose ``offset`` and
    ``length`` columns locate the product's BSON document inside
    ``bson_file``; the row's ``img_idx`` selects one picture of that document.

    Args:
        bson_file: Binary file object opened on the BSON data.
        images_df: DataFrame with ``product_id`` and ``img_idx`` columns, plus
            ``category_idx`` when ``train`` is true.
        offsets_df: DataFrame indexed by product id with ``offset`` and
            ``length`` columns.
        transform: Callable applied to the decoded PIL image.
        lock: Lock (context manager) shared by every dataset that reads
            ``bson_file``; may be ``None``. It guards the seek/read fallback.
        train: When true ``__getitem__`` returns ``(x, y)``, otherwise ``x``.
    """

    def __init__(self, bson_file, images_df, offsets_df, transform, lock, train=True):
        super(BSONIterator, self).__init__()
        self.file = bson_file
        self.images_df = images_df
        self.offsets_df = offsets_df
        self.transform = transform
        self.lock = lock
        self.train = train

    def _positional_fd(self):
        """Return a descriptor usable with ``os.pread``, or ``None``.

        Only plain on-disk files qualify (a ``FileIO``, or a buffered wrapper
        whose ``raw`` is one); for any other file object the descriptor may
        not address the same bytes that ``read()`` would return.
        """
        if not hasattr(os, "pread"):
            return None
        raw = getattr(self.file, "raw", self.file)
        if isinstance(raw, io.FileIO):
            return raw.fileno()
        return None

    def _read_record(self, offset, length):
        """Return up to ``length`` bytes of the file starting at ``offset``.

        A separate ``seek`` followed by ``read`` on a handle that several
        DataLoader workers share can be interleaved with another reader's
        ``seek``. Positional reads never touch the shared file position, so
        they are used whenever possible; otherwise the seek/read pair is done
        while holding ``self.lock``.
        """
        # DataFrame cells are numpy integers; normalise to plain ints.
        offset, length = int(offset), int(length)

        fd = self._positional_fd()
        if fd is not None:
            chunks = []
            while length > 0:
                chunk = os.pread(fd, length, offset)
                if not chunk:  # reached end of file
                    break
                chunks.append(chunk)
                offset += len(chunk)
                length -= len(chunk)
            return b"".join(chunks)

        if self.lock is None:
            self.file.seek(offset)
            return self.file.read(length)
        with self.lock:
            self.file.seek(offset)
            return self.file.read(length)

    def __getitem__(self, idx):
        """Return the transformed image at position ``idx`` (and its label).

        Returns:
            ``(x, y)`` when ``self.train`` is true, where ``y`` is the row's
            ``category_idx``; otherwise only ``x``.
        """
        image_row = self.images_df.iloc[idx]
        offset_row = self.offsets_df.loc[image_row["product_id"]]

        # Random access this product's data from the BSON file.
        item_data = self._read_record(offset_row["offset"], offset_row["length"])

        # Grab the requested picture from the product document.
        item = bson.BSON.decode(item_data)
        bson_img = item["imgs"][image_row["img_idx"]]["picture"]

        # Decode the image bytes and apply the transform pipeline.
        img = Image.open(io.BytesIO(bson_img))
        x = self.transform(img)
        if self.train:
            return x, image_row["category_idx"]
        return x

    def __len__(self):
        """Return the number of images, i.e. the number of rows in ``images_df``."""
        return len(self.images_df)

In [None]:
# Directory containing the BSON input file.
data_dir = "./input/"
# file_dir = r'C:\Users\YANG\Downloads\cdiscount'
train_bson_path = os.path.join(data_dir, "train_example.bson")
# Opened in binary mode and deliberately left open: this handle is passed to
# the BSONIterator datasets created further down.
train_bson_file = open(train_bson_path, "rb")

Because the training and validation generators read from the same BSON file, they need to use the same lock to protect it.

In [None]:
import threading
# A single lock object, passed to both datasets below.
# NOTE: threading.Lock only serialises threads inside one process.
lock = threading.Lock()

In [None]:
# mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
# transform_train = T.Compose([T.CenterCrop(size=160), T.RandomHorizontalFlip(), T.ToTensor(),
#                              T.Normalize(mean=mean, std=std)])
# transform_val = T.Compose([T.CenterCrop(size=160), T.ToTensor(),T.Normalize(mean=mean, std=std)])

In [None]:
# Per-channel statistics handed to T.Normalize in both pipelines.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Training pipeline: random horizontal flip, tensor conversion, normalisation.
transform_train = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

# Validation pipeline: the same steps without the random flip.
transform_val = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

Create a generator for training and a generator for validation.

In [None]:
# Both datasets read the same BSON file and offsets table and share one lock;
# they differ only in the image table and the transform pipeline.
train_gen = BSONIterator(
    bson_file=train_bson_file,
    images_df=train_images_df,
    offsets_df=train_offsets_df,
    transform=transform_train,
    lock=lock,
    train=True,
)
val_gen = BSONIterator(
    bson_file=train_bson_file,
    images_df=val_images_df,
    offsets_df=train_offsets_df,
    transform=transform_val,
    lock=lock,
    train=True,
)

In [None]:
# Smoke test: fetch the first training sample (transformed image and label).
bx, by = train_gen[0]

In [None]:
# Number of images in the training and validation datasets.
print(len(train_gen), len(val_gen))

In [None]:
batch_size = 102
# Batch the datasets with 4 worker processes each.
# NOTE(review): loader_train iterates in DataFrame order (no shuffle) -
# confirm that train_images_df is already shuffled.
loader_train = DataLoader(train_gen, batch_size=batch_size, num_workers=4, pin_memory = False)
# The validation loader is referenced by later cells and by main(), so it has
# to exist; validation accuracy does not depend on sample order, so no
# sampler is needed.
loader_val = DataLoader(val_gen, batch_size=batch_size, num_workers=4, pin_memory=False)

In [None]:
# Show the loader object as the cell output.
loader_train


In [None]:
# Number of batches in each loader; requires both loader_train and loader_val
# to be defined.
print(len(loader_train), len(loader_val))

## How fast is the generator? Create a single batch:

In [None]:
# Iterator over the training loader, reused by the following cells.
itr = iter(loader_train)

In [None]:
# Pull one batch before the timed cell, so the timing below measures a later batch.
bx, by = next(itr)

In [None]:
# Wall-clock time needed to obtain one more batch from the existing iterator.
start = time.time()
bx, by = next(itr)
end = time.time()

# Report the batch shapes first, then the elapsed seconds.
batch_shapes = (bx.size(), by.size())
print(*batch_shapes)
print(end - start)

# Part 3: Training

Create a ResNet18 model and train it, to test that the generators work.

**ResNet18**

In [None]:
# model = torchvision.models.resnet18(pretrained=True)
# model.avgpool = nn.AvgPool2d(kernel_size = 5)
# model.fc = nn.Linear(in_features=512, out_features=5270) #or 49 + 483 + 5270
# for layer in [model.conv1, model.bn1, model.relu, model.maxpool, model.layer1, model.layer2]:
#     for param in layer.parameters():
#         param.requires_grad = False

In [None]:
# model = torchvision.models.resnet18(pretrained=True)
# model.fc = nn.Linear(in_features=512, out_features=5270) #or 49 + 483 + 5270
# for layer in [model.conv1, model.bn1, model.relu, model.maxpool, model.layer1, model.layer2]:
#     for param in layer.parameters():
#         param.requires_grad = False

In [None]:
# Pretrained ResNet18 with its pooling layer and classifier head replaced.
model = torchvision.models.resnet18(pretrained=True)
model.avgpool = nn.AvgPool2d(kernel_size=6)
model.fc = nn.Linear(in_features=512, out_features=5270)  # or 49 + 483 + 5270

# Freeze the stem and the first two residual stages.
frozen_layers = [
    model.conv1,
    model.bn1,
    model.relu,
    model.maxpool,
    model.layer1,
    model.layer2,
]
for layer in frozen_layers:
    for param in layer.parameters():
        param.requires_grad = False

**ResNet50**

In [None]:
# model = torchvision.models.resnet50(pretrained=True)
# model.avgpool = nn.AvgPool2d(kernel_size = 5)
# model.fc = nn.Linear(in_features=2048, out_features=5270) #or 49 + 483 + 5270

In [None]:
# class ResnetAffine(nn.Module):
#     def __init__(self, resnet, in_features, out_features):
#         super(ResnetAffine, self).__init__()
#         self.resnet = resnet
#         self.relu = nn.ReLU(inplace=True)
#         self.affine = nn.Linear(in_features=in_features, out_features=out_features)
        
#     def forward(self, x):
#         x = self.resnet(x)
#         x = self.relu(x)
#         x = self.affine(x)
#         return x

In [None]:
# resnet = torchvision.models.resnet18(pretrained=True)
# resnet.avgpool = nn.AvgPool2d(kernel_size = 5)
# for param in resnet.parameters():
#     param.requires_grad = False
# for param in resnet.fc.parameters():
#     param.requires_grad = True
# model = ResnetAffine(resnet, 1000, 5270)
# model.affine.bias.data.zero_()
# init.kaiming_uniform(model.affine.weight.data)
# model.cuda()

In [None]:
# Re-initialise the new classifier head: Kaiming-normal weights, zero bias.
init.kaiming_normal(model.fc.weight.data)
model.fc.bias.data.zero_()
# Move the model to the GPU; as the last expression it is also the cell output.
model.cuda()


In [None]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` and keep a copy of the best one.

    Args:
        state: Object passed to ``torch.save``.
        is_best: When true, the saved file is also copied to
            ``model_best.pth.tar`` in the current working directory.
        filename: Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keep the most recent reading of a metric and its running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and sample count to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the newest reading, weighted by ``n`` samples.

        ``avg`` becomes the weighted sum of all readings divided by the total
        sample count.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(lr, optimizer, epoch, denominator = 2):
    """Apply a step decay to every parameter group of ``optimizer``.

    The rate written is ``lr`` multiplied by 0.1 once for each full block of
    ``denominator`` epochs contained in ``epoch``.

    Args:
        lr: Initial (undecayed) learning rate.
        optimizer: Object exposing ``param_groups``, a sequence of dicts.
        epoch: Current epoch number.
        denominator: Number of epochs between successive decays.
    """
    decay_steps = epoch // denominator
    decayed_lr = lr * (0.1 ** decay_steps)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr

def accuracy(output, target, topk=(1,)):
    """Compute the precision@k, in percent, for each ``k`` in ``topk``.

    A sample counts as correct for a given ``k`` when its target class is
    among the ``k`` highest-scoring classes of ``output``.

    Args:
        output: Score tensor indexed as ``(sample, class)``.
        target: Tensor of class indices, one per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list with one 1-element float tensor per entry of ``topk``, in the
        same order.
    """
    if not topk:
        return []

    batch_size = target.size(0)
    # Never ask for more candidates than there are classes.
    maxk = min(max(topk), output.size(1))

    # pred[i, j] is the class with the (j + 1)-th highest score for sample i.
    _, pred = output.topk(maxk, 1, True, True)
    correct = pred.eq(target.view(-1, 1).expand_as(pred))

    res = []
    for k in topk:
        # Count the samples whose target appears among their first k candidates.
        correct_k = correct[:, :k].float().sum(1).sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

# def accuracy(output, target, topk=(1,)):
#     """Computes the precision@k for the specified values of k"""
#     maxk = max(topk)
#     batch_size = target.size(0)

#     _, pred = output.topk(maxk, 1, True, True)
#     pred = pred.t()
#     correct = pred.eq(target.view(1, -1).expand_as(pred))

#     res = []
#     for k in topk:
#         correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
#         res.append(correct_k.mul_(100.0 / batch_size))
#     return res

In [None]:
def train(train_loader, model, criterion, optimizer, epoch, print_freq=50):
    """Train ``model`` for one pass over ``train_loader`` on the GPU.

    Args:
        train_loader: Iterable yielding ``(img, target)`` batches.
        model: Network to optimise; switched to training mode.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress messages.
        print_freq: Print progress every ``print_freq`` batches; a falsy
            value disables printing.

    Returns:
        ``(loss_log, acc_log)``: per-batch losses as detached 1-element CPU
        tensors, and per-batch top-1 accuracies in percent as floats.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    loss_log = []
    acc_log = []

    # switch to train mode
    model.train()

    end = time.time()
    for i, (img, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` is a reserved word since Python 3.7; the keyword argument
        # is spelled `non_blocking` in PyTorch >= 0.4.
        target = target.cuda(non_blocking=True)
        img = img.cuda(non_blocking=True)

        # compute output
        output = model(img)
        loss = criterion(output, target)

        # Log a detached copy so the autograd graph of every batch is not
        # kept alive for the whole epoch.
        loss_log.append(loss.detach().cpu().view(1))

        # measure accuracy and record loss (only top-1 is needed)
        prec1 = accuracy(output.detach(), target, topk=(1,))[0]
        losses.update(loss.item(), img.size(0))
        top1.update(float(prec1[0]), img.size(0))
        acc_log.append(top1.val)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if print_freq and i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1))
    return loss_log, acc_log

In [None]:
def validate(val_loader, model, print_freq=50):
    """Measure the top-1 accuracy of ``model`` over ``val_loader`` on the GPU.

    Args:
        val_loader: Iterable yielding ``(img, target)`` batches.
        model: Network to evaluate; switched to evaluation mode.
        print_freq: Print progress every ``print_freq`` batches; a falsy
            value disables the per-batch messages.

    Returns:
        The sample-weighted average top-1 accuracy in percent.
    """
    batch_time = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # `volatile=True` no longer disables autograd in PyTorch >= 0.4;
    # torch.no_grad() is its replacement.
    with torch.no_grad():
        for i, (img, target) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; the keyword
            # argument is spelled `non_blocking` in PyTorch >= 0.4.
            target = target.cuda(non_blocking=True)
            img = img.cuda(non_blocking=True)

            # compute output
            output = model(img)

            # measure accuracy (only top-1 is needed)
            prec1 = accuracy(output, target, topk=(1,))[0]
            top1.update(float(prec1[0]), img.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if print_freq and i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time, top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg

In [None]:
def _to_float(entry):
    """Return a logged scalar (number or 1-element tensor) as a Python float."""
    try:
        return float(entry)
    except (TypeError, ValueError):
        # Fallback for 1-element tensor-like objects that do not support
        # float() directly: read their single element.
        return float(entry.data.view(-1)[0])


def main():
    """Train and validate the module-level ``model``, checkpointing each epoch.

    Uses the module-level ``model``, ``loader_train`` and ``loader_val``.
    After every epoch it saves a checkpoint, plots the per-batch training
    loss and accuracy, and writes both series to ``loss_acc_log.txt``.
    """
    base_lr = 1e-2
    # NOTE(review): the best-accuracy baseline starts at 46.5 rather than 0,
    # so a checkpoint is only marked "best" above that - confirm intended.
    best_prec1 = 46.5
    criterion = nn.CrossEntropyLoss().cuda()
    # Only the parameters that were not frozen are handed to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable_params, lr=base_lr, momentum=0.9,
                          weight_decay=0)
    resume = None
    start_epoch = 0
    epochs = 1
    arch = 'resnet18'

    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            checkpoint = torch.load(resume)
            start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

    for epoch in range(start_epoch, epochs):
        # Decay from the same base rate the optimizer was created with.
        adjust_learning_rate(lr=base_lr, optimizer=optimizer, epoch=epoch, denominator=2)

        # train for one epoch
        loss_log, acc_log = train(train_loader=loader_train, model=model, criterion=criterion,
                                  optimizer=optimizer, epoch=epoch)

        # evaluate on validation set
        prec1 = validate(val_loader=loader_val, model=model)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)

        # plot loss and acc
        plt.figure(figsize = (6,3), dpi = 1200)
        # Accept plain numbers as well as 0-dim or 1-element tensors.
        loss_log = np.array([_to_float(entry) for entry in loss_log])
        ax1 = plt.subplot(121)
        ax1.plot(loss_log)
        ax1.set_ylabel('Loss', weight = 'bold')
        acc_log = np.array([_to_float(entry) for entry in acc_log])
        ax2 = plt.subplot(122)
        ax2.plot(acc_log)
        ax2.set_ylabel('Train_accuracy', weight = 'bold')
        np.savetxt(X=np.vstack((loss_log, acc_log)), fname='loss_acc_log.txt', fmt='%.3f')

In [None]:
# Run the training / validation / checkpoint loop defined above.
main()

# Part 4: Test set predictions

Note: The previous version of this kernel used `BSONIterator` to load the test set images in batches. However, storing the prediction results takes up a huge amount of memory. 

I suggest using a different kind of generator instead, something like the following:

```
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator

submission_df = pd.read_csv(data_dir + "sample_submission.csv")
submission_df.head()

test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
data = bson.decode_file_iter(open(test_bson_path, "rb"))

with tqdm(total=num_test_products) as pbar:
    for c, d in enumerate(data):
        product_id = d["_id"]
        num_imgs = len(d["imgs"])

        batch_x = np.zeros((num_imgs, 180, 180, 3), dtype=K.floatx())

        for i in range(num_imgs):
            bson_img = d["imgs"][i]["picture"]

            # Load and preprocess the image.
            img = load_img(io.BytesIO(bson_img), target_size=(180, 180))
            x = img_to_array(img)
            x = test_datagen.random_transform(x)
            x = test_datagen.standardize(x)

            # Add the image to the batch.
            batch_x[i] = x

        prediction = model.predict(batch_x, batch_size=num_imgs)
        avg_pred = prediction.mean(axis=0)
        cat_idx = np.argmax(avg_pred)
        
        submission_df.iloc[c]["category_id"] = idx2cat[cat_idx]        
        pbar.update()
        
submission_df.to_csv("my_submission.csv.gz", compression="gzip", index=False)        
```