This notebook contains a PyTorch `Dataset` class called `BSONIterator` that can read directly from the BSON data. You can use it in combination with `torchvision.transforms` for doing data augmentation. (The approach is adapted from a Keras generator; see the link in Part 2.)

In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from PIL import Image
import time
import shutil

# %matplotlib inline
import matplotlib.pyplot as plt
from multiprocessing import Pool
import time

In [2]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import torchvision
import torchvision.transforms as T
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from torch.utils.data import Dataset

# Part 2: The generator

First load the lookup tables from the CSV files (you don't need to do this if you just did all the steps from part 1).

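The generator is implemented by the `BSONIterator` class, a PyTorch `Dataset`. Each item it returns is one image (as raw encoded bytes) and its integer category index, read directly from the BSON file. Decoding and transforming the images is done separately by `TensorGenerator`, which can be run with multiple workers.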

**Note:** For fastest results, put the train.bson and test.bson files on a fast drive (SSD).

See also the code in: https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py

In [3]:
class BSONIterator(Dataset):
    """Map-style dataset that pulls single product images out of a BSON file.

    Every index refers to one row of ``images_df``. The row's ``product_id``
    is looked up in ``offsets_df`` to find where that product's document sits
    in the open BSON file; the document is decoded and the requested picture
    is returned as stored, together with the row's ``category_idx``.
    """

    def __init__(self, bson_file, images_df, offsets_df, transform, train=True):
        """Keep references to the file handle, the lookup tables and the options.

        Args:
            bson_file: open file object supporting ``seek`` and ``read``.
            images_df: table with ``product_id``, ``img_idx`` and
                ``category_idx`` columns, one row per image.
            offsets_df: table indexed by product id with ``offset`` and
                ``length`` columns.
            transform: stored only; no method of this class applies it.
            train: stored only; no method of this class reads it.
        """
        super().__init__()
        self.train = train
        self.transform = transform
        self.offsets_df = offsets_df
        self.images_df = images_df
        self.file = bson_file

    def __len__(self):
        """Return the number of images, i.e. the number of rows in ``images_df``."""
        return len(self.images_df)

    def __getitem__(self, idx):
        """Return ``(picture, category_idx)`` for the image at position ``idx``."""
        row = self.images_df.iloc[idx]
        location = self.offsets_df.loc[row["product_id"]]
        # Jump to the product's document and read exactly its bytes.
        self.file.seek(location["offset"])
        raw_document = self.file.read(location["length"])
        product = bson.BSON.decode(raw_document)
        picture = product["imgs"][row["img_idx"]]["picture"]
        return picture, row["category_idx"]

In [4]:
class TensorGenerator():
    """Callable that opens encoded image bytes and applies a transform to the image."""

    def __init__(self, transform):
        """Remember the callable that is applied to each opened image."""
        self.transform = transform

    def __call__(self, data):
        """Open ``data`` with PIL and return ``self.transform`` applied to the image."""
        # Wrap the bytes in a file-like buffer so PIL can open them.
        buffer = io.BytesIO(data)
        return self.transform(Image.open(buffer))
        

In [5]:
# Locations of the input data.
data_dir = "./input/"
file_dir = r'C:\Users\YANG\Downloads\cdiscount'

# Lookup tables, each indexed by the first column of its CSV file.
train_offsets_df, train_images_df, val_images_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train_offsets.csv", "train_images.csv", "val_images.csv")
)

# The handle stays open: it is handed to BSONIterator, which seeks and reads from it.
train_bson_path = os.path.join(data_dir, "train_example.bson")
train_bson_file = open(train_bson_path, "rb")

In [129]:
# Per-channel normalisation statistics.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Training pipeline: random horizontal flip, then tensor conversion and normalisation.
transform_train = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])
# Validation pipeline: the same conversion and normalisation without the flip.
transform_val = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

In [7]:
# Positions of the samples that make up one benchmark batch.
indexes = [*range(96)]
# Dataset over the training images, backed by the open BSON file.
train_gen = BSONIterator(
    bson_file=train_bson_file,
    images_df=train_images_df,
    offsets_df=train_offsets_df,
    transform=transform_train,
)
# Callable that opens raw image bytes and applies the training transform.
generator = TensorGenerator(transform_train)

In [126]:
# Fetch the first sample: the stored picture bytes and its category index.
img, label = train_gen[0]

In [127]:
# Wrap the picture bytes in an in-memory, file-like buffer.
image = io.BytesIO(img)

In [128]:
# Open the buffer as a PIL image; `image` is rebound from the buffer to the image.
image = Image.open(image)

In [130]:
class Transformer():
    """NumPy implementation of flip, scale to [0, 1] and per-channel normalisation.

    Args:
        mean: three per-channel means.
        std: three per-channel standard deviations.
        train: when true, every image is mirrored left-to-right before
            conversion. The flip is unconditional, not random.
    """

    def __init__(self, mean, std, train=True):
        # Shaped (3, 1, 1) so the statistics broadcast over a (3, H, W) image.
        self.mean = np.array(mean).reshape(3, 1, 1)
        # Fixed: this was previously built from `mean`, so normalisation
        # divided by the mean instead of the standard deviation.
        self.std = np.array(std).reshape(3, 1, 1)
        self.train = train

    def __call__(self, image):
        """Return ``image`` as a normalised float array in channel-first layout.

        Args:
            image: a PIL image, or any (H, W, 3) array-like when ``train`` is
                false (the flip relies on PIL's ``transpose``).

        Returns:
            Array of shape (3, H, W) holding ``(pixel / 255 - mean) / std``.
        """
        if self.train:
            img = image.transpose(Image.FLIP_LEFT_RIGHT)  # mirror horizontally
        else:
            img = image
        # asarray copies only when required; `np.array(..., copy=False)` raises
        # under NumPy 2 whenever a copy cannot be avoided.
        img = np.asarray(img, dtype=np.uint8)
        img = img.transpose(2, 0, 1)  # (H, W, C) -> (C, H, W)
        img = img / 255  # scale to [0, 1]
        return (img - self.mean) / self.std

In [109]:
# NumPy-based transformer in training mode, using the same mean/std as the torchvision pipelines.
transformer_train = Transformer(mean, std, train=True)

In [167]:
# Time 1000 applications of the NumPy transformer to the same image.
%time imgs = [transformer_train(image) for i in range(1000)]

CPU times: user 472 ms, sys: 96 ms, total: 568 ms
Wall time: 578 ms


In [170]:
# Time merging the per-image arrays into one batch array; the 180x180 image size is hard-coded here.
%time imgs_np = np.vstack(imgs).reshape(-1, 3, 180, 180)

CPU times: user 292 ms, sys: 40 ms, total: 332 ms
Wall time: 336 ms


In [171]:
# Check the shape of the batched array.
imgs_np.shape

(1000, 3, 180, 180)

In [172]:
# Time wrapping the NumPy batch as a torch tensor.
%time imgs_tc = torch.from_numpy(imgs_np)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 202 µs


In [173]:
# Time 100 applications of the torchvision training pipeline to the same image.
%time imgs2 = [transform_train(image) for i in range(100)]

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 38 ms


In [174]:
# Time stacking the per-image tensors into a single batch tensor.
%time imgs2_tc = torch.stack(imgs2)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 12.4 ms


In [177]:
# Time copying the batch tensor to the GPU (requires CUDA).
%time imgs2_tc_cuda = imgs2_tc.cuda()

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.22 ms


In [None]:
# Number of worker processes used for parallel image decoding.
Ncore = 4
# Worker pool used by the benchmark cell below; it is never closed in this notebook.
pool = Pool(processes=Ncore)

In [None]:
if __name__ == '__main__':
    # Benchmark: average time to read and transform one batch of len(indexes) images.
    n = 10  # number of timed repetitions
    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not affected by system clock adjustments the way time.time() is.
    start = time.perf_counter()
    for i in range(n):
        datasets, labels = [], []
        for idx in indexes:
            data, label = train_gen[idx]
            datasets.append(data)
            labels.append(label)
        # Open and transform the whole batch in parallel across the worker pool.
        res = pool.map(generator, datasets)
    end = time.perf_counter()
    # NOTE: the pool is deliberately left open so this cell can be re-run;
    # call pool.close() and pool.join() once benchmarking is finished.
    print((end - start)/n)

In [None]:
# Number of transformed samples returned by the last pool.map call.
len(res)

In [None]:
# Time stacking the pool results into one batch tensor; this rebinds `imgs` from the earlier list.
%time imgs = torch.stack(res, dim=0)

In [None]:
# Time copying the image batch to the GPU (requires CUDA).
%time imgs_cuda = imgs.cuda()

In [None]:
# Time converting the labels of the last batch to a tensor and copying it to the GPU.
%time img_labels = torch.from_numpy(np.array(labels)).cuda()