In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from PIL import Image
import time
import shutil

# %matplotlib inline
import matplotlib.pyplot as plt
from multiprocessing import Pool
import time

In [2]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import torchvision
import torchvision.transforms as T
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

PyTorch's DataLoader is not safe to use with several workers here: every worker would perform random-access reads on the same BSON file on disk, and errors occur when a new read request arrives before the previous read has finished. To keep using the DataLoader, instead of putting indices into the queue and letting every worker read the BSON file simultaneously, I put the already read and decoded BSON images into the queue, so that each worker only performs the transform. Filling the queue takes more time this way, but transforming the images takes less time, and with a single worker the transform is the slower of the two steps.

In [3]:
class Dataset(object):
    """Transform-only dataset used together with ``DataLoader``.

    The instance keeps references to the open BSON file and to the image and
    offset tables, but ``__getitem__`` never reads from them: it is handed an
    already extracted ``(encoded_image_bytes, label)`` pair and only decodes
    and transforms the image.

    Args:
        bson_file: Open BSON file object (stored as ``self.file``).
        images_df: Table with one row per image; its length is the dataset
            length.
        offsets_df: Table used to locate a product inside the BSON file.
        transformer: Callable applied to the decoded PIL image.
        train: When true, ``__getitem__`` returns ``(x, y)``; otherwise only
            ``x``.
    """

    def __init__(self, bson_file, images_df, offsets_df, transformer, train=True):
        self.train = train
        self.transformer = transformer
        self.offsets_df = offsets_df
        self.images_df = images_df
        self.file = bson_file

    def __len__(self):
        """Return the number of rows in ``images_df``."""
        return len(self.images_df)

    def __getitem__(self, data):
        """Decode and transform one pre-fetched sample.

        Despite the indexing syntax, ``data`` is not an index: it is an
        ``(encoded_image_bytes, label)`` pair. The indexing form is kept only
        so the object can be used where ``DataLoader`` expects a dataset.
        """
        encoded, label = data
        transformed = self.transformer(Image.open(io.BytesIO(encoded)))
        return (transformed, label) if self.train else transformed

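**Note the modification in the batch sampler: it reads and decodes the BSON records itself and yields the samples (image bytes and label) instead of indices.**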

In [4]:
class MyBatchSampler(object):
    """Wraps another sampler and yields mini-batches of pre-fetched samples.

    Unlike ``torch.utils.data.sampler.BatchSampler``, which yields lists of
    indices, this sampler resolves every index itself: it seeks into the BSON
    file held by the sampler's data source, decodes the product record and
    yields lists of ``(picture, category_idx)`` pairs. All BSON file access
    therefore happens in the code that iterates this sampler, one read at a
    time.

    Args:
        sampler (Sampler): Base sampler. It must expose a ``data_source``
            attribute providing ``file``, ``images_df`` and ``offsets_df``.
        batch_size (int): Size of mini-batch. Must be positive.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Example:
        With a base sampler producing the indices ``0..9`` and
        ``batch_size=3``, the samples are grouped as
        ``[[s0, s1, s2], [s3, s4, s5], [s6, s7, s8], [s9]]`` when
        ``drop_last=False`` and as
        ``[[s0, s1, s2], [s3, s4, s5], [s6, s7, s8]]`` when
        ``drop_last=True``, where ``si`` is ``self[i]``.
    """

    def __init__(self, sampler, batch_size, drop_last):
        # A zero batch size would make __len__ divide by zero and would make
        # __iter__ collect the whole epoch into a single batch, so reject it
        # (and negative sizes) up front.
        if batch_size <= 0:
            raise ValueError(
                "batch_size should be a positive value, "
                "but got batch_size={}".format(batch_size)
            )
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.data_source = sampler.data_source

    def __getitem__(self, idx):
        """Read sample ``idx`` from the BSON file.

        The bytes in the BSON file are decoded here rather than in the
        Dataset class.

        Returns:
            tuple: ``(picture, category_idx)`` where ``picture`` is the
            ``"picture"`` entry of the selected image of the product.
        """
        image_row = self.data_source.images_df.iloc[idx]
        product_id = image_row["product_id"]
        offset_row = self.data_source.offsets_df.loc[product_id]
        # Random access this product's data from the BSON file.
        self.data_source.file.seek(offset_row["offset"])
        item_data = self.data_source.file.read(offset_row["length"])
        item = bson.BSON.decode(item_data)
        img_idx = image_row["img_idx"]

        return item["imgs"][img_idx]["picture"], image_row["category_idx"]

    def __iter__(self):
        """Yield lists of samples, ``batch_size`` at a time."""
        batch_data = []
        for idx in self.sampler:
            batch_data.append(self[idx])
            if len(batch_data) == self.batch_size:
                yield batch_data
                batch_data = []
        # Emit the trailing, shorter batch unless the caller asked to drop it.
        if batch_data and not self.drop_last:
            yield batch_data

    def __len__(self):
        """Return the number of batches produced per pass over the sampler."""
        if self.drop_last:
            return len(self.sampler) // self.batch_size
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size


In [7]:
# Lookup tables; the first CSV column becomes the index of each frame.
train_offsets_df = pd.read_csv("train_offsets.csv", index_col=0)
train_images_df, val_images_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train_images_withlevel.csv", "val_images_withlevel.csv")
)

# Open the training BSON archive. The handle is deliberately left open: it is
# passed to the dataset and read from while batches are being sampled.
data_dir = "./input/"
train_bson_path = os.path.join(data_dir, "train_example.bson")
train_bson_file = open(train_bson_path, "rb")

In [8]:
# Per-channel normalisation constants (these values match the widely used
# ImageNet channel statistics).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Training pipeline: random horizontal flip, then tensor conversion and
# normalisation.
transformer_train = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

# Validation pipeline: same as training but without the random flip.
transformer_val = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=mean, std=std),
])

In [9]:
# Training dataset: holds the BSON handle and lookup tables, applies the
# training transform.
dataset_train = Dataset(
    train_bson_file, train_images_df, train_offsets_df, transformer_train, train=True
)

batch_size = 96

# Random order over the training set; the custom batch sampler reads the BSON
# records itself and keeps the final, possibly shorter, batch.
batch_sampler = MyBatchSampler(
    sampler=sampler.RandomSampler(dataset_train),
    batch_size=batch_size,
    drop_last=False,
)

In [10]:
# Four worker processes apply the transform to the samples that the custom
# batch sampler has already read from the BSON file.
loader_train = DataLoader(
    dataset=dataset_train,
    batch_sampler=batch_sampler,
    num_workers=4,
    pin_memory=True,
)

In [15]:
# Create an iterator over the training loader so single batches can be pulled
# with next().
itr = iter(loader_train)

In [16]:
# Time fetching one batch (images bx, labels by) from the loader.
# NOTE(review): the recorded wall time (41.2 µs) looks too small for loading a
# 96-image batch -- re-run on a fresh kernel to confirm the measurement.
%time bx, by = next(itr)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 41.2 µs


In [13]:
# Sanity check: show the shapes of the image batch and the label batch.
print(bx.size(), by.size())

torch.Size([96, 3, 180, 180]) torch.Size([96])
