In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from skimage.data import imread
from skimage import transform

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
# Number of products in the example file; read_bson only uses it as the
# progress-bar total.
num_train_products = 82

# Directory holding the input data, and the training BSON file inside it.
data_dir = "./input/"
train_bson_path = os.path.join(data_dir, "train_example.bson")

## Read the BSON files

We record the byte offset and length of every item in the file, which gives us random access to individual items later.

Inspired by code from: https://www.kaggle.com/vfdev5/random-item-access

Note: on the full dataset this takes a few minutes to execute (the small example file used here is scanned almost instantly), but we only have to do it once, because we save the table to a CSV file afterwards.

In [3]:
def read_bson(bson_path, num_records, with_categories):
    """Build an offset table for every document stored in a BSON file.

    The file is walked sequentially. For each document the number of images,
    the byte offset from the start of the file and the byte length are
    recorded, so that a single document can later be fetched with
    ``seek(offset)`` followed by ``read(length)``.

    Parameters
    ----------
    bson_path : str
        Path of the BSON file to scan.
    num_records : int
        Expected number of documents; only used as the progress-bar total.
    with_categories : bool
        When true (train/validation data), each document's ``category_id``
        is recorded as well.

    Returns
    -------
    pandas.DataFrame
        One row per document ``_id`` (index name ``product_id``) with the
        columns ``num_imgs``, ``offset``, ``length`` and, if requested,
        ``category_id``. An empty file yields an empty frame that still
        carries these columns. If an ``_id`` occurs more than once, the last
        occurrence overwrites the earlier ones.

    Raises
    ------
    struct.error
        If the file ends in the middle of a 4-byte length prefix.
    AssertionError
        If the file ends before a document's announced length is reached.
    """
    columns = ["num_imgs", "offset", "length"]
    if with_categories:  # train/val mode
        columns.append("category_id")

    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0  # byte position of the current document in the file
        while True:
            # The first 4 bytes of a document hold its size as a
            # little-endian int32. The size is treated as covering the whole
            # document, these 4 bytes included.
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                break  # clean end of file

            length = struct.unpack("<i", item_length_bytes)[0]

            # Rewind to the start of the document and read it in full.
            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length

            item = bson.BSON.decode(item_data)

            row = [len(item["imgs"]), offset, length]
            if with_categories:
                row.append(item["category_id"])
            rows[item["_id"]] = row

            offset += length  # start of the next document
            pbar.update()

    # Passing the column labels to the constructor (instead of assigning them
    # afterwards) keeps an empty file from failing with a length mismatch.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.index.name = "product_id"
    return df

In [4]:
# Scan the training BSON file once (timed) and keep the resulting table, which
# maps each product_id to its image count, byte offset, byte length and category_id.
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|██████████| 82/82 [00:00<00:00, 118230.64it/s]

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.05 ms





In [5]:
# Peek at the first rows of the offsets table.
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [6]:
# Peek at the last rows of the offsets table; in the output below the product
# ids are not contiguous.
train_offsets_df.tail()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
95,1,580552,2649,1000010653
97,2,583201,10568,1000010683
98,4,593769,25169,1000010667
99,1,618938,8664,1000014053
101,1,627602,3548,1000004085


In [7]:
# Save the offsets table to train_offsets.csv so the BSON scan does not have to
# be repeated.
train_offsets_df.to_csv("train_offsets.csv")

In [8]:
# Number of products = number of rows in the offsets table.
train_offsets_df.shape[0]

82

In [9]:
# Number of distinct category ids that occur in the table.
len(pd.unique(train_offsets_df["category_id"]))

36

In [10]:
# Total image count: sum of the per-product image counts.
train_offsets_df.loc[:, "num_imgs"].sum()

110