In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from skimage.data import imread
from skimage import transform

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
# Directory holding the small metadata files (e.g. category_names.csv).
data_dir = "./input/"
# Directory holding the BSON dumps. The default is the original author's local
# path; set the CDISCOUNT_DIR environment variable to point somewhere else
# without having to edit the notebook.
file_dir = os.environ.get("CDISCOUNT_DIR", r'C:\Users\YANG\Downloads\cdiscount')
train_bson_path = os.path.join(file_dir, "train.bson")
# Expected number of products in train.bson; passed to read_bson below as the
# progress-bar total.
num_train_products = 7069896

# Part 1: Create lookup tables

The generator uses several lookup tables that describe the layout of the BSON file, which products and images are part of the training/validation sets, and so on.

You only need to generate these tables once, as they get saved to CSV files. If you already have these CSV files, skip to part 2.

## Lookup table for categories

In [3]:
# Load the category table, keyed by category_id.
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path, index_col="category_id")

# Number the categories 0, 1, 2, ... in file order; this consecutive integer
# is the category_idx that the category_id gets mapped to.
categories_df["category_idx"] = np.arange(len(categories_df), dtype=np.int64)

In [4]:
# Preview the first rows, including the newly added category_idx column.
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


Create dictionaries for quick lookup in both directions: `category_id` → `category_idx` and `category_idx` → `category_id`.

In [5]:
# Total number of categories, then the number of distinct names at each level.
print(len(categories_df))
for level_column in ("category_level1", "category_level2", "category_level3"):
    print(len(categories_df[level_column].unique()))

5270
49
483
5263


In [6]:
# Forward map: category_id -> category_idx.
cat2idx = {
    cat_id: cat_idx
    for cat_id, cat_idx in zip(categories_df.index, categories_df['category_idx'])
}
# Reverse map: category_idx -> category_id.
idx2cat = {
    cat_idx: cat_id
    for cat_idx, cat_id in zip(categories_df['category_idx'], categories_df.index)
}

In [None]:
# Sanity check of both maps: category 1000012755 sits at index 4 in the table
# preview above, so the two lookups should mirror each other.
cat2idx[1000012755], idx2cat[4]

## Read the BSON files

We store the offsets and lengths of all items, allowing us random access to the items later.

Inspired by code from: https://www.kaggle.com/vfdev5/random-item-access

Note: this takes a few minutes to execute, but we only have to do it once (we'll save the table to a CSV file afterwards).

In [7]:
def read_bson(bson_path, num_records, with_categories):
    """Scan a BSON dump once and record the byte range of every product.

    The file is read as a sequence of back-to-back BSON documents. Each
    document starts with a 4-byte little-endian int32 holding the document's
    total length in bytes (the 4 prefix bytes included).

    Parameters
    ----------
    bson_path : str
        Path of the BSON file to scan.
    num_records : int
        Expected number of documents; only used as the progress-bar total.
    with_categories : bool
        If True, also record each document's ``category_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``product_id`` (sorted ascending) with the columns
        ``num_imgs``, ``offset``, ``length`` and, if requested,
        ``category_id``. An empty file yields an empty frame that still has
        these columns.

    Raises
    ------
    ValueError
        If the file ends in the middle of a length prefix or of a document,
        or if a length prefix is smaller than the prefix itself.
    """
    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns.append("category_id")

    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        while True:
            # Read the 4-byte length prefix of the next document.
            length_prefix = f.read(4)
            if not length_prefix:
                break  # clean end of file
            if len(length_prefix) < 4:
                raise ValueError(
                    "%s: truncated length prefix at offset %d" % (bson_path, offset))

            length = struct.unpack("<i", length_prefix)[0]
            if length < 4:
                raise ValueError(
                    "%s: invalid document length %d at offset %d"
                    % (bson_path, length, offset))

            # The file position is already right behind the prefix, so the
            # remainder of the document can be read without seeking.
            item_data = length_prefix + f.read(length - 4)
            if len(item_data) != length:
                raise ValueError(
                    "%s: truncated document at offset %d (expected %d bytes, got %d)"
                    % (bson_path, offset, length, len(item_data)))

            item = bson.BSON(item_data).decode()

            row = [len(item["imgs"]), offset, length]
            if with_categories:
                row.append(item["category_id"])
            # Keyed by product id: a repeated id keeps only its last record.
            rows[item["_id"]] = row

            offset += length
            pbar.update()

    # Passing the column names up front keeps this working for an empty file.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.index.name = "product_id"
    return df.sort_index()

In [8]:
# Index the whole training file in a single pass; %time reports how long it takes.
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|███████████████████████████████████████████████████████| 7069896/7069896 [03:52<00:00, 30432.44it/s]


Wall time: 3min 57s


In [9]:
# One row per product: its image count, byte offset and byte length in the BSON file.
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [10]:
# Save the offsets table (product_id is written as the index column) so the
# slow scan of the BSON file does not have to be repeated.
train_offsets_df.to_csv("train_offsets.csv")

In [11]:
# Number of products, i.e. rows in the offsets table.
train_offsets_df.shape[0]

7069896

In [12]:
# Number of distinct category ids that occur among the products.
len(pd.unique(train_offsets_df["category_id"]))

5270

In [13]:
# How many images in total? (Sum of the per-product image counts.)
train_offsets_df["num_imgs"].sum()

12371293

## Create a random train/validation split

We split on products, not on individual images. Since some of the categories only have a few products, we do the split separately for each category.

This creates two new tables, one for the training images and one for the validation images. There is a row for every single image, so if a product has more than one image it occurs more than once in the table.

In [14]:
def make_val_set(df, split_percentage=0.2, drop_percentage=0., seed=None):
    """Split products into per-image training and validation tables.

    The split is done on products, separately within every category, so all
    images of a product end up on the same side.

    Relies on the notebook-level ``cat2idx`` dictionary to translate each
    ``category_id`` into its ``category_idx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Offsets table indexed by product id, with at least the columns
        ``num_imgs`` and ``category_id``.
    split_percentage : float
        Fraction of the kept products of each category that goes to the
        validation set (rounded down per category).
    drop_percentage : float
        Fraction of the products of each category that is randomly discarded
        before splitting.
    seed : int or None
        If given, a dedicated ``numpy.random.RandomState(seed)`` drives all
        random choices so the split is reproducible. If None, the global
        NumPy random state is used.

    Returns
    -------
    (train_df, val_df) : tuple of pandas.DataFrame
        One row per image with the columns ``product_id``, ``category_idx``
        and ``img_idx``.
    """
    # Without a seed, fall back to the global NumPy RNG so existing calls
    # behave exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Group the product ids by category (key: category_id, value: [product_id]).
    category_dict = defaultdict(list)
    for product_id, category_id in tqdm(zip(df.index, df["category_id"]), total=len(df)):
        category_dict[category_id].append(product_id)

    num_imgs = df["num_imgs"]

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]

            # Randomly remove products to make the dataset smaller.
            keep_size = int(len(product_ids) * (1. - drop_percentage))
            if keep_size < len(product_ids):
                product_ids = rng.choice(product_ids, keep_size, replace=False)

            # Randomly choose the products that become part of the validation
            # set. A set makes the membership test below constant-time
            # instead of a scan over the whole selection.
            val_size = int(len(product_ids) * split_percentage)
            if val_size > 0:
                val_ids = set(rng.choice(product_ids, val_size, replace=False).tolist())
            else:
                val_ids = set()

            # Create a new row for each image. Which table a product belongs
            # to is decided once per product, not once per image.
            for product_id in product_ids:
                target = val_list if product_id in val_ids else train_list
                for img_idx in range(num_imgs.loc[product_id]):
                    target.append([product_id, category_idx, img_idx])
                pbar.update()

    columns = ["product_id", "category_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df

Create an 80/20 split. Also drop 90% of all products to make the dataset more manageable. (Note: if `drop_percentage` > 0, the progress bar doesn't go all the way.)

In [16]:
# Per category: randomly drop 90% of the products, then hold out 20% of the
# remaining products for validation.
train_images_df, val_images_df = make_val_set(train_offsets_df, split_percentage=0.2, 
                                              drop_percentage=0.9)

7069896it [00:07, 981556.37it/s] 
 10%|█████▌                                                  | 704102/7069896 [00:17<02:40, 39612.35it/s]


In [17]:
# One row per training image: product id, category index, and the image's
# position within its product.
train_images_df.head()

Unnamed: 0,product_id,category_idx,img_idx
0,22330284,5055,0
1,11679,5055,0
2,16772703,5055,0
3,7791056,5055,0
4,10202676,5055,0


In [18]:
# Last rows of the training table.
train_images_df.tail()

Unnamed: 0,product_id,category_idx,img_idx
989823,7526708,871,1
989824,12947214,4690,0
989825,9582241,410,0
989826,20513149,5228,0
989827,18734768,5101,0


In [19]:
# Validation table; a product with several images appears once per image.
val_images_df.head()

Unnamed: 0,product_id,category_idx,img_idx
0,7661257,5055,0
1,7661257,5055,1
2,7661257,5055,2
3,8565368,5055,0
4,13513120,5055,0


In [20]:
# Report the size of each table and their combined size.
n_train_images = len(train_images_df)
n_val_images = len(val_images_df)
print("Number of training images:", n_train_images)
print("Number of validation images:", n_val_images)
print("Total images:", n_train_images + n_val_images)

Number of training images: 989828
Number of validation images: 241932
Total images: 1231760


Are all categories represented in the train/val split? (Note: if the drop percentage is high, then very small categories won't have enough products left to make it into the validation set.)

In [21]:
# Number of distinct categories present in (training, validation).
tuple(len(images_df["category_idx"].unique()) for images_df in (train_images_df, val_images_df))

(5270, 4268)

Quickly verify that the split really is approximately 80-20:

In [22]:
# Validation-to-training image ratio for a single category; an exact 80/20
# split corresponds to 20 / 80 = 0.25.
category_idx = 5055
num_train = (train_images_df["category_idx"] == category_idx).sum()
num_val = (val_images_df["category_idx"] == category_idx).sum()
num_val / num_train

0.25521821631878555

Close enough. ;-) (An exact 80/20 split would give a validation/training ratio of 0.25.) Remember that we split on products, but not all products have the same number of images, which is where the slight discrepancy comes from. (Also, there tend to be fewer validation images if `drop_percentage` > 0.)

Save the lookup tables as CSV so that we don't need to repeat the above procedure again.

In [23]:
# Write both per-image tables to CSV files in the working directory.
for images_df, csv_name in ((train_images_df, "train_images.csv"),
                            (val_images_df, "val_images.csv")):
    images_df.to_csv(csv_name)