In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from skimage.data import imread
from skimage import transform

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
# Locations and size of the (example) training data.
data_dir = "./input/"
train_bson_path = os.path.join(data_dir, "train_example.bson")
num_train_products = 82

# Load the per-product offsets table and the category lookup table.
train_offsets_df = pd.read_csv('train_offsets.csv', index_col='product_id')
categories_df = pd.read_csv('categories_name_to_id.csv', index_col='category_id')

# Rebuild the lookup dicts; each zip is turned into its dict right away.
# category_id --> category_idx
id_idx = zip(categories_df.index, categories_df['category_idx'])
cat2idx = dict(id_idx)

# category_idx --> category_id
idx_id = zip(categories_df['category_idx'], categories_df.index)
idx2cat = dict(idx_id)

# category_id --> level-1 index
id_level1 = zip(categories_df.index, categories_df['category_level1'])
cat2l1 = dict(id_level1)

# category_id --> level-2 index
id_level2 = zip(categories_df.index, categories_df['category_level2'])
cat2l2 = dict(id_level2)

In [3]:
# Preview the offsets table: one row per product_id with num_imgs, offset, length and category_id.
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [4]:
# Preview the category table: category_id mapped to its level-1/2/3 indices and category_idx.
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,0,0,0,0
1000012764,1,1,1,1
1000012776,1,1,2,2
1000012768,1,1,3,3
1000012755,1,2,4,4


## Create a random train/validation split

We split on products, not on individual images. Since some of the categories only have a few products, we do the split separately for each category. 

**This ensures that the sampled (1/10) dataset resembles the original dataset, and that the validation dataset resembles the training dataset.**

This creates two new tables, one for the training images and one for the validation images. There is a row for every single image, so if a product has more than one image it occurs more than once in the table.

In [5]:
def make_val_set_levelidx(df, val_percentage=0.2, sample_percentage=1., seed=None):
    """Build per-image training and validation tables, split category by category.

    Products are grouped by category, optionally subsampled, and a fraction of
    each category's remaining products is assigned to the validation set. The
    split is made on products, so every image of a product ends up on the same
    side. Each product then contributes one output row per image.

    Relies on the notebook-level dicts ``cat2idx``, ``cat2l1`` and ``cat2l2``,
    which map a category_id to its category, level-1 and level-2 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by product id; must contain the columns ``"category_id"`` and
        ``"num_imgs"``.
    val_percentage : float
        Fraction of each category's kept products that goes to validation.
        The count is rounded down, so small categories may contribute nothing.
    sample_percentage : float
        Fraction of each category's products to keep (rounded down). Values
        of 1. or more keep every product.
    seed : int or None
        If given, a private ``RandomState`` seeded with this value drives all
        random choices, making the split reproducible. If None, NumPy's global
        random state is used.

    Returns
    -------
    (train_df, val_df) : tuple of pandas.DataFrame
        Both have the columns ``product_id``, ``category_idx``, ``level2_idx``,
        ``level1_idx`` and ``img_idx``.
    """
    # np.random and RandomState both expose .choice, so the rest of the code
    # does not care which one is in use.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Put products under their categories. The column is addressed by name so
    # the grouping does not depend on the column order of `df`.
    category_dict = defaultdict(list)  # key: category_id, value: [product_ids]
    for product_id, category_id in tqdm(zip(df.index, df["category_id"]), total=len(df)):
        category_dict[category_id].append(product_id)

    # One bulk lookup table instead of a df.loc call per product.
    num_imgs = df["num_imgs"].to_dict()

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]
            level1_idx = cat2l1[category_id]
            level2_idx = cat2l2[category_id]
            num_in_category = len(product_ids)

            # Randomly remove products to make the dataset smaller.
            keep_size = int(num_in_category * sample_percentage)
            if keep_size < num_in_category:
                product_ids = rng.choice(product_ids, keep_size, replace=False)

            # Randomly choose the products that become part of the validation set.
            # A set keeps the membership test below constant-time.
            val_size = int(len(product_ids) * val_percentage)
            if val_size > 0:
                val_ids = set(rng.choice(product_ids, val_size, replace=False).tolist())
            else:
                val_ids = set()

            # Create a new row for each image.
            for product_id in product_ids:
                row = [product_id, category_idx, level2_idx, level1_idx]
                target = val_list if product_id in val_ids else train_list
                target.extend(row + [img_idx] for img_idx in range(num_imgs[product_id]))
                pbar.update()

            # Account for dropped products so the bar still reaches its total.
            pbar.update(num_in_category - len(product_ids))

    columns = ["product_id", "category_idx", "level2_idx", "level1_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df

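### Create an 80/20 split. Setting `sample_percentage=0.1` would also drop 90% of all products, which makes prototype design and hyperparameter searching much faster; the call below uses `sample_percentage=1.` and therefore keeps every product.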

In [6]:
# Hold out 20% of each category's products; sample_percentage=1. keeps every product.
train_images_df, val_images_df = make_val_set_levelidx(
    train_offsets_df,
    val_percentage=0.2,
    sample_percentage=1.,
)

82it [00:00, 348110.25it/s]
100%|██████████| 82/82 [00:00<00:00, 65003.39it/s]


In [7]:
# First rows of the training table (one row per image).
train_images_df.head()

Unnamed: 0,product_id,category_idx,level2_idx,level1_idx,img_idx
0,0,5055,455,45,0
1,1,5055,455,45,0
2,5,5055,455,45,0
3,11,5055,455,45,0
4,16,5055,455,45,0


In [8]:
# Last rows of the training table (one row per image).
train_images_df.tail()

Unnamed: 0,product_id,category_idx,level2_idx,level1_idx,img_idx
98,97,5079,455,45,1
99,84,5047,455,45,0
100,87,1285,78,10,0
101,99,3257,250,27,0
102,101,2522,191,22,0


In [9]:
# First rows of the validation table (one row per image).
val_images_df.head()

Unnamed: 0,product_id,category_idx,level2_idx,level1_idx,img_idx
0,30,5055,455,45,0
1,44,5055,455,45,0
2,47,5055,455,45,0
3,60,5055,455,45,0
4,63,5055,455,45,0


In [10]:
# Report how many image rows ended up on each side of the split.
num_train_images = len(train_images_df)
num_val_images = len(val_images_df)
print("Number of training images:", num_train_images)
print("Number of validation images:", num_val_images)
print("Total images:", num_train_images + num_val_images)

Number of training images: 103
Number of validation images: 7
Total images: 110


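Are all categories represented in the train/val split? (Note: the validation size of each category is rounded down, so with `val_percentage=0.2` a category needs at least 5 kept products to contribute anything to the validation set. If many products are dropped, i.e. `sample_percentage` is low, then very small categories won't have enough products left to make it into the validation set.)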

In [11]:
# Number of distinct category indices in the training and the validation table.
tuple(images["category_idx"].nunique(dropna=False)
      for images in (train_images_df, val_images_df))

(36, 1)

Quickly verify that the split really is approximately 80-20:

In [12]:
# Compare the image counts of one category on both sides of the split.
# Note: a perfect 80/20 split corresponds to a val/train ratio of 0.25.
category_idx = 5055
num_train = (train_images_df["category_idx"] == category_idx).sum()
num_val = (val_images_df["category_idx"] == category_idx).sum()
num_val / num_train

0.21875

Close enough. ;-) (A perfect 80/20 split would give a val/train ratio of 0.25.) Remember that we split on products but not all products have the same number of images, which is where the slight discrepancy comes from. (Also, there tend to be fewer validation images if `sample_percentage` < 1, i.e. when products are dropped.)

Save the lookup tables as CSV so that we don't need to repeat the above procedure again.

In [13]:
# Write both lookup tables to disk so the split does not have to be recomputed.
for images_df, csv_path in ((train_images_df, "train_images_withlevel.csv"),
                            (val_images_df, "val_images_withlevel.csv")):
    images_df.to_csv(csv_path)