In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from skimage.data import imread
from skimage import transform

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
# Directory with the small CSV inputs (category_names.csv is read from here).
data_dir = "./input/"
# Directory with the large BSON dump. Set the CDISCOUNT_DIR environment
# variable to point somewhere else; the fallback is the original local path.
file_dir = os.environ.get("CDISCOUNT_DIR", r'C:\Users\YANG\Downloads\cdiscount')
train_bson_path = os.path.join(file_dir, "train.bson")
# Hard-coded product count for the training set (not computed in this notebook).
num_train_products = 7069896

In [6]:
# Load the per-product offsets table from the working directory; the first
# CSV column is used as the row index.
train_offsets_df = pd.read_csv(
    "train_offsets.csv",
    index_col=0,
)

  mask |= (ar1 == a)


In [7]:
# Peek at the first five rows of the offsets table.
train_offsets_df.head(n=5)

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


## Lookup table for categories

In [8]:
# Read the category table, keyed by the raw category_id.
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(
    categories_path,
    index_col="category_id",
)

# Attach a dense integer label (0, 1, 2, ...) to every category_id, in row order.
categories_df["category_idx"] = pd.Series(
    range(categories_df.shape[0]),
    index=categories_df.index,
)

In [9]:
# Forward lookup: raw category_id -> dense category_idx.
id_idx = zip(categories_df.index, categories_df['category_idx'])
cat2idx = dict(id_idx)

# Reverse lookup: dense category_idx -> raw category_id.
idx_id = zip(categories_df['category_idx'], categories_df.index)
idx2cat = dict(idx_id)

## Create a random train/validation split

We split on products, not on individual images. Since some of the categories only have a few products, we do the split separately for each category.

This creates two new tables, one for the training images and one for the validation images. There is a row for every single image, so if a product has more than one image it occurs more than once in the table.

In [10]:
def make_val_set(df, split_percentage=0.2, drop_percentage=0., seed=None):
    """Split products into per-image training and validation tables.

    Products are grouped by category and the split is drawn separately inside
    each category, so all images of one product land on the same side.

    Args:
        df: DataFrame indexed by product id, with a "num_imgs" column (number
            of images of the product) and a "category_id" column. Every
            category_id must be a key of the module-level ``cat2idx`` mapping.
        split_percentage: Fraction of each category's kept products that is
            moved to the validation set. The count is truncated to an integer,
            so a small category may contribute no validation products.
        drop_percentage: Approximate fraction of each category's products that
            is discarded before the split is made.
        seed: Optional seed for a private ``np.random.RandomState``. With the
            default ``None`` the draws come from NumPy's global RNG, which is
            what this function did before the parameter existed.

    Returns:
        A ``(train_df, val_df)`` pair of DataFrames with the columns
        "product_id", "category_idx" and "img_idx", one row per image.

    Raises:
        ValueError: If the index of ``df`` contains duplicate product ids.
        KeyError: If a category_id is not present in ``cat2idx``.
    """
    if not df.index.is_unique:
        raise ValueError("make_val_set expects a unique product_id index")

    # seed=None keeps drawing from the global NumPy RNG (legacy behaviour).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One pass over the table: collect the product ids of every category and
    # remember how many images each product has. Columns are read by name so
    # the result does not depend on the column order of `df`.
    category_dict = defaultdict(list)  # key: category_id, value: [product_id]
    num_imgs_by_product = {}
    rows = zip(df.index, df["num_imgs"], df["category_id"])
    for product_id, num_imgs, category_id in tqdm(rows, total=len(df)):
        category_dict[category_id].append(product_id)
        num_imgs_by_product[product_id] = num_imgs

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]

            # Randomly remove products to make the dataset smaller.
            keep_size = int(len(product_ids) * (1. - drop_percentage))
            if keep_size < len(product_ids):
                product_ids = rng.choice(product_ids, keep_size, replace=False).tolist()

            # Randomly choose the products that become part of the validation
            # set. They are kept in a set so the membership test below is a
            # hash lookup instead of a linear scan of an array.
            val_size = int(len(product_ids) * split_percentage)
            if val_size > 0:
                val_ids = set(rng.choice(product_ids, val_size, replace=False).tolist())
            else:
                val_ids = set()

            # Create a new row for each image. The train/val decision is made
            # once per product, not once per image.
            for product_id in product_ids:
                target = val_list if product_id in val_ids else train_list
                for img_idx in range(num_imgs_by_product[product_id]):
                    target.append([product_id, category_idx, img_idx])
                pbar.update()

    columns = ["product_id", "category_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df

Create a 98/2 split.

In [11]:
# Seed NumPy's global RNG first so the random split is identical on every run.
np.random.seed(0)
train_images_df, val_images_df = make_val_set(
    train_offsets_df,
    split_percentage=0.02,  # hold out 2% of each category's products
    drop_percentage=0.,     # keep every product
)

7069896it [00:05, 1188426.77it/s]
100%|███████████████████████████████████████████████████████| 7069896/7069896 [02:02<00:00, 57765.76it/s]


In [12]:
# First five rows of the per-image training table.
train_images_df.head(n=5)

Unnamed: 0,product_id,category_idx,img_idx
0,0,5055,0
1,1,5055,0
2,5,5055,0
3,11,5055,0
4,16,5055,0


In [13]:
# Last five rows of the per-image training table.
train_images_df.tail(n=5)

Unnamed: 0,product_id,category_idx,img_idx
12129136,19028368,5101,2
12129137,20643558,5101,0
12129138,20643558,5101,1
12129139,20643558,5101,2
12129140,20643558,5101,3


In [14]:
# First five rows of the per-image validation table.
val_images_df.head(n=5)

Unnamed: 0,product_id,category_idx,img_idx
0,255,5055,0
1,322,5055,0
2,372,5055,0
3,455,5055,0
4,478,5055,0


In [15]:
# Row counts of the two per-image tables (one row per image) and their sum.
print("Number of training images:", train_images_df.shape[0])
print("Number of validation images:", val_images_df.shape[0])
print("Total images:", train_images_df.shape[0] + val_images_df.shape[0])

Number of training images: 12129141
Number of validation images: 242152
Total images: 12371293


Are all categories represented in the train/val split? (Note: if the drop percentage is high, then very small categories won't have enough products left to make it into the validation set.)

In [16]:
# (distinct categories in the training table, distinct categories in the validation table)
tuple(
    len(images_df["category_idx"].unique())
    for images_df in (train_images_df, val_images_df)
)

(5270, 4289)

Quickly verify that the split really is approximately 98-2:

In [17]:
# Spot-check one category: validation images divided by training images.
category_idx = 5055
num_train = (train_images_df["category_idx"] == category_idx).sum()
num_val = (val_images_df["category_idx"] == category_idx).sum()
num_val / num_train

0.020333047912380026

Close enough. ;-) Remember that we split on products but not all products have the same number of images, which is where the slight discrepancy comes from. (Also, there tend to be fewer validation images if `drop_percentage` > 0.)

Save the lookup tables as CSV so that we don't need to repeat the above procedure again.

In [18]:
# Persist both lookup tables; the row index is written out as the first column.
train_images_df.to_csv("train_images_all.csv", index=True)
val_images_df.to_csv("val_images_all.csv", index=True)