In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct

%matplotlib inline
import matplotlib.pyplot as plt

import tensorflow as tf

from collections import defaultdict
from tqdm import *

# Root directory holding the competition files. Set the CDISCOUNT_DATA_DIR
# environment variable to point somewhere else; the original hard-coded
# location remains the default.
data_dir = os.environ.get("CDISCOUNT_DATA_DIR", '/mnt/data/cdiscount/')

from subprocess import check_output
# List the contents of the data directory.
print(check_output(["ls", data_dir]).decode("utf8"))

# Any results you write to the current directory are saved as output.

category_names.csv
sample_submission.csv
test.bson
train_block_0.bson
train_block_10.bson
train_block_11.bson
train_block_12.bson
train_block_13.bson
train_block_14.bson
train_block_15.bson
train_block_16.bson
train_block_17.bson
train_block_18.bson
train_block_19.bson
train_block_1.bson
train_block_20.bson
train_block_21.bson
train_block_22.bson
train_block_23.bson
train_block_24.bson
train_block_2.bson
train_block_3.bson
train_block_4.bson
train_block_5.bson
train_block_6.bson
train_block_7.bson
train_block_8.bson
train_block_9.bson
train.bson
train_example.bson



In [2]:
train_bson_path = os.path.join(data_dir, "train.bson")
num_train_products = 7069896

test_bson_path = os.path.join(data_dir, "test.bson")
# Passed to read_bson as the progress-bar total for test.bson. The earlier
# value 1768172 was ten short of the 1768182 records the scan actually
# visits, which made the progress bar overrun its total.
num_test_products = 1768182

categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path, index_col="category_id")

# Maps the category_id to an integer index. This is what we'll use to
# one-hot encode the labels.
categories_df["category_idx"] = pd.Series(range(len(categories_df)), index=categories_df.index)

# Persist the table (now including category_idx) and show its first rows.
categories_df.to_csv("categories.csv")
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


In [3]:
def make_category_tables():
    """Build the two lookup tables between category ids and category indices.

    Reads the global ``categories_df``: the frame's index supplies the
    category id and the fourth column (by position) supplies the index.

    Returns:
        (cat2idx, idx2cat): dict from category id to index, and its inverse.
    """
    # Position 0 of each tuple is the frame index, position 4 the fourth column.
    pairs = [(record[0], record[4]) for record in categories_df.itertuples()]
    forward = dict(pairs)
    backward = {idx: cat for cat, idx in pairs}
    return forward, backward

# Build both lookup tables once; make_val_set later reads cat2idx as a global.
cat2idx, idx2cat = make_category_tables()
# Spot check: look up one category id and one index in the two tables.
cat2idx[1000012755], idx2cat[4]

(4, 1000012755)

In [10]:
def read_bson(bson_path, num_records, with_categories):
    """Scan a BSON dump once and record where each product's record lives.

    Every record is located through its 4-byte little-endian length prefix
    and then decoded to obtain the product id and the number of images.

    Args:
        bson_path: Path of the BSON file to scan.
        num_records: Expected number of records; used as the progress-bar total.
        with_categories: If true, also store each record's "category_id".

    Returns:
        DataFrame indexed by "product_id" (sorted ascending) with columns
        "num_imgs", "offset" and "length", plus "category_id" when requested.
        "offset" and "length" are the byte position and size of the record
        inside the file. An empty file yields an empty frame that still has
        these columns.
    """
    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns.append("category_id")

    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        while True:
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                break

            length = struct.unpack("<i", item_length_bytes)[0]

            # The length prefix is part of the record, so rewind and read the
            # whole thing in one piece for the decoder.
            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length

            item = bson.BSON.decode(item_data)
            product_id = item["_id"]
            num_imgs = len(item["imgs"])

            row = [num_imgs, offset, length]
            if with_categories:
                row.append(item["category_id"])
            rows[product_id] = row

            # After the full read the file position is already at the next
            # record, so no extra seek is needed here.
            offset += length
            pbar.update()

    # Passing the column names to the constructor (rather than assigning them
    # afterwards) keeps the result well-formed even when no record was read.
    df = pd.DataFrame(
        list(rows.values()),
        index=pd.Index(list(rows.keys()), name="product_id"),
        columns=columns,
    )
    return df.sort_index()

# Index every record of train.bson, including each product's category id.
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|██████████| 7069896/7069896 [02:16<00:00, 51833.55it/s]


CPU times: user 1min 37s, sys: 29.4 s, total: 2min 7s
Wall time: 2min 32s


In [11]:
# Peek at the first rows of the training offsets table.
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [13]:
# Write the training offsets table to CSV in the current directory.
train_offsets_df.to_csv("train_offsets.csv")

12371293

In [16]:
# Summary of the training index: one row per product, distinct category ids,
# and the image count summed over all products.
print('number of products = %d' % len(train_offsets_df))
print('number of catagories = %d' % len(train_offsets_df["category_id"].unique()))
print('number of images = %d' % train_offsets_df["num_imgs"].sum())

number of products = 7069896
number of catagories = 5270
number of images = 12371293


In [17]:
# Index test.bson the same way, without the category column.
%time test_offsets_df = read_bson(test_bson_path, num_records=num_test_products, with_categories=False)

1768182it [00:29, 59293.44it/s]                             


CPU times: user 21.5 s, sys: 5.22 s, total: 26.7 s
Wall time: 32.7 s


In [18]:
# Peek at the first rows of the test offsets table.
test_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,3,0,15826
14,1,15826,5589
21,1,21415,7544
24,1,28959,4855
27,1,33814,2921


In [19]:
# Write the test offsets table to CSV in the current directory.
test_offsets_df.to_csv("test_offsets.csv")

In [20]:
def make_val_set(df, split_percentage=0.2, drop_percentage=0., seed=None):
    """Split products into per-image training and validation tables, category by category.

    Within every category a random fraction of the products is optionally
    dropped, then a random fraction of the remaining products is assigned to
    validation. All images of a product land in the same split.

    Args:
        df: Offsets table indexed by product id, with "num_imgs" and
            "category_id" columns (as produced by read_bson with categories).
        split_percentage: Fraction of each category's kept products that goes
            to the validation set.
        drop_percentage: Fraction of each category's products to discard
            before splitting, to make the dataset smaller.
        seed: Optional integer seed for a reproducible split. When None the
            draws come from NumPy's global random state, as before.

    Returns:
        (train_df, val_df): two DataFrames with one row per image and columns
        "product_id", "category_idx", "img_idx". Category indices are looked
        up in the global ``cat2idx`` table.
    """
    # np.random and RandomState both expose choice(); only the source of
    # randomness differs.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Single pass over the frame: group product ids by category and remember
    # every product's image count, so the main loop needs no df.loc lookups.
    category_dict = defaultdict(list)
    num_imgs_by_product = {}
    for product_id, num_imgs, category_id in tqdm(
            zip(df.index, df["num_imgs"], df["category_id"]), total=len(df)):
        category_dict[category_id].append(product_id)
        num_imgs_by_product[product_id] = num_imgs

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]

            # Randomly remove products to make the dataset smaller.
            keep_size = int(len(product_ids) * (1. - drop_percentage))
            if keep_size < len(product_ids):
                product_ids = rng.choice(product_ids, keep_size, replace=False)

            # Randomly choose the products that become part of the validation
            # set. A set gives constant-time membership tests; scanning the
            # NumPy array for every image was quadratic in the category size.
            val_size = int(len(product_ids) * split_percentage)
            if val_size > 0:
                val_ids = set(rng.choice(product_ids, val_size, replace=False).tolist())
            else:
                val_ids = set()

            # Create a new row for each image.
            for product_id in product_ids:
                target = val_list if product_id in val_ids else train_list
                for img_idx in range(num_imgs_by_product[product_id]):
                    target.append([product_id, category_idx, img_idx])
                pbar.update()

    columns = ["product_id", "category_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df

# Hold out 20% of each category's products for validation; drop nothing.
train_images_df, val_images_df = make_val_set(train_offsets_df, split_percentage=0.2, drop_percentage=0.)

7069896it [00:08, 864563.56it/s]
100%|██████████| 7069896/7069896 [05:03<00:00, 23282.91it/s]


In [21]:
# Row counts of the two per-image tables and their combined size.
print("Number of training images:", len(train_images_df))
print("Number of validation images:", len(val_images_df))
print("Total images:", len(train_images_df) + len(val_images_df))

Number of training images: 9900946
Number of validation images: 2470347
Total images: 12371293


In [None]:
# Peek at the first rows of the per-image training table.
train_images_df.head()

In [None]:
# Peek at the first rows of the per-image validation table.
val_images_df.head()

In [22]:
# Number of distinct category indices present in each split.
len(train_images_df["category_idx"].unique()), len(val_images_df["category_idx"].unique())

(5270, 5270)

In [24]:
# Sanity-check the split for a single category. Both tables hold one row per
# image, so these are image counts (the labels previously said "products").
category_idx = 619
num_train = np.sum(train_images_df["category_idx"] == category_idx)
num_val = np.sum(val_images_df["category_idx"] == category_idx)
print('number of images in train set: %d' % num_train)
print('number of images in val set: %d' % num_val)
print('ratio = %f' % (num_val / num_train))

# Write both per-image tables to CSV in the current directory.
train_images_df.to_csv("train_images.csv")
val_images_df.to_csv("val_images.csv")

number of products in train set: 1766
number of products in val set: 443
ratio = 0.250849


In [25]:
def make_test_set(df):
    """Expand a per-product offsets table into one row per image.

    Args:
        df: Table whose index is the product id and whose first column is the
            product's number of images.

    Returns:
        DataFrame with columns "product_id" and "img_idx", containing the
        image indices 0..num_imgs-1 of every product, in the order of ``df``.
    """
    # entry[0] is the frame index (product id), entry[1] the first column.
    records = [
        [entry[0], image_number]
        for entry in tqdm(df.itertuples())
        for image_number in range(entry[1])
    ]
    return pd.DataFrame(records, columns=["product_id", "img_idx"])

In [26]:
# Expand the test offsets table into one row per test image.
test_images_df = make_test_set(test_offsets_df)

1768182it [00:03, 537480.72it/s]


In [29]:
# Peek at the first rows of the per-image test table.
test_images_df.head()

Unnamed: 0,product_id,img_idx
0,10,0
1,10,1
2,10,2
3,14,0
4,21,0


In [30]:
# One row per image, so the row count is the number of test images.
print("Number of test images:", len(test_images_df))

Number of test images: 3095080


In [31]:
# Write the per-image test table to CSV in the current directory.
test_images_df.to_csv("test_images.csv")