In [9]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct

%matplotlib inline
import matplotlib.pyplot as pt

import keras
from keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf

from collections import defaultdict
from tqdm import *

Using TensorFlow backend.


In [10]:
# Folder that holds the BSON archives and category_names.csv.
data_dir = './data/'

# Record counts are only used as progress-bar totals when scanning the files.
train_bson_path = os.path.join(data_dir, "train.bson")
num_train_products = 7069896

test_bson_path = os.path.join(data_dir, "test.bson")
# A full scan of test.bson iterates 1768182 records; the earlier value
# (1768172) was ten short, which made the tqdm bar overrun its total and
# drop the percentage/ETA display.
num_test_products = 1768182


In [12]:
# Load the category table, keyed by the raw category_id values.
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path, index_col="category_id")

# Give every category a consecutive integer (0 .. N-1, in file order) and
# save the augmented table so the mapping can be reloaded later.
consecutive_idx = pd.Series(range(categories_df.shape[0]), index=categories_df.index)
categories_df['category_idx'] = consecutive_idx
categories_df.to_csv("categories.csv")
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


In [15]:
def make_category_tables():
    """Build lookup tables between raw category ids and consecutive indices.

    Reads the module-level ``categories_df``, whose index holds the
    category ids and whose ``category_idx`` column holds the matching
    consecutive index.

    Returns
    -------
    tuple of (dict, dict)
        ``cat2idx`` maps category id -> category index and ``idx2cat`` maps
        category index -> category id.
    """
    cat2idx = {}
    idx2cat = {}
    # Address the column by name instead of by tuple position, so adding or
    # reordering columns in categories_df cannot silently select the wrong one.
    for category_id, category_idx in zip(categories_df.index, categories_df["category_idx"]):
        cat2idx[category_id] = category_idx
        idx2cat[category_idx] = category_id
    return cat2idx, idx2cat

In [20]:
# Materialise both lookup directions once; cat2idx is used later when building the split.
cat2idx, idx2cat = make_category_tables()

In [22]:
def read_bson(bson_path, num_records, with_categories):
    """Scan a BSON file once and build a per-product offset table.

    Each record is located through its 4-byte little-endian length prefix,
    then decoded to extract the product id, the number of images and,
    optionally, the category id.

    Parameters
    ----------
    bson_path : str
        Path of the BSON file to scan.
    num_records : int
        Expected number of records; only used as the progress-bar total.
    with_categories : bool
        When true, also collect each record's ``"category_id"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``product_id`` (sorted), with columns ``num_imgs``,
        ``offset`` and ``length``, plus ``category_id`` when
        ``with_categories`` is true. A file with no records yields an empty
        frame with the same columns.

    Raises
    ------
    AssertionError
        If a record is shorter than its length prefix announces.
    """
    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns += ["category_id"]

    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        while True:
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                break

            length = struct.unpack("<i", item_length_bytes)[0]

            # The length counts the prefix itself, so rewind to the start of
            # the record and read it whole.
            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length, (
                "truncated BSON record at offset %d: expected %d bytes, got %d"
                % (offset, length, len(item_data))
            )

            item = bson.BSON.decode(item_data)
            product_id = item['_id']
            num_imgs = len(item['imgs'])

            row = [num_imgs, offset, length]
            if with_categories:
                row += [item["category_id"]]
            rows[product_id] = row

            offset += length
            f.seek(offset)
            pbar.update()

    # Pass the column names to the constructor: assigning them afterwards
    # fails on an empty file, because the frame then has zero columns.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.index.name = "product_id"
    df.sort_index(inplace=True)
    return df

In [23]:
# Index every record of train.bson, including category ids. This is a full pass over the file (several minutes in the recorded run).
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|██████████| 7069896/7069896 [08:27<00:00, 13940.57it/s]


CPU times: user 2min 53s, sys: 36.1 s, total: 3min 29s
Wall time: 8min 42s


In [27]:
# Preview the first rows of the train offset table.
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [28]:
# Save the train offset table so the slow scan does not have to be repeated.
train_offsets_df.to_csv("train_offsets.csv")

In [29]:
# Index every record of test.bson; no category column is collected for the test set.
%time test_offsets_df = read_bson(test_bson_path, num_records=num_test_products, with_categories=False)

1768182it [02:03, 14298.30it/s]                             


CPU times: user 39.8 s, sys: 8.09 s, total: 47.9 s
Wall time: 2min 7s


In [30]:
# Preview the first rows of the test offset table.
test_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,3,0,15826
14,1,15826,5589
21,1,21415,7544
24,1,28959,4855
27,1,33814,2921


In [31]:
# Save the test offset table; the file name mirrors train_offsets.csv
# (the previous 'etest_offsets.csv' had a stray leading character).
test_offsets_df.to_csv('test_offsets.csv')

### Create a random train/validation split

In [None]:
def make_val_set(df, split_percentage=0.2, drop_percentage=0.):
    #Find the product_ids for each category.
    category_dict = defaultdict(list)
    for ir in tqdm(df.itertuples()):
        category_dict[ir[4]].append(ir[0])
    
    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]
            
            #Randomly remove products to make the dataset smaller.
            keep_size = int(len(product_ids) * (1. - drop_percentage))
            if keep_size < len(product_ids):
                product_ids = np.random.choice(product_ids, keep_size, replace=False)
                
            #Randomly choose the prodcuts that become part of the validation set.
            val_size = int(len(product_ids) * split_percentage)
            if val_size > 0:
                val_ids = np.random.choice(product_ids, val_size, replace=False)
            else:
                val_ids=[]
                
            #Create a new row for each images.
            for product_id in product_ids:
                row = [product_id, category_idx]
                for img_idx in range(df.loc[product_id, "num_imgs"]):
                    if product_id in val_ids:
                        val_list.append(row + [img_idx])
                    else:
                        train_list.append(row + [img_idx])
                pbar.udate()
                            