In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
from skimage.data import imread
from skimage import transform

%matplotlib inline
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import *


In [2]:
# Hard-coded product count; presumably matches train_example.bson -- TODO confirm.
num_train_products = 82

# Directory the input files are read from, and the training BSON file inside it.
data_dir = "./input/"
train_bson_path = os.path.join(data_dir, "train_example.bson")

# Part 1: Create lookup tables

The generator uses several lookup tables that describe the layout of the BSON file, which products and images are part of the training/validation sets, and so on.

You only need to generate these tables once, as they get saved to CSV files. If you already have these CSV files, skip to part 2.

## Lookup table for categories

In [3]:
# Load the category table; category_id becomes the row index.
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path, index_col="category_id")

# Assign every category_id a consecutive integer (0, 1, 2, ...) in file order.
# This index is the ground truth label used in training.
num_categories = len(categories_df)
categories_df["category_idx"] = np.arange(num_categories, dtype="int64")

### Check whether the categories contain any null values

In [4]:
# Show per-column non-null counts and dtypes to spot missing values.
categories_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5270 entries, 1000021794 to 1000003400
Data columns (total 4 columns):
category_level1    5270 non-null object
category_level2    5270 non-null object
category_level3    5270 non-null object
category_idx       5270 non-null int64
dtypes: int64(1), object(3)
memory usage: 205.9+ KB


In [5]:
# Preview the first rows of the category table.
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


In [6]:
# Preview the last rows; the final category_idx should equal len(categories_df) - 1.
categories_df.tail()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000003412,VIN - ALCOOL - LIQUIDES,JUS - SODA -SIROP-BOISSON LACTEE,SODA-THE GLACE,5265
1000003530,VIN - ALCOOL - LIQUIDES,VIN,ASSORTIMENT VIN,5266
1000003402,VIN - ALCOOL - LIQUIDES,VIN,VIN BLANC,5267
1000003404,VIN - ALCOOL - LIQUIDES,VIN,VIN ROSE,5268
1000003400,VIN - ALCOOL - LIQUIDES,VIN,VIN ROUGE,5269


## Create dictionaries for quick lookup of `category_id` to `category_idx` mapping.

### Take a look at the categories.

In [7]:
# Total number of categories, followed by the number of distinct names
# at each level of the hierarchy (level 1, level 2, level 3).
print(len(categories_df))
for level_col in ('category_level1', 'category_level2', 'category_level3'):
    distinct_names = categories_df[level_col].unique()
    print(len(distinct_names))

5270
49
483
5263


In [8]:
# List the distinct top-level (level 1) category names.
categories_df['category_level1'].unique()

array(['ABONNEMENT / SERVICES', 'AMENAGEMENT URBAIN - VOIRIE',
       'ANIMALERIE', 'APICULTURE', 'ART DE LA TABLE - ARTICLES CULINAIRES',
       'ARTICLES POUR FUMEUR', 'AUTO - MOTO', 'BAGAGERIE',
       'BATEAU MOTEUR - VOILIER', 'BIJOUX -  LUNETTES - MONTRES',
       'BRICOLAGE - OUTILLAGE - QUINCAILLERIE', 'CHAUSSURES - ACCESSOIRES',
       'COFFRET CADEAU BOX', 'CONDITIONNEMENT', 'DECO - LINGE - LUMINAIRE',
       'DROGUERIE', 'DVD - BLU-RAY', 'ELECTROMENAGER', 'ELECTRONIQUE',
       'EPICERIE', 'FUNERAIRE', 'HYGIENE - BEAUTE - PARFUM',
       'INFORMATIQUE', 'INSTRUMENTS DE MUSIQUE', 'JARDIN - PISCINE',
       'JEUX - JOUETS', 'JEUX VIDEO', 'LIBRAIRIE', 'LITERIE',
       'LOISIRS CREATIFS - BEAUX ARTS - PAPETERIE', 'MANUTENTION',
       'MATERIEL DE BUREAU', 'MATERIEL MEDICAL', 'MERCERIE', 'MEUBLE',
       'MUSIQUE', 'PARAPHARMACIE', 'PHOTO - OPTIQUE',
       'POINT DE VENTE - COMMERCE - ADMINISTRATION', 'PRODUITS FRAIS',
       'PRODUITS SURGELES', 'PUERICULTURE', 'SONO - DJ', '

### Map each category_id to (level1_idx, level2_idx, level3_idx, category_idx)
`level1_idx`, `level2_idx`, and `category_idx` will be used during training.

In [9]:
def _index_by_first_appearance(values):
    """Map each distinct value of a Series to a 0-based index.

    Indices follow the order in which the values first appear, which is the
    order returned by ``Series.unique()``.
    """
    return {value: idx for idx, value in enumerate(values.unique())}


# The index range is derived from the data instead of hard-coded sizes
# (49 / 483 / 5263): with a fixed range, zip() silently drops any extra names
# if the CSV changes, and those names would later map to NaN.
level1_map = _index_by_first_appearance(categories_df['category_level1'])
level2_map = _index_by_first_appearance(categories_df['category_level2'])
level3_map = _index_by_first_appearance(categories_df['category_level3'])
# level3 names are almost one-to-one with category_id, so level3_map adds
# little beyond category_idx.

In [10]:
# Replace the category-level names with their integer indices, in place.
# The dtype guard keeps this cell safe to re-run: once a column is already
# integer-typed, mapping it again through the name-keyed dict would turn every
# value into NaN and make astype(int) raise.
level_maps = {
    'category_level1': level1_map,
    'category_level2': level2_map,
    'category_level3': level3_map,
}
for level_col, name_to_idx in level_maps.items():
    if not pd.api.types.is_integer_dtype(categories_df[level_col]):
        categories_df[level_col] = categories_df[level_col].map(name_to_idx).astype(int)
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,0,0,0,0
1000012764,1,1,1,1
1000012776,1,1,2,2
1000012768,1,1,3,3
1000012755,1,2,4,4


In [11]:
# Save the category table (level columns now hold integer indices) to a CSV
# file in the current working directory.
categories_df.to_csv('categories_name_to_id.csv')

In [12]:
# Lookup dictionaries keyed by category_id, plus the reverse index lookup.
category_ids = categories_df.index
category_indices = categories_df['category_idx']

cat2idx = dict(zip(category_ids, category_indices))                   # cat_id  --> cat_idx
idx2cat = dict(zip(category_indices, category_ids))                   # cat_idx --> cat_id
cat2l1 = dict(zip(category_ids, categories_df['category_level1']))    # cat_id  --> level1_idx
cat2l2 = dict(zip(category_ids, categories_df['category_level2']))    # cat_id  --> level2_idx

In [13]:
# Spot-check that the two lookups are inverses of each other for one category.
cat2idx[1000012755], idx2cat[4]

(4, 1000012755)

In [14]:
# Spot-check the level-1 and level-2 indices for the same category id.
cat2l1[1000012755], cat2l2[1000012755]

(1, 2)