# Project - Cdiscount Image Classification



## Data ingestion
## 1. Intro
The primary training set is a 57 GB BSON file containing ~15 million images (180x180 pixels, Base64-encoded) of ~7.06 million products. We imported the dataset into a MongoDB instance on a VPS so that we could query the records.
We selected 99 categories, which together comprise ~246K images of ~110K products.


#### Dataset preparation

First, we make sure the `gdown` library is installed and available in the environment, then download the `train_shuffled_100cat` data from Google Drive:

In [None]:
# Install the gdown package with pip through the notebook's shell escape ("!").
! pip install gdown

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import base64
import io
from pathlib import Path
import pandas as pd
import base64
from PIL import Image
import base64
import io
import numpy as np
import tensorflow as tf
FILE = "train_shuffled_100cat.csv"

# Root folder of the exported images. Kept as a plain string with a trailing
# slash because it is later concatenated with "train/", "val/" and "test/".
DATA_ROOT = 'data-100cat/'

# NOTE(review): header=3 makes the fourth line of the file the header row and
# discards the three lines above it -- confirm this matches the CSV layout.
df = pd.read_csv(FILE, header=3)
df.describe()

# Map every category_id to a dense class index (0..K-1), ordered by category_id.
categories = df['category_id'].unique()
categories.sort()
category_id_map = {
    category_id: class_idx for class_idx, category_id in enumerate(categories)
}
df["class"] = df["category_id"].map(category_id_map)

# Shuffle once with a fixed seed so the train/val/test split is reproducible.
rdf = df.sample(frac=1, random_state=123)
rdf.reset_index(drop=True, inplace=True)
count = rdf.shape[0]
num_train = int(count * .75)  # rows [0, num_train) -> train
num_val = num_train + int(count * .2)  # rows [num_train, num_val) -> val, rest -> test

# Remember which class folders already exist so mkdir runs once per folder
# instead of once per image.
created_dirs = set()

for idx, rec in rdf.iterrows():
    if idx < num_train:
        folder = "train"
    elif idx < num_val:
        folder = "val"
    else:
        folder = "test"

    classname = rec["class"]
    class_dir = Path(DATA_ROOT) / folder / ("%d" % classname)
    if class_dir not in created_dirs:
        class_dir.mkdir(parents=True, exist_ok=True)
        created_dirs.add(class_dir)

    # File name layout: <product id>-<row index>-<class index>.jpg
    image_path = class_dir / ("%d-%d-%d.jpg" % (rec["id"], idx, classname))

    # The context manager closes the file even if decoding or writing fails.
    with open(image_path, "wb") as fh:
        fh.write(base64.b64decode(rec["image"]))

    if idx % 10000 == 0:
        print(idx, "Done")
    

0 Done
10000 Done
20000 Done
30000 Done
40000 Done
50000 Done
60000 Done
70000 Done
80000 Done
90000 Done
100000 Done
110000 Done
120000 Done
130000 Done
140000 Done
150000 Done
160000 Done
170000 Done
180000 Done
190000 Done
200000 Done
210000 Done
220000 Done
230000 Done
240000 Done


## Environment setup

### The pre-trained models

#### The ResNet152V2 model pre-trained on ImageNet

In [None]:
def get_ResNet_model(ishape=(180, 180, 3), k=99, lr=1e-4, train_base=True):
    """Build and compile a ResNet152V2-based image classifier.

    The network is an ImageNet-pretrained ResNet152V2 backbone (without its
    classification head), followed by a Flatten layer and a Dense softmax
    layer with ``k`` outputs. It is compiled with Adam, categorical
    cross-entropy loss and the accuracy metric, and its summary is printed.

    Args:
        ishape: Shape of one input image, passed to both the Input layer and
            the backbone.
        k: Number of output classes (units of the final Dense layer).
        lr: Learning rate given to the Adam optimizer.
        train_base: Whether the backbone weights are trainable.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    input_layer = tf.keras.layers.Input(shape=ishape, dtype="float")

    # `classes` and `classifier_activation` only apply when include_top=True,
    # so they are not passed for this headless backbone.
    base_model = tf.keras.applications.ResNet152V2(
        input_shape=ishape,
        include_top=False,
        weights="imagenet",
    )
    base_model.trainable = train_base

    # The backbone is always called with training=False, independently of
    # whether its weights are trainable.
    features = base_model(input_layer, training=False)
    flat = tf.keras.layers.Flatten()(features)
    out = tf.keras.layers.Dense(k, activation='softmax')(flat)

    model = tf.keras.Model(inputs=input_layer, outputs=out)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model


ResNet_model = get_ResNet_model(train_base=True, lr=1e-6)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 180, 180, 3)]     0         
_________________________________________________________________
resnet152v2 (Functional)     (None, 6, 6, 2048)        58331648  
_________________________________________________________________
flatten (Flatten)            (None, 73728)             0         
_________________________________________________________________
dense (Dense)                (None, 99)                7299171   
Total params: 65,630,819
Trainable params: 65,487,075
Non-trainable params: 143,744
_________________________________________________________________
None


### Loading augmented data


In [None]:
# Seed handed to every directory iterator so that shuffling and the random
# augmentation draws are reproducible between runs.
seed = 909

# Augmentation applied to the training images only.
gen_params = {
    "rescale": 1.0 / 255,
    "featurewise_center": False,
    "samplewise_center": False,
    "featurewise_std_normalization": False,
    "samplewise_std_normalization": False,
    "zca_whitening": False,
    "rotation_range": 20,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "shear_range": 0.2,
    "zoom_range": 0.1,
    "horizontal_flip": True,
    "fill_mode": 'constant',
    "cval": 0,
}

# Options common to the train/val/test iterators. The class folders are named
# "0" .. "98", and listing them explicitly fixes the label order.
flow_params = {
    "class_mode": "categorical",
    "classes": [str(i) for i in range(99)],
    "target_size": (180, 180),
    "color_mode": "rgb",
    "batch_size": 32,
    "seed": seed,
}

train_image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**gen_params)
train_image_generator = train_image_datagen.flow_from_directory(
    DATA_ROOT+"train/", shuffle=True, **flow_params)

# Validation and test images are only rescaled, never augmented.
val_image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
val_image_generator = val_image_datagen.flow_from_directory(
    DATA_ROOT+"val/", shuffle=True, **flow_params)

test_image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
# Built from test_image_datagen (previously val_image_datagen was reused by
# mistake). shuffle=False keeps the batches in directory order so predictions
# can be matched against the iterator's file names and labels.
test_image_generator = test_image_datagen.flow_from_directory(
    DATA_ROOT+"test/", shuffle=False, **flow_params)

Found 184695 images belonging to 99 classes.
Found 49252 images belonging to 99 classes.
Found 12314 images belonging to 99 classes.


Below we take a look at the generated data by drawing one batch from the training and validation generators and checking its shape; only the training set is augmented.

In [None]:
# Draw one batch from each generator (training first, then validation).
x, y = next(train_image_generator)
xv, yv = next(val_image_generator)

# Print the image-array shape and label-array shape of each batch.
for batch_images, batch_labels in ((x, y), (xv, yv)):
    print(batch_images.shape, batch_labels.shape)


(32, 180, 180, 3) (32, 99)
(32, 180, 180, 3) (32, 99)


## 6. Define your callbacks (save your model, patience, etc.)

In [None]:
from time import time

# Name the checkpoint file after the current Unix timestamp (whole seconds).
timestamp = int(time())
model_name_cnn = "cdiscount_{}.h5".format(timestamp)
print("Saving model: {}".format(model_name_cnn))

# Stop training when val_loss has not improved for 20 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')

# Save weights only, and only when val_loss reaches a new minimum.
monitor = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_name_cnn,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)
# Learning rate schedule
def scheduler(epoch, lr):
    """Halve the learning rate every 10 epochs.

    Args:
        epoch: Zero-based index of the epoch that is about to start.
        lr: Learning rate currently in use.

    Returns:
        The learning rate to use for this epoch: ``lr / 2`` at epochs
        10, 20, 30, ..., otherwise ``lr`` unchanged.
    """
    # Skip epoch 0: 0 % 10 == 0 would otherwise halve the configured rate
    # before any training has been done with it.
    if epoch > 0 and epoch % 10 == 0:
        return lr / 2
    return lr

# Wrap the schedule function in a Keras callback; verbose=0 keeps it silent.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler, verbose=0)

Saving model: cdiscount_1618524235.h5


## 7. Train your model

In [None]:
# Start from the weights stored in the checkpoint file on the mounted Drive.
ResNet_model.load_weights('/content/drive/MyDrive/ENEL645/ENEL645-Project/ENEL645W21-Cdiscount-image-classification/models/cdiscount_Augmented_tuned_6Ephocs.h5')

# Train for 2 epochs on the augmented training generator, validating on the
# validation generator after each epoch.
ResNet_model.fit(
    train_image_generator,
    validation_data=val_image_generator,
    epochs=2,
    verbose=1,
    callbacks=[early_stop, monitor, lr_schedule],
)

## 8. Test your model

In [None]:
# Augmented and fine-tuned weights
ResNet_model.load_weights('/content/drive/MyDrive/ENEL645/ENEL645-Project/ENEL645W21-Cdiscount-image-classification/models/cdiscount_Augmented_tuned_6Ephocs.h5')

# Evaluate on the test generator and keep whatever evaluate() returns.
metrics = ResNet_model.evaluate(x=test_image_generator)




1


  
