## Import libraries

In [None]:
import os
import sys

#Import config file. Update config.py according to your environment
import config

import pandas as pd
import numpy as np

import tensorflow as tf

from Rakuten_preprocessing import Rakuten_img_path

from src.image.classifiers import ImgClassifier

from src.utils.batch import fit_save_all
from src.utils.load import load_batch_results

import datetime


## Import the pre-processed data

In [2]:
# Read the pre-processed train and test tables and tag every row with its
# origin so both can live in a single frame.
train_csv = os.path.join(config.path_to_data, 'df_train_index.csv')
test_csv = os.path.join(config.path_to_data, 'df_test_index.csv')
data_train = pd.read_csv(train_csv).assign(testset=False)
data_test = pd.read_csv(test_csv).assign(testset=True)
# NOTE(review): the original row labels of each file are kept, so labels may
# repeat between the train and test parts of `data`.
data = pd.concat([data_train, data_test], axis=0)


def _join_lowercase(row):
    """Lower-case every string cell of `row` and join them with one space.

    Non-string cells (e.g. missing values) are skipped.
    """
    return ' '.join(cell.lower() for cell in row if isinstance(cell, str))


# Text columns merged into the `tokens` column
# (alternative, untranslated columns: ['designation', 'description'])
colnames = ['designation_translated', 'description_translated']
data['tokens'] = data[colnames].apply(_join_lowercase, axis=1)

# Image location of each product stored in the `img_path` column
data['img_path'] = Rakuten_img_path(
    img_folder=config.path_to_images,
    imageid=data['imageid'],
    productid=data['productid'],
    suffix='_resized',
)


In [3]:
# Lookup table from encoded class index to its designation: one row per
# designation, indexed and sorted by `prdtypeindex`.
class_labels = (
    data.groupby('prdtypedesignation')['prdtypeindex']
    .first()
    .reset_index()
    .set_index('prdtypeindex')
    .sort_index()
)

## Creating train and test sets

In [4]:
# Boolean row selectors derived from the `testset` flag
is_test = data['testset']
is_train = ~is_test

# Image paths, text tokens and targets for each split
Img_train, Img_test = data.loc[is_train, 'img_path'], data.loc[is_test, 'img_path']
Txt_train, Txt_test = data.loc[is_train, 'tokens'], data.loc[is_test, 'tokens']
y_train, y_test = data.loc[is_train, 'prdtypeindex'], data.loc[is_test, 'prdtypeindex']

# The classifiers are fed dataframes holding both modalities in the
# `tokens` and `img_path` columns.
X_train = pd.DataFrame({'tokens': Txt_train, 'img_path': Img_train})
X_test = pd.DataFrame({'tokens': Txt_test, 'img_path': Img_test})

# Train and test stacked together, for cross-validated scores
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# Number of distinct target classes over the whole dataset
num_classes = len(np.unique(data['prdtypeindex']))

## Example usage: how to train ImgClassifier

In [None]:
# Callbacks are described as (name, keyword-arguments) pairs.
# Early stopping monitors the validation accuracy with a patience of 2 epochs.
early_stopping_kwargs = {
    'monitor': 'val_accuracy',
    'min_delta': 0,
    'mode': 'max',
    'patience': 2,
    'restore_best_weights': True,
    'verbose': 1,
}
callbacks = [('EarlyStopping', early_stopping_kwargs)]

# NOTE(review): the test split is also passed as validation data, so early
# stopping is driven by the same samples used for the final score.
clf_cnn = ImgClassifier(
    base_name='ResNet152',
    img_size=(224, 224, 3),
    num_class=num_classes,
    drop_rate=0.2,
    epochs=5,
    batch_size=32,
    validation_data=(X_test, y_test),
    learning_rate=5e-5,
    callbacks=callbacks,
)

# Train, score on the test split, then save the fitted model
clf_cnn.fit(X_train, y_train)
clf_cnn.classification_score(X_test, y_test)

clf_cnn.save('image/my_resnet152_model')

## CNN and ViT benchmarks

In [None]:
#Name of the summary csv file to save results to
result_file_name = 'results_benchmark_img.csv'

#type of modality
modality = 'image'

#Type of classifier
class_type = 'ImgClassifier'

#training parameters (or list of parameters for gridsearchCV)
num_class = num_classes
img_size = (224, 224, 3)
n_epochs = 8
batch_size = 32
drop_rate = 0.2
lr0 = 5e-5
lr_min = 1e-6
lr_decay_rate = 0.8


def build_callbacks(log_dir):
    """Return a fresh list of (name, kwargs) callback specs for one run.

    A new list and new dicts are created on every call so that each entry of
    ``params_list`` owns its callbacks. Sharing a single list and patching
    its ``log_dir`` inside the loop would leave every entry pointing at the
    last network's log directory by the time the batch is executed.

    Parameters
    ----------
    log_dir : str
        Directory passed as ``log_dir`` to the TensorBoard callback.

    Returns
    -------
    list of tuple
        EarlyStopping spec first, TensorBoard spec last.
    """
    return [
        #earlystopping callback
        ('EarlyStopping', {'monitor': 'val_accuracy', 'min_delta': 0, 'mode': 'max',
                           'patience': 2, 'restore_best_weights': True, 'verbose': 1}),
        #tensorboard callback, kept as the last one
        ('TensorBoard', {'log_dir': log_dir, 'histogram_freq': 1, 'update_freq': 'epoch'}),
    ]


#grid search number of folds
nfolds_grid = 0

#cross-validation of f1-score
nfolds_cv = 0

#network to test
base_name_list = ['vit_b16', 'ResNet101', 'ResNet50', 'EfficientNetB1', 'VGG16', 'ResNet152']

#Initializing the list of parameters to batch over
params_list = []

for base_name in base_name_list:
    #One tensorboard log directory per network, time-stamped at setup
    log_dir = os.path.join(config.path_to_tflogs, base_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    #Independent callbacks for this network (no state shared between entries)
    callbacks = build_callbacks(log_dir)
    #adding the set of parameters to the list
    params_list.append({'modality': modality,
                        'class': class_type,
                        'base_name': base_name,
                        'param_grid': {'img_size': img_size, 'num_class': num_class, 'drop_rate': drop_rate,
                                       'epochs': n_epochs, 'batch_size': batch_size,
                                       'learning_rate': lr0, 'lr_decay_rate': lr_decay_rate, 'lr_min': lr_min,
                                       'validation_data': (X_test, y_test), 'callbacks': [callbacks], 'parallel_gpu': False},
                        'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                        })

#Running the batch over params_list
results = fit_save_all(params_list, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, result_file_name = result_file_name)

## Load and check the saved result file

In [6]:
# Reload the saved benchmark summary and show it
# NOTE(review): the name is given without the '.csv' extension here - confirm
# that load_batch_results expects it in this form.
results_name = 'results_benchmark_img'
df_results = load_batch_results(results_name)
display(df_results)

Unnamed: 0,modality,class,vectorization,classifier,tested_params,best_params,score_test,score_cv_test,score_cv_train,fit_cv_time,model_path
0,image,ImgClassifier,,VGG16,"{'img_size': [(224, 224, 3)], 'num_class': [27...",,0.443956,,,,image/VGG16
