### Connect Google Drive to Colab and install requirements

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/adot_challenge
!ls
!pip install -r requirements.txt
!pip install torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/adot_challenge
data		   preprocess.py     saved_models	TFcamemBert.py	util.py
data_preprocessed  requirements.txt  TFcamemBert.ipynb	TFmodeling.py
Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [None]:
from preprocess import preprocess_data
from TFmodeling import build_camembert_model
from util import print_evaluation_scores, get_train_test_val
from preprocess import extract_url

import torch
import pandas as pd
import os
import numpy as np
import dill as pickle
from transformers import CamembertTokenizer
import tensorflow as tf

In [None]:
# Sequence length: the tokenizer pads/truncates every input to this many tokens,
# and the same value is passed to the model builder as `seq_length`.
MAX_LEN = 30

# Batch sizes. TRAIN_BATCH_SIZE is passed to `model.fit`; VALID_BATCH_SIZE is
# not referenced by the cells visible in this notebook.
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4

# Optimisation settings: number of passes over the training data for
# `model.fit`, and the learning rate handed to the model builder.
EPOCHS = 3
LEARNING_RATE = 1e-5

In [None]:
# Project-relative locations: raw data, preprocessed cache, trained weights.
data_dir = "data/"
save_data_dir = "data_preprocessed/"
save_model_dir = "saved_models/tf_camemBert"

# Preprocessing is skipped when the cached pickle is already on disk.
# NOTE(review): preprocess_data is expected to write this file into
# save_data_dir — confirm in preprocess.py.
preprocessed_file = os.path.join(save_data_dir, "preprocessed.pkl")
if os.path.exists(preprocessed_file) is False:
  preprocess_data(data_dir=data_dir, save_data_dir=save_data_dir)

# Load the preprocessed dataset. The pickle is a local artifact of this
# project; do not point this at files from untrusted sources.
with open(preprocessed_file, 'rb') as pkl_file:
  data = pickle.load(pkl_file)

# Train / validation / test split (features first, then labels).
(X_train, X_val, X_test,
 y_train, y_val, y_test) = get_train_test_val(data)

X_train_shape: (28266,)
y_train shape: (28266, 238)
X_test shape: (4188,)
y_test shape: (4188, 238)
X_val shape: (9423,)
y_val shape: (9423, 238)


In [None]:
# Pretrained CamemBERT tokenizer, also reused for single-URL inference later on.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Tokenize the three splits with identical settings: every text is padded or
# truncated to exactly MAX_LEN tokens.
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenizer(list(texts), max_length=MAX_LEN, padding='max_length', truncation=True)
    for texts in (X_train, X_val, X_test)
)

In [None]:
# Length of the first label vector in y_train, i.e. the number of label columns
# (this value is reused as `nb_class` when the model is built).
len(y_train[0])

238

In [None]:
# One output unit per label column.
nb_class = len(y_train[0])
model = build_camembert_model(nb_class=nb_class, seq_length=MAX_LEN, learning_rate=LEARNING_RATE)

print('OK Setup model')

# The model has two named inputs; feed each one the matching tokenizer field
# converted to a NumPy array.
train_inputs = {
    input_name: np.array(X_train_tokenized[input_name])
    for input_name in ("input_ids", "attention_mask")
}
history = model.fit(
    train_inputs,
    y=np.array(y_train),
    batch_size=TRAIN_BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
)


Some layers from the model checkpoint at camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFCamembertModel were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta/pooler/dense/kernel:0', 'roberta/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "CamemBert"
__________________________________________________________________________________________________________________________________
 Layer (type)                             Output Shape                 Param #         Connected to                               
 input_ids (InputLayer)                   [(None, 30)]                 0               []                                         
                                                                                                                                  
 attention_mask (InputLayer)              [(None, 30)]                 0               []                                         
                                                                                                                                  
 CamemBertMultilabelClassification (Camem  (None, 238)                 111395566       ['input_ids[0][0]',                        
 BertMultilabelClassification)                                  

In [None]:
# Save trained model
# The target path is derived from save_model_dir instead of repeating the
# literal, and the directory is created first so that a missing folder cannot
# break the save after a long training run.
os.makedirs(save_model_dir, exist_ok=True)
model.save_weights(os.path.join(save_model_dir, "weights.h5"))

### Evaluate the model on the validation and test sets

In [None]:
def model_predict(ids, mask):
  """Call the notebook-level `model` on one batch of tokenized inputs.

  Args:
    ids: token ids, passed to the model under the "input_ids" key.
    mask: attention mask, passed to the model under the "attention_mask" key.

  Returns:
    Whatever the model call returns, unchanged (no activation or
    thresholding is applied here).
  """
  return model(dict(input_ids=ids, attention_mask=mask))

In [None]:
# Cut the tokenized validation inputs into 100 chunks so the model is called on
# small batches (presumably to keep memory use bounded).
val_ids_split = np.array_split(X_val_tokenized['input_ids'], 100)
val_mask_split = np.array_split(X_val_tokenized['attention_mask'], 100)

# Gather the per-sample model outputs of every chunk into one flat list.
val_pred = [
    sample_output
    for ids, mask in zip(val_ids_split, val_mask_split)
    for sample_output in model_predict(ids, mask)
]

# Sigmoid maps the outputs to per-label scores; a label counts as predicted
# when its score is at least 0.5.
val_pred = np.array(tf.math.sigmoid(val_pred)) >= 0.5

print("Evaluation on validation set:")
print_evaluation_scores(y_val, val_pred)

Evaluation on validation set:
Accuracy: 0.1392337896635891
Hamming loss: 0.01048792646635222
F1 score macro: 0.24094787687913213
F1 score micro: 0.6101857836556788
F1 score weighted: 0.49887306689738087


In [None]:
# Cut the tokenized test inputs into 100 chunks so the model is called on small
# batches (presumably to keep memory use bounded).
test_ids_split = np.array_split(X_test_tokenized['input_ids'], 100)
test_mask_split = np.array_split(X_test_tokenized['attention_mask'], 100)

# Gather the per-sample model outputs of every chunk into one flat list.
test_pred = [
    sample_output
    for ids, mask in zip(test_ids_split, test_mask_split)
    for sample_output in model_predict(ids, mask)
]

# Sigmoid maps the outputs to per-label scores; a label counts as predicted
# when its score is at least 0.5.
test_pred = np.array(tf.math.sigmoid(test_pred)) >= 0.5

print("Evaluation on test set:")
print_evaluation_scores(y_test, test_pred)

Evaluation on test set:
Accuracy: 0.14207258834765998
Hamming loss: 0.010243352355268754
F1 score macro: 0.24402418088576847
F1 score micro: 0.6175170450288454
F1 score weighted: 0.5050512673823906


### Print some examples from the test set

In [None]:
# For test samples 20..39, show the input text followed by the indices of the
# predicted labels and the indices of the true labels.
for i in range(20, 40):
  print(X_test[i])
  y_pred = np.flatnonzero(np.asarray(test_pred[i]) == 1).tolist()
  y_true = np.flatnonzero(np.asarray(y_test[i]) == 1).tolist()
  print(y_pred)
  print(y_true)
  print('\n')

deskgram co patricialincow
[32, 58, 227]
[32, 51, 57, 58, 227]


www lalanguefrancaise com dictionnaire definition mijoter
[26, 68, 117, 128, 193]
[26, 68, 117, 128, 193]


annuaire 118712 fr bas rhin 67 erstein 67150 docteurs laffont grunenwald sens scm 0388986811_1e0080f00001r10400t80841g
[19, 44, 53, 58]
[19, 44, 53, 58, 167]


www automobile fr voiture mercedes benz b 200 vhc car pgn 2 pgs 10 srt price sro asc ms1 17200_15_ frn 2011 ful petrol mlx 200000 ger automatic_gear itc beige dmg false
[6, 96, 222, 228]
[28, 58]


www justwatch com fr serie ncis enquetes speciales saison 14
[12, 58, 175]
[12, 70, 122, 142, 234]


www marinetraffic com ais home centerx 5 9 centery 45 5 zoom 8
[58]
[58]


fr shopping rakuten com boutique dcz481 nav livres_litterature
[5, 207, 218]
[5, 65, 202, 207, 218]


www conforama fr special canape salon sejour canape canape droit 020101 nw 124 convertible relax electrique nw 4166 revetement cuir croute cuir nw 4166 revetement 100 cuir
[48, 75, 180]
[48, 

### Test model inference on a single URL

In [None]:
# Example URL to classify; extract_url turns it into the plain-text form the
# tokenizer is fed (see the "Extracted url" line printed below).
raw_url = 'https://www.fnac.com/Apple-iPhone-12-mini-5-4-64-Go-Double-SIM-5G-Blanc/a13745982/w-4'
url = extract_url(raw_url)
print(f"Extracted url: {url}")

# Tokenize with the same settings as the training data, then add a leading
# batch axis because the model is called on batches.
url_tokenized = tokenizer(url, max_length=MAX_LEN, padding='max_length', truncation=True)
url_ids = tf.expand_dims(url_tokenized["input_ids"], axis=0)
url_mask = tf.expand_dims(url_tokenized["attention_mask"], axis=0)

# model_predict returns the raw model outputs. Apply the same sigmoid + 0.5
# threshold as the validation/test evaluation; thresholding the raw outputs
# directly would use a stricter cut-off than the one the metrics were computed with.
url_outputs = model_predict(url_ids, url_mask)
url_outputs = np.array(tf.math.sigmoid(url_outputs)) >= 0.5

print(f"Index of predicted classes: {[idx for idx, val in enumerate(url_outputs[0]) if val == 1]}")


Extracted url: www fnac com apple iphone 12 mini 5 4 64 go double sim 5g blanc a13745982 w 4
Index of predicted classes: [61, 131, 194, 204]


### Create a new model and load the saved weights for future inference

In [None]:
# Build a second model with the same arguments as the trained one so that saved
# weights can be loaded into it. Relies on nb_class, MAX_LEN and LEARNING_RATE
# defined in earlier cells.
new_model = build_camembert_model(nb_class=nb_class, seq_length=MAX_LEN, learning_rate=LEARNING_RATE)

Some layers from the model checkpoint at camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFCamembertModel were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta/pooler/dense/kernel:0', 'roberta/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "CamemBert"
__________________________________________________________________________________________________________________________________
 Layer (type)                             Output Shape                 Param #         Connected to                               
 input_ids (InputLayer)                   [(None, 30)]                 0               []                                         
                                                                                                                                  
 attention_mask (InputLayer)              [(None, 30)]                 0               []                                         
                                                                                                                                  
 CamemBertMultilabelClassification (Camem  (None, 238)                 111395566       ['input_ids[0][0]',                        
 BertMultilabelClassification)                                  

In [None]:
# Restore the trained weights into the freshly built model. The path is derived
# from save_model_dir so it stays in sync with the configured model directory
# instead of repeating the literal.
new_model.load_weights(os.path.join(save_model_dir, "weights.h5"))