# Trains a CrossEncoder for the score task

In [1]:
# Attach Google Drive to the Colab VM at /content/drive; the project folder used by
# the following cells lives on that drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
% cd drive/MyDrive/sna/

/content/drive/MyDrive/sna


In [3]:
! pip install -U -q sentence-transformers
! pip install -q transformers

[K     |████████████████████████████████| 79 kB 3.5 MB/s 
[K     |████████████████████████████████| 4.0 MB 11.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 36.2 MB/s 
[K     |████████████████████████████████| 77 kB 5.5 MB/s 
[K     |████████████████████████████████| 895 kB 53.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 35.7 MB/s 
[K     |████████████████████████████████| 596 kB 49.7 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


## Check out GPU Type

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Apr 25 22:55:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Check out RAM Size

In [5]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
runtime_note = 'You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime'
print(runtime_note)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Import all packages

In [16]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Just some code to print debug information to stdout

In [8]:
# Root logger setup: timestamped INFO-level messages, emitted through the
# LoggingHandler imported from sentence_transformers.
log_settings = dict(
  format='%(asctime)s - %(message)s',
  datefmt='%Y-%m-%d %H:%M:%S',
  level=logging.INFO,
  handlers=[LoggingHandler()],
)
logging.basicConfig(**log_settings)

# Logger used by the cells below for progress messages.
logger = logging.getLogger(__name__)

## Read in three datasets: product catalogue, training, and test

In [9]:
# Load the product catalogue, the training set and the public test set.
# Every column is read with pandas' "string" dtype, so no value is parsed as a number.
product, train, test = (
  pd.read_csv(csv_path, dtype="string")
  for csv_path in (
    'data/product_catalogue-v0.2.csv',
    "data/train-v0.2.csv",
    "data/test_public-v0.2.csv",
  )
)

## Join product dataset with training dataset on product id

In [11]:
# Attach the catalogue attributes to every training row that has a matching product_id
# (inner join), then replace missing values with empty strings so the text columns can
# be concatenated without propagating NA.
# validate="many_to_many" places no uniqueness requirement on the keys of either side.
alled = (
  train
  .merge(product, on="product_id", how="inner", validate="many_to_many")
  .fillna("")
)

## Concatenate all product attributes into one column 

In [14]:
# Build one text field per product: title, brand, colour, bullet points and description,
# separated by ". " and terminated by a final ".".
product_text = alled['product_title']
for attribute in ('product_brand', 'product_color_name', "product_bullet_point", 'product_description'):
  product_text = product_text + ". " + alled[attribute]
alled['pro_all'] = product_text + "."

# Show the distinct relevance labels present in the merged data.
alled['esci_label'].unique()

<StringArray>
['irrelevant', 'exact', 'substitute', 'complement']
Length: 4, dtype: string

## Define our Cross-Encoder and use distilroberta-base as base model and set num_labels=1, which predicts a continuous score between 0 and 1

In [None]:
# Training hyper-parameters.
num_epochs = 4
train_batch_size = 16

# Timestamped output folder, so each execution of this cell gets its own checkpoint path.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/training_scores-' + run_stamp

# Cross-encoder on top of distilroberta-base with a single output (num_labels=1).
model = CrossEncoder('distilroberta-base', num_labels=1)

## Convert our dataset into pairs with true scores

In [54]:
# --- Split / sampling configuration ---------------------------------------------------
seed = 42            # shared by every shuffle and sample below so re-runs pick the same rows
train_count = 100000
train_flag = True    # True: cut the training split down to at most train_count rows
dev_count = 200
dev_flag = True      # True: cut the dev split down to at most dev_count rows
test_count = 1000
test_flag = True     # True: cut the test split down to at most test_count rows
frac_train = 0.9     # share of `alled` kept for train+dev; the remainder is the test split
frac_test = 0.8      # share of train+dev used for training; the remainder is dev
                     # (despite its name this is not a test fraction; name kept as-is)

# Regression target assigned to each ESCI label (floats, despite the "int" in the name).
label2int = {"exact": 1.0, "substitute": 0.1, "complement": 0.01, "irrelevant": 0.0}


def _subsample(frame, n, enabled, random_state):
  """Return `frame` with a fresh 0..k-1 index, randomly cut to at most `n` rows if `enabled`.

  `n` is capped at the number of available rows, so asking for more rows than the
  split holds keeps the whole split instead of raising.
  """
  if enabled:
    frame = frame.sample(n=min(n, len(frame)), random_state=random_state)
  return frame.reset_index(drop=True)


def _to_input_examples(frame):
  """Build one InputExample per row: texts=[query, pro_all], label=label2int[esci_label]."""
  return [
    InputExample(texts=[query, product_text], label=label2int[esci_label])
    for query, product_text, esci_label in zip(frame['query'], frame['pro_all'], frame['esci_label'])
  ]


# NOTE: `train` and `test` are rebound here and no longer refer to the frames loaded from
# CSV. Re-run the CSV-loading cell before re-running the merge cell, otherwise the merge
# would start from the already-merged, subsampled `train`.
train_all, test = train_test_split(alled, train_size=frac_train, shuffle=True, random_state=seed)
train, dev = train_test_split(train_all, train_size=frac_test, shuffle=True, random_state=seed)
print('Before training shape:',train.shape)
print('Before dev shape:',dev.shape)
print('Before test shape:',test.shape)

train = _subsample(train, train_count, train_flag, seed)
dev = _subsample(dev, dev_count, dev_flag, seed)
test = _subsample(test, test_count, test_flag, seed)

print('After training shape:',train.shape)
print('After dev shape:',dev.shape)
print('After test shape:',test.shape)

# (query, product text) pairs with their target score, one list per split.
train_samples = _to_input_examples(train)
dev_samples = _to_input_examples(dev)
test_samples = _to_input_examples(test)

Before training shape: (571223, 12)
Before dev shape: (142806, 12)
Before test shape: (79337, 12)
After training shape: (100000, 12)
After dev shape: (200, 12)
After test shape: (1000, 12)


## We wrap train_samples (which is a List[InputExample]) into a pytorch DataLoader

In [55]:
# Serve the training examples in shuffled batches of train_batch_size.
train_dataloader = DataLoader(
  dataset=train_samples,
  batch_size=train_batch_size,
  shuffle=True,
)

## We add an evaluator, which evaluates the performance during training

In [56]:
# Evaluator built from dev_samples and labelled 'dev'. It is passed to model.fit below,
# where the training log shows it reporting Pearson/Spearman correlation after each epoch.
evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='dev')

## Configure the training

In [57]:
# Warm up the learning rate over the first 10% of all optimisation steps
# (batches per epoch x number of epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logger.info("Warmup-steps: {}".format(warmup_steps))

2022-04-26 01:38:47 - Warmup-steps: 2500


## Train the model

In [58]:
# Fine-tune the cross-encoder. The dev evaluator is supplied for in-training evaluation
# and model_save_path is where the model gets written.
fit_options = dict(
  train_dataloader=train_dataloader,
  evaluator=evaluator,
  epochs=num_epochs,
  warmup_steps=warmup_steps,
  output_path=model_save_path,
)
model.fit(**fit_options)



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6250 [00:00<?, ?it/s]

2022-04-26 02:23:48 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 0:
2022-04-26 02:23:49 - Correlation:	Pearson: 0.3881	Spearman: 0.3773
2022-04-26 02:23:49 - Save model to output/training_scores-2022-04-25_23-36-55


Iteration:   0%|          | 0/6250 [00:00<?, ?it/s]

2022-04-26 03:08:49 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 1:
2022-04-26 03:08:51 - Correlation:	Pearson: 0.4081	Spearman: 0.3989
2022-04-26 03:08:51 - Save model to output/training_scores-2022-04-25_23-36-55


Iteration:   0%|          | 0/6250 [00:00<?, ?it/s]

2022-04-26 03:53:52 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 2:
2022-04-26 03:53:53 - Correlation:	Pearson: 0.3561	Spearman: 0.3534


Iteration:   0%|          | 0/6250 [00:00<?, ?it/s]

2022-04-26 04:38:53 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 3:
2022-04-26 04:38:55 - Correlation:	Pearson: 0.3561	Spearman: 0.3479


## Load model and eval on test set

In [59]:
# From here on `evaluator` is the test-set evaluator (it replaces the dev one); the RMSE
# cell below reads its sentence pairs and scores.
evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='test')

# Reload the model that training wrote to model_save_path.
model = CrossEncoder(model_save_path)

# The evaluator's return value is the cell output.
evaluator(model)

2022-04-26 04:39:48 - Use pytorch device: cuda
2022-04-26 04:39:48 - CECorrelationEvaluator: Evaluating the model on test dataset:
2022-04-26 04:39:57 - Correlation:	Pearson: 0.5170	Spearman: 0.5208


0.5208260036703202

## Print out RMSE on test set

In [60]:
# Root-mean-square error between the model's predicted scores and the target scores held
# by the current evaluator.
targets = evaluator.scores
predictions = model.predict(evaluator.sentence_pairs)

squared_errors = np.square(predictions - targets)
rmse = np.sqrt(squared_errors.mean())
print("RMSE:", rmse)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

RMSE: 0.3968347346146894
