# Trains a CrossEncoder for the score task

In [1]:
# Mount Google Drive so later cells can read the data and write the trained model.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
% cd drive/MyDrive/sna/

/content/drive/MyDrive/sna


In [3]:
! pip install -U -q sentence-transformers
! pip install -q transformers

[K     |████████████████████████████████| 79 kB 3.5 MB/s 
[K     |████████████████████████████████| 4.0 MB 11.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 36.2 MB/s 
[K     |████████████████████████████████| 77 kB 5.5 MB/s 
[K     |████████████████████████████████| 895 kB 53.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 35.7 MB/s 
[K     |████████████████████████████████| 596 kB 49.7 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


## Check out GPU Type

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Apr 25 22:55:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Check out RAM Size

In [5]:
# Report how much memory the runtime has and whether it counts as "high-RAM".
from psutil import virtual_memory

# Total memory divided by 1e9 so it can be reported in gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything below 20 GB is reported as a standard (non high-RAM) runtime.
ram_verdict = 'Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!'
print(ram_verdict)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Import all packages

In [16]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Just some code to print debug information to stdout

In [8]:
# Route log records of level INFO and above through sentence-transformers'
# LoggingHandler, prefixing every message with a timestamp.
log_settings = {
    'format': '%(asctime)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
    'level': logging.INFO,
    'handlers': [LoggingHandler()],
}
logging.basicConfig(**log_settings)

# Logger used by the cells below for their own progress messages.
logger = logging.getLogger(__name__)

## Read in the three datasets: the product catalogue, the training set, and the test set

In [9]:
# Load the three CSV files; every column is read with pandas' "string" dtype.
csv_options = {"dtype": "string"}

product = pd.read_csv('data/product_catalogue-v0.2.csv', **csv_options)
train = pd.read_csv("data/train-v0.2.csv", **csv_options)
test = pd.read_csv("data/test_public-v0.2.csv", **csv_options)

## Join product dataset with training dataset on product id

In [11]:
# Attach the catalogue attributes to every training row via product_id.
# - how="inner" keeps only training rows whose product_id exists in the catalogue.
# - validate="many_to_many" is the permissive setting: pandas performs no
#   key-uniqueness check with it.
# Missing values are then replaced by empty strings so the text columns can be
# concatenated safely later on.
alled = (
    train
    .merge(product, on="product_id", how="inner", validate="many_to_many")
    .fillna("")
)

## Concatenate all product attributes into one column 

In [14]:
# Build one free-text field per row: the product attributes below, in this order,
# separated by ". " and closed with a final ".".
attribute_columns = [
    'product_title',
    'product_brand',
    'product_color_name',
    'product_bullet_point',
    'product_description',
]
product_text = alled[attribute_columns[0]]
for column in attribute_columns[1:]:
  product_text = product_text + ". " + alled[column]
alled['pro_all'] = product_text + "."

# Display the distinct relevance labels present in the merged data.
alled['esci_label'].unique()

<StringArray>
['irrelevant', 'exact', 'substitute', 'complement']
Length: 4, dtype: string

## Define our Cross-Encoder: use distilroberta-base as the base model and set num_labels=1, so that it predicts a continuous score between 0 and 1

In [15]:
# Training hyper-parameters used by the DataLoader and by model.fit().
train_batch_size = 16
num_epochs = 4

# Every run writes to its own directory, named after the time this cell was executed.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/training_stsbenchmark-' + run_timestamp

# Cross-encoder on top of distilroberta-base with a single output label,
# i.e. one score per (query, product text) pair.
model = CrossEncoder('distilroberta-base', num_labels=1)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

2022-04-25 23:09:21 - Use pytorch device: cuda


## Convert our dataset into pairs with true scores

In [22]:
# Reproducibility: one seed drives both splits and all three sub-samples, so a
# fresh "Restart & Run All" yields the same rows every time.
SEED = 42

# Number of rows kept from each split after sub-sampling.
train_count = 1000
train_flag = True
dev_count = 200
dev_flag = True
test_count = 200
test_flag = True
# NOTE(review): train_flag / dev_flag / test_flag are assigned but never read in this cell.

frac_train = 0.9  # share of all rows that goes to train+dev; the rest becomes the test split
frac_test = 0.8   # despite the name: share of train+dev used for training; the rest is dev

# NOTE(review): `train` and `test` are rebound below, replacing the frames read from CSV,
# so the merge cell cannot be re-run after this one without reloading the data first.
train_all, test = train_test_split(alled, train_size=frac_train, shuffle=True, random_state=SEED)
train, dev = train_test_split(train_all, train_size=frac_test, shuffle=True, random_state=SEED)
print('Before training shape:',train.shape)
print('Before dev shape:',dev.shape)
print('Before test shape:',test.shape)


def _subsample(frame, n_rows):
  """Return up to n_rows randomly chosen rows of frame, re-indexed from 0.

  The request is capped at len(frame) so that a count larger than the split does
  not raise, and the draw is seeded with SEED so it is repeatable.
  """
  return frame.sample(n=min(n_rows, len(frame)), random_state=SEED).reset_index(drop=True)


train = _subsample(train, train_count)
dev = _subsample(dev, dev_count)
test = _subsample(test, test_count)

print('After training shape:',train.shape)
print('After dev shape:',dev.shape)
print('After test shape:',test.shape)

# Numeric target score assigned to each relevance label.
label2int = {"exact": 1.0, "substitute": 0.1, "complement": 0.01, "irrelevant": 0.0}


def _to_input_examples(frame):
  """Turn every row of frame into an InputExample of [query, product text] with its score.

  Reads the 'query', 'pro_all' and 'esci_label' columns.
  Raises KeyError if a row's esci_label is not a key of label2int.
  """
  return [
      InputExample(texts=[query, product_text], label=label2int[esci_label])
      for query, product_text, esci_label
      in zip(frame['query'], frame['pro_all'], frame['esci_label'])
  ]


train_samples = _to_input_examples(train)
dev_samples = _to_input_examples(dev)
test_samples = _to_input_examples(test)

Before training shape: (571223, 12)
Before dev shape: (142806, 12)
Before test shape: (79337, 12)
After training shape: (1000, 12)
After dev shape: (200, 12)
After test shape: (200, 12)


## We wrap train_samples (which is a List[InputExample]) into a pytorch DataLoader

In [23]:
# Serve the training examples in shuffled mini-batches of train_batch_size.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

## We add an evaluator, which evaluates the performance during training

In [24]:
# Correlation evaluator built from the dev examples; it is handed to model.fit()
# so the model is scored on the dev set while training.
evaluator = CECorrelationEvaluator.from_input_examples(
    dev_samples,
    name='dev',
)

## Configure the training

In [25]:
# Use 10% of all training steps (batches per epoch x number of epochs) for
# warm-up, rounded up to a whole step.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logger.info("Warmup-steps: {}".format(warmup_steps))

2022-04-25 23:19:53 - Warmup-steps: 26


## Train the model

In [26]:
# Fine-tune the cross-encoder. The dev evaluator is passed in so the model is
# scored during training, and the model is saved under model_save_path.
fit_options = dict(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(**fit_options)



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/63 [00:00<?, ?it/s]

2022-04-25 23:20:35 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 0:
2022-04-25 23:20:37 - Correlation:	Pearson: 0.0400	Spearman: 0.1043
2022-04-25 23:20:37 - Save model to output/training_stsbenchmark-2022-04-25_23-09-09


Iteration:   0%|          | 0/63 [00:00<?, ?it/s]

2022-04-25 23:21:05 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 1:
2022-04-25 23:21:07 - Correlation:	Pearson: 0.0646	Spearman: 0.1197
2022-04-25 23:21:07 - Save model to output/training_stsbenchmark-2022-04-25_23-09-09


Iteration:   0%|          | 0/63 [00:00<?, ?it/s]

2022-04-25 23:21:35 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 2:
2022-04-25 23:21:37 - Correlation:	Pearson: 0.0921	Spearman: 0.1458
2022-04-25 23:21:37 - Save model to output/training_stsbenchmark-2022-04-25_23-09-09


Iteration:   0%|          | 0/63 [00:00<?, ?it/s]

2022-04-25 23:22:05 - CECorrelationEvaluator: Evaluating the model on dev dataset after epoch 3:
2022-04-25 23:22:06 - Correlation:	Pearson: 0.1072	Spearman: 0.1525
2022-04-25 23:22:06 - Save model to output/training_stsbenchmark-2022-04-25_23-09-09


## Load model and eval on test set

In [28]:
# Reload the model that training saved to model_save_path.
model = CrossEncoder(model_save_path)

# Replace the dev evaluator with one built from the held-out test examples.
evaluator = CECorrelationEvaluator.from_input_examples(
    test_samples,
    name='test',
)

# Score the reloaded model on the test set and display the returned value.
test_correlation = evaluator(model)
test_correlation

2022-04-25 23:23:13 - Use pytorch device: cuda
2022-04-25 23:23:13 - CECorrelationEvaluator: Evaluating the model on test dataset:
2022-04-25 23:23:15 - Correlation:	Pearson: 0.0806	Spearman: 0.0730


0.07301906488786791

## Print out RMSE on test set

In [31]:
# Predict a score for every (query, product text) pair held by the test evaluator
# and compare against the evaluator's reference scores.
predictions = model.predict(evaluator.sentence_pairs)
targets=evaluator.scores

# Root-mean-squared error between predictions and targets; both sides are
# converted to float arrays explicitly so the subtraction is element-wise.
squared_errors = (np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)) ** 2
rmse = np.sqrt(np.mean(squared_errors))

# Last expression of the cell, so the RMSE is actually displayed.
rmse

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

0.46854770723061184