<a href="https://colab.research.google.com/github/yongsun-yoon/multilingual-sentence-embedder/blob/main/run_downstream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

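# Run downstream

Evaluate a pretrained sentence embedder on downstream tasks. This notebook currently covers semantic textual similarity on the KLUE STS validation set, scored with Pearson and Spearman correlation.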

## 1. Setup

In [None]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import easydict
from tqdm.auto import tqdm
from scipy.stats import pearsonr, spearmanr

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

In [None]:
# Evaluation settings read by the cells below.
cfg = easydict.EasyDict(
    # Prefer the first GPU; fall back to CPU so the notebook still runs end to
    # end on a runtime without CUDA instead of failing when the model is moved.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu',
    # Hugging Face Hub identifier used for both the tokenizer and the model.
    model_name = 'yongsun-yoon/bilingual-sentence-embedder-mMiniLMv2-L6-H384',
    # Number of sentence pairs per DataLoader batch.
    batch_size = 32,
    # Length passed to the tokenizer for padding/truncation.
    max_length = 128,
)

## 2. Semantic Textual Similarity

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that exposes an indexable collection unchanged.

    Items are returned exactly as stored in ``data``; no transformation is
    applied here (batching/tokenization is left to the collate function).
    """

    def __init__(self, data):
        """Store a reference to ``data``, which must support ``len`` and indexing."""
        self.data = data

    def __len__(self):
        """Number of items in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.data[idx]

def collate_fn(batch, tokenizer, max_length):
    """Convert a list of sentence-pair records into tokenized inputs and labels.

    Args:
        batch: sequence of records, each providing 'sentence1', 'sentence2'
            and a nested 'labels' mapping whose 'label' entry is the score.
        tokenizer: callable invoked on a list of strings with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_length: length forwarded to the tokenizer for padding/truncation.

    Returns:
        A tuple ``(sent1, sent2, labels)``: the tokenizer output for the first
        sentences, the tokenizer output for the second sentences, and a tensor
        built from the per-record labels.
    """
    def encode(field):
        # Both sides of the pair are tokenized with identical settings.
        texts = [record[field] for record in batch]
        return tokenizer(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    first = encode('sentence1')
    second = encode('sentence2')
    scores = [record['labels']['label'] for record in batch]
    return first, second, torch.tensor(scores)

def to_device(d, device):
    """Return a new dict mapping each key of ``d`` to ``value.to(device)``.

    ``d`` only needs an ``items()`` method; the input mapping itself is not
    modified.
    """
    moved = {}
    for name, value in d.items():
        moved[name] = value.to(device)
    return moved

In [None]:
# Load the tokenizer and the encoder published under cfg.model_name.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
# trust_remote_code=True permits loading custom modeling code from the model repository.
model = AutoModel.from_pretrained(cfg.model_name, trust_remote_code=True)

# Inference only: freeze parameters, switch to eval mode, move to the configured device.
model.requires_grad_(False)
model.eval()
# Bind the result to `_` so the cell does not display the module repr.
_ = model.to(cfg.device)

Downloading:   0%|          | 0.00/501 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/575 [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/463 [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/938 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Validation split of the KLUE STS dataset, wrapped for use with a DataLoader.
data = load_dataset('klue', 'sts')['validation']
dataset = Dataset(data)


def _collate(batch):
    """Bind the notebook-level tokenizer and max length into collate_fn."""
    return collate_fn(batch, tokenizer, cfg.max_length)


# shuffle=False keeps the original example order.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn=_collate,
)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Per-batch results are collected here and concatenated once after the loop.
pred_chunks, label_chunks = [], []

for batch1, batch2, batch_labels in tqdm(dataloader):
    batch1 = to_device(batch1, model.device)
    batch2 = to_device(batch2, model.device)

    with torch.no_grad():
        out1 = model(**batch1)
        out2 = model(**batch2)

    # Similarity is computed on CPU from the pooled embedding of each sentence.
    emb1 = out1.pooler_output.cpu()
    emb2 = out2.pooler_output.cpu()
    similarity = F.cosine_similarity(emb1, emb2, dim=-1)

    pred_chunks.append(similarity)
    label_chunks.append(batch_labels)

# Flatten to NumPy arrays: one predicted similarity and one label per pair.
preds = torch.cat(pred_chunks, dim=0).numpy()
labels = torch.cat(label_chunks, dim=0).numpy()

  0%|          | 0/17 [00:00<?, ?it/s]

In [None]:
# Correlate predicted cosine similarities with the labels; element 0 of each
# result is the correlation coefficient, which is all that gets reported.
pearson_result = pearsonr(preds, labels)
spearman_result = spearmanr(preds, labels)
pr, spr = pearson_result[0], spearman_result[0]

print(f'pearsonr {pr:.3f} | spearmanr {spr:.3f}')

pearsonr 0.757 | spearmanr 0.816
