<a href="https://colab.research.google.com/github/respect5716/deep-learning-paper-implementation/blob/main/03_NLP/SimCSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SimCSE

## 0. Info

### Paper
* title: SimCSE: Simple Contrastive Learning of Sentence Embeddings
* author: Tianyu Gao et al.
* url: https://arxiv.org/abs/2104.08821

### Features
* supervised learning
* dataset: klue-nli (train), klue-sts (eval)

### Reference
* https://github.com/princeton-nlp/SimCSE

## 1. Setup

In [None]:
!pip install -q transformers datasets

In [1]:
import easydict
from tqdm.auto import tqdm
import pandas as pd
from scipy.stats import pearsonr, spearmanr

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

In [2]:
config = easydict.EasyDict(
    model_name_or_path = 'klue/roberta-small',
    max_length = 64,
    batch_size = 16,
    num_epochs = 3,

    temperature = 0.05,
    lr = 5e-5
)

## 2. Data

In [78]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return item

In [79]:
def collate_fn(batch, tokenizer, max_length):
    sent = tokenizer([i['sentence'] for i in batch], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    pos = tokenizer([i['positive'] for i in batch], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    neg = tokenizer([i['negative'] for i in batch], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    return sent, pos, neg

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)

In [80]:
data = load_dataset('klue', 'nli')
train_data = data['train'].to_pandas()

pos = train_data.loc[train_data['label'] == 0]
neg = train_data.loc[train_data['label'] == 1]

pos = pos[['premise', 'hypothesis']]
pos.columns = ['sentence', 'positive']

neg = neg[['premise', 'hypothesis']]
neg.columns = ['sentence', 'negative']

data = pd.merge(pos,  neg, how='inner', on='sentence')
data.head()

Reusing dataset klue (/root/.cache/huggingface/datasets/klue/nli/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,sentence,positive,negative
0,힛걸 진심 최고다 그 어떤 히어로보다 멋지다,힛걸 진심 최고로 멋지다.,힛걸 액션 장면 진심 그 어떤 히어로보다 멋지다.
1,101빌딩 근처에 나름 즐길거리가 많습니다.,101빌딩 부근에서는 여러가지를 즐길수 있습니다.,101빌딩 주변에 젊은이들이 즐길거리가 많습니다.
2,10년 만에 찾는 피터를 웬디는 따뜻하게 맞이하고 피터는 성공리에 연설을 마치는데 ...,"피터 배닝, 잭, 매기는 남매사이다.",잭과 매기는 피터 배닝의 동생들이다.
3,"10년 전 한 병원에서 입원 중인 한 소녀가 실종되는 사건이 일어나지만, 목격자도 ...",10년 전 한 병원에서 입원 중인 소녀가 실종된다.,10년 후 과학 기술이 발달되어 10년전 찾지 못했던 소녀를 찾게된다.
4,"10년 전, 공동육아를 매개로 성미산 마을에 들어와 산 강석필, 홍형숙 부부는 맥가...","강석필, 홍형숙 부부가 성미산 마을에 들어온 것은 10년전이다.","강석필, 홍형숙 부부는 마을에서 좋은 평판을 가지고 있다."


In [82]:
dataset = Dataset(data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer, config.max_length))

In [83]:
sent, pos, neg = next(iter(dataloader))
sent['input_ids'].size(), pos['input_ids'].size(), neg['input_ids'].size()

(torch.Size([16, 64]), torch.Size([16, 64]), torch.Size([16, 64]))

## 3. Model

In [132]:
model = AutoModel.from_pretrained(config.model_name_or_path).cuda()

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [134]:
optim = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=0.)

## 4. Train

In [137]:
def to_device(d, device):
    return {k:v.to(device) for k,v in d.items()}

In [139]:
for ep in range(config.num_epochs):
    ep_loss = 0.
    pbar = tqdm(dataloader)
    for sent, pos, neg in pbar:
        sent, pos, neg = to_device(sent, model.device), to_device(pos, model.device), to_device(neg, model.device)
        
        sent_out = model(**sent)
        pos_out = model(**pos)
        neg_out = model(**neg)

        sent_emb = sent_out.pooler_output
        pos_emb = pos_out.pooler_output
        neg_emb = neg_out.pooler_output

        sent_pos_sim = F.cosine_similarity(sent_emb.unsqueeze(1), pos_emb.unsqueeze(0), dim=-1) / config.temperature
        sent_neg_sim = F.cosine_similarity(sent_emb.unsqueeze(1), neg_emb.unsqueeze(0), dim=-1) / config.temperature
        total_sim = torch.cat([sent_pos_sim, sent_neg_sim], dim=1)

        labels = torch.arange(total_sim.size(0)).to(total_sim.device)
        loss = F.cross_entropy(total_sim, labels)

        optim.zero_grad()
        loss.backward()
        optim.step()

        pbar.set_postfix({'loss': loss.item()})
        ep_loss += loss.item()

    ep_loss /= len(dataloader)
    print(f'ep {ep:02d} | loss {ep_loss:.3f}')

  0%|          | 0/485 [00:00<?, ?it/s]

ep 00 | loss 0.410


  0%|          | 0/485 [00:00<?, ?it/s]

ep 01 | loss 0.072


  0%|          | 0/485 [00:00<?, ?it/s]

ep 02 | loss 0.025


In [140]:
model_name = config.model_name_or_path.replace('/', '-')
model.save_pretrained(f'{model_name}-simcse')
tokenizer.save_pretrained(f'{model_name}-simcse')

('klue-roberta-small-simcse/tokenizer_config.json',
 'klue-roberta-small-simcse/special_tokens_map.json',
 'klue-roberta-small-simcse/vocab.txt',
 'klue-roberta-small-simcse/added_tokens.json',
 'klue-roberta-small-simcse/tokenizer.json')

## 5. Evaluate

In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item

def collate_fn(batch, tokenizer, max_length):
    sent1 = tokenizer([i['sentence1'] for i in batch], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    sent2 = tokenizer([i['sentence2'] for i in batch], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    labels = torch.tensor([i['labels']['label'] for i in batch])
    return sent1, sent2, labels

def to_device(d, device):
    return {k:v.to(device) for k,v in d.items()}

In [4]:
model_name = config.model_name_or_path.replace('/', '-')
model = AutoModel.from_pretrained(f'{model_name}-simcse').cuda()
tokenizer = AutoTokenizer.from_pretrained(f'{model_name}-simcse')

In [5]:
data = load_dataset('klue', 'sts')['validation']
dataset = Dataset(data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=False, collate_fn=lambda x: collate_fn(x, tokenizer, config.max_length))

Reusing dataset klue (/root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
preds, labels = [], []

for sent1, sent2, label in tqdm(dataloader):
    sent1, sent2 = to_device(sent1, model.device), to_device(sent2, model.device)
    with torch.no_grad():
        sent1_out = model(**sent1)
        sent2_out = model(**sent2)

    sent1_emb = sent1_out.pooler_output.cpu()
    sent2_emb = sent2_out.pooler_output.cpu()

    pred = F.cosine_similarity(sent1_emb, sent2_emb, dim=-1)
    preds.append(pred)
    labels.append(label)

preds = torch.cat(preds, dim=0).numpy()
labels = torch.cat(labels, dim=0).numpy()

  0%|          | 0/33 [00:00<?, ?it/s]

In [7]:
pr = pearsonr(preds, labels)[0]
spr = spearmanr(preds, labels)[0]

print(f'pearsonr {pr:.3f} | spearmanr {spr:.3f}')

pearsonr 0.728 | spearmanr 0.721
