<a href="https://colab.research.google.com/github/yongsun-yoon/deep-learning-paper-implementation/blob/main/03-natural-language-process/SetFit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SetFit

## 0. Info

### Paper
* title: Efficient Few-shot Learning with Sentence Transformers
* author: Lewis Tunstall
* url: https://arxiv.org/abs/2209.11055

### Features
* dataset: banking77

### Reference
* https://github.com/huggingface/setfit

## 1. Setup

In [4]:
!pip install -q transformers datasets pytorch_metric_learning

[K     |████████████████████████████████| 432 kB 32.7 MB/s 
[K     |████████████████████████████████| 115 kB 75.7 MB/s 
[K     |████████████████████████████████| 212 kB 72.9 MB/s 
[K     |████████████████████████████████| 163 kB 72.5 MB/s 
[K     |████████████████████████████████| 127 kB 74.1 MB/s 
[?25h

In [15]:
import os
import easydict
from glob import glob
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel
from pytorch_metric_learning import losses

In [2]:
cfg = easydict.EasyDict(
    dataset = 'banking77',
    device = 'cuda',
    pretrained = 'sentence-transformers/all-MiniLM-L6-v2',

    k = 5,
    batch_size = 16,
    num_epochs = 20,

    lr = 1e-4,
)

## 2. Data

In [3]:
def sample_data(df):
    sampled = []
    labels = df['label'].unique()
    for label in labels:
        subset = df.query(f"label == {label}")
        subset = subset.sample(min(len(subset), cfg.k), replace=False)
        sampled.append(subset)
    return pd.concat(sampled, ignore_index=True)

In [4]:
data = load_dataset(cfg.dataset)
train_data = data['train'].to_pandas()
eval_data = data['test'].to_pandas()



  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
train_data = sample_data(train_data)
train_ds = Dataset.from_pandas(train_data)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True)

## 3. Model

In [6]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained)

In [7]:
model = AutoModel.from_pretrained(cfg.pretrained)
_ = model.train().to(cfg.device)

## 4. Train

### 4.1. Fine-tune pretrained ST

In [8]:
def sample_pos(data, anchor_labels):
    texts, labels = [], []
    for l in anchor_labels:
        item = data.query(f'label == {l}').sample().iloc[0]
        texts.append(item['text'])
        labels.append(item['label'])
    return texts, labels


def sample_neg(data, anchor_labels):
    texts, labels = [], []
    for l in anchor_labels:
        item = data.query(f'label != {l}').sample().iloc[0]
        texts.append(item['text'])
        labels.append(item['label'])
    return texts, labels


def mean_pooling(token_embeddings, attention_mask):
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [9]:
loss_func = losses.TripletMarginLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)

In [10]:
for ep in range(cfg.num_epochs):
    pbar = tqdm(train_loader)
    for batch in pbar:
        anc_texts = batch['text']
        anc_labels = batch['label'].tolist()
        pos_texts, pos_labels = sample_pos(train_data, anc_labels)
        neg_texts, neg_labels = sample_neg(train_data, anc_labels)

        texts = anc_texts + pos_texts + neg_texts
        labels = anc_labels + pos_labels + neg_labels

        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(cfg.device)
        labels = torch.tensor(labels).to(cfg.device)

        token_embeddings = model(**inputs)[0]
        sentence_embeddings = mean_pooling(token_embeddings, inputs.attention_mask)
        loss = loss_func(sentence_embeddings, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

### 4.2. Train Classifier

In [11]:
def encode(model, tokenizer, texts, device, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

        with torch.no_grad():
            token_embeddings = model(**inputs)[0]
        sentence_embeddings = mean_pooling(token_embeddings, inputs.attention_mask).cpu()
        embeddings.append(sentence_embeddings)

    embeddings = torch.cat(embeddings, dim=0)
    return embeddings

In [12]:
texts = train_data['text'].tolist()
embeddings = encode(model, tokenizer, texts, cfg.device)

In [16]:
classifier = LogisticRegression()

In [17]:
X = embeddings.numpy()
y = train_data['label']
classifier.fit(X, y)

LogisticRegression()

## 5. Test

In [18]:
texts = eval_data['text'].tolist()
embeddings = encode(model, tokenizer, texts, cfg.device)

In [19]:
X = embeddings.numpy()
y = eval_data['label']
preds = classifier.predict(X)

In [20]:
acc = (preds == y).mean() * 100
print(f'Accuracy: {acc:.2f}')

Accuracy: 76.46
