<a href="https://colab.research.google.com/github/yongsun-yoon/deep-learning-paper-implementation/blob/main/03-natural-language-process/GPT3Mix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT3Mix

## 0. Info

### Paper
* title: GPT3Mix: Leveraging Large-scale Language Models for Text Augmentation
* author: Kang Min Yoo et al.
* url: https://arxiv.org/abs/2104.08826

### Feats
* dataset: gpt3mix/rt20

### Refs
* https://github.com/naver-ai/hypermix

## 1. Setup

In [1]:
!pip install -q transformers datasets openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m15.3 MB/s[0m et

In [2]:
import openai
import easydict
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [5]:
cfg = easydict.EasyDict(
    dataset_name = 'gpt3mix/rt20',
    openai_api_key = 'OPENAI API KEY',
    model_name = 'distilbert-base-uncased',
    device = 'cuda:0',
    batch_size = 16,
)

In [6]:
openai.api_key = cfg.openai_api_key

## 2. Data

In [7]:
def construct_prompt(examples):
    prompt = 'Each item in the following list contains a movie review and the respective sentiment. Sentiment is one of `positive` or `negative`.\n\n'
    for ex in examples:
        prompt += f"Movie review: {ex['text'].capitalize()} (Sentiment: {ex['label_name']})\n"
    prompt += 'Movie review:'
    return prompt

def softmax(x):
    return (np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())

def augment(dataset, label_names):
    examples = []
    for label in label_names:
        ex = dataset.loc[dataset['label_name']==label].sample().iloc[0].to_dict()
        examples.append(ex)
    
    prompt = construct_prompt(examples)

    response = openai.Completion.create(
        model = "davinci",
        prompt = [prompt],
        echo = False,
        logprobs = 100,
        max_tokens = 100,
        frequency_penalty = 0.01,
        stop = "\n"
    )

    for choice in response.choices:
        aug_text = choice['text'].split('(Sentiment')[0].strip()

        label_idx = [i for i,t in enumerate(choice['logprobs']['tokens']) if t == '\n'][0] - 2
        label_logprobs = choice['logprobs']['top_logprobs'][label_idx]
        neg_logprob = label_logprobs.get(' negative', -100)
        pos_logprob = label_logprobs.get(' positive', -100)

        pos_prob, neg_prob = softmax([pos_logprob, neg_logprob])
        label_name = 'positive' if pos_prob > neg_prob else 'negative'
        label = label_names.index(label_name)
        return {'text': aug_text, 'label': label, 'label_name': label_name, 'pos_prob': pos_prob, 'neg_prob': neg_prob}

In [8]:
dataset = load_dataset(cfg.dataset_name)

label_names = dataset['train'].features['label'].names
label_dict = {i:l for i,l in enumerate(label_names)}

Downloading builder script:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading and preparing dataset rt20/default to /root/.cache/huggingface/datasets/gpt3mix___rt20/default/0.0.0/0ffb781bbb1f45619a4c8067f6e98dbf9058834e17472feafddf2f63a14ae15e...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.0k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset rt20 downloaded and prepared to /root/.cache/huggingface/datasets/gpt3mix___rt20/default/0.0.0/0ffb781bbb1f45619a4c8067f6e98dbf9058834e17472feafddf2f63a14ae15e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
train_dataset = dataset['train'].to_pandas().sample(100)
valid_dataset = dataset['validation'].to_pandas()
test_dataset = dataset['test'].to_pandas()

In [10]:
train_dataset['label_name'] = train_dataset['label'].map(label_dict)
valid_dataset['label_name'] = valid_dataset['label'].map(label_dict)
test_dataset['label_name'] = test_dataset['label'].map(label_dict)

In [11]:
train_dataset['pos_prob'] = 1 - train_dataset['label'].astype('float')
train_dataset['neg_prob'] = train_dataset['label'].astype('float')

In [12]:
augmented = []
for _ in tqdm(range(len(train_dataset))):
    try:
        aug = augment(train_dataset, label_names)
        augmented.append(aug)
    except:
        continue

augmented = pd.DataFrame(augmented)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
train_dataset = pd.concat([train_dataset, augmented], ignore_index=True)
train_dataset.to_csv('train_dataset.csv', index=False)

In [15]:
augmented.head()

Unnamed: 0,text,label,label_name,pos_prob,neg_prob
0,A series of high - energy music numbers propel...,0,positive,0.945609,0.054391
1,The rare film that adequately captures how mir...,0,positive,0.998034,0.001966
2,While some of the concepts might polarise ... ...,0,positive,0.878777,0.121223
3,Tenet is built to absorb genre fanatics who wa...,0,positive,0.715986,0.284014
4,From its whisper of a plot to its thinly sketc...,1,negative,0.045139,0.954861


## 3. Train

In [16]:
def evaluate(model, tokenizer, dataset, device, batch_size=16):
    preds, labels = [], []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset.iloc[i:i+batch_size]

        inputs = tokenizer(batch['text'].tolist(), padding=True, truncation=True, max_length=256, return_tensors='pt')
        _labels = torch.tensor(batch['label'].values)
        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
        _preds = outputs.logits.argmax(dim=-1).cpu()
        preds.append(_preds)
        labels.append(_labels)

    preds = torch.cat(preds)
    labels = torch.cat(labels)
    acc = (preds == labels).float().mean().item() * 100.
    
    return acc

In [17]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name, num_labels=len(label_names))
_ = model.train().to(cfg.device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [18]:
pbar = tqdm(range(1, 1000+1))
for st in pbar:
    batch = train_dataset.sample(cfg.batch_size)
    inputs = tokenizer(batch['text'].tolist(), padding=True, truncation=True, max_length=256, return_tensors='pt')
    labels = torch.tensor(batch[['pos_prob', 'neg_prob']].values)
    inputs, labels = inputs.to(cfg.device), labels.to(cfg.device)

    outputs = model(**inputs)
    logits = outputs.logits
    loss = F.cross_entropy(logits, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    pbar.set_postfix({'loss': loss.item()})

    if st % 100 == 0:
        model.eval()
        acc = evaluate(model, tokenizer, test_dataset, cfg.device)
        print(f'st {st:05d} | acc {acc:.2f}')
        model.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

st 00100 | acc 68.00


  0%|          | 0/25 [00:00<?, ?it/s]

st 00200 | acc 66.25


  0%|          | 0/25 [00:00<?, ?it/s]

st 00300 | acc 68.75


  0%|          | 0/25 [00:00<?, ?it/s]

st 00400 | acc 68.50


  0%|          | 0/25 [00:00<?, ?it/s]

st 00500 | acc 68.75


  0%|          | 0/25 [00:00<?, ?it/s]

st 00600 | acc 68.75


  0%|          | 0/25 [00:00<?, ?it/s]

st 00700 | acc 68.00


  0%|          | 0/25 [00:00<?, ?it/s]

st 00800 | acc 68.50


  0%|          | 0/25 [00:00<?, ?it/s]

st 00900 | acc 67.25


  0%|          | 0/25 [00:00<?, ?it/s]

st 01000 | acc 68.25


In [19]:
model.eval()
acc = evaluate(model, tokenizer, test_dataset, cfg.device)
print(f'acc {acc:.2f}')

  0%|          | 0/25 [00:00<?, ?it/s]

acc 68.25
