# GPT2 as text classifier

[https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24](https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24)

In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers



## Load Dataset

TODO: Also load the article title; currently only the label and the text are used.

In [None]:
from datasets import load_dataset
# Load the 'ag_news' dataset; later cells read its 'train' and 'test' splits
# and the 'text' / 'label' columns of each example.
dataset = load_dataset('ag_news')


  from .autonotebook import tqdm as notebook_tqdm


Reduce the size of the dataset whilst keeping it balanced.

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

def take_a_percentage_of_data(dataset, percentage=0.1, shuffle=True, random_state=None):
    """Return a reduced copy of ``dataset`` that keeps the label proportions.

    The rows are sorted by their ``'label'`` column and, for every label, the
    first ``int(len(group) * percentage)`` rows are kept. Note that this takes
    the leading rows of each label group; it does not sample them randomly.

    Args:
        dataset: Tabular data with a ``'label'`` column, in any form accepted
            by ``pd.DataFrame``.
        percentage: Fraction of each label group to keep (``0.1`` keeps 10%).
        shuffle: When true, the kept rows are shuffled after selection;
            otherwise they stay grouped by label.
        random_state: Forwarded to ``DataFrame.sample`` for the shuffle.

    Returns:
        A ``Dataset`` built from the kept rows, with a fresh 0..n-1 index.
    """
    frame = pd.DataFrame(dataset).sort_values(by='label')

    # Keep the same fraction of every label so the class balance is unchanged.
    per_label_subsets = [
        rows.head(int(len(rows) * percentage))
        for _, rows in frame.groupby('label')
    ]
    subset = pd.concat(per_label_subsets)

    if shuffle:
        # frac=1 returns every row once, in random order.
        subset = subset.sample(frac=1, random_state=random_state)

    subset = subset.reset_index(drop=True)
    columns = subset.to_dict(orient='list')
    return Dataset.from_dict(columns)

# Keep 1% of every label in each split so that experiments run quickly.
# NOTE(review): random_state is left at its default (None), so the shuffle
# order is presumably not reproducible between runs — confirm if that matters.
dataset_train_1percent = take_a_percentage_of_data(dataset['train'], percentage=0.01)
dataset_test_1percent = take_a_percentage_of_data(dataset['test'], percentage=0.01)

# Bundle the reduced splits under the same 'train' / 'test' keys as the full dataset.
combined_dataset_1percent = DatasetDict({
    'train': dataset_train_1percent,
    'test': dataset_test_1percent
})

Tokenize the dataset with the pretrained GPT-2 tokenizer, so that the text is split into the same tokens the GPT-2 model expects.

In [None]:
from transformers import GPT2Tokenizer

# Work on the reduced (1%) dataset.
training_data = combined_dataset_1percent

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that padding can be
# requested below (presumably the GPT-2 tokenizer defines no pad token of its
# own — confirm).
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are padded with ``padding="max_length"`` and truncated when they
    are too long, so every encoded example has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the given texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply the tokenizer to every split; batched=True hands tokenize_function
# several examples per call instead of one.
tokenized_datasets = training_data.map(tokenize_function, batched=True)

Map: 100%|██████████| 1200/1200 [00:00<00:00, 1254.79 examples/s]
Map: 100%|██████████| 76/76 [00:00<00:00, 827.94 examples/s]


Load the GPT2-Model for sequence classification.

In [None]:
from transformers import GPT2ForSequenceClassification

# Pretrained GPT-2 with a 4-way classification head.
# NOTE: `model` is rebound to a CustomGPT2ForSequenceClassification instance in
# a later cell, so this pretrained model is not the one handed to the Trainer.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=4)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Install the `gaussian-adaptive-attention` package, which provides the Gaussian attention layers used below.

In [None]:
!pip3 install gaussian-adaptive-attention

Collecting gaussian-adaptive-attention
  Downloading gaussian_adaptive_attention-0.1.5-py3-none-any.whl.metadata (4.4 kB)
Downloading gaussian_adaptive_attention-0.1.5-py3-none-any.whl (8.7 kB)
Installing collected packages: gaussian-adaptive-attention
Successfully installed gaussian-adaptive-attention-0.1.5


Define a GPT-2 transformer block that uses a Gaussian adaptive attention mechanism instead of the standard attention function, which is based on matrix multiplications.

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Model
from gaussian_adaptive_attention import GaussianBlock, MultiHeadGaussianAdaptiveAttention

class GPT2BlockWithGaussian(nn.Module):
    """Pre-norm transformer block that uses Gaussian adaptive attention.

    The block applies ``LayerNorm -> attention -> residual add`` followed by
    ``LayerNorm -> MLP -> residual add`` and returns its result as a tuple so
    that it can stand in for a regular GPT-2 block.

    Args:
        config: Model configuration; ``hidden_size``, ``layer_norm_epsilon``
            and ``resid_pdrop`` are read from it.
        norm_axis: Forwarded to ``MultiHeadGaussianAdaptiveAttention``.
        num_heads: Forwarded to ``MultiHeadGaussianAdaptiveAttention``.
        num_gaussians: Forwarded to ``MultiHeadGaussianAdaptiveAttention``.
        padding_value: Forwarded to ``MultiHeadGaussianAdaptiveAttention``.
        eps: Forwarded to ``MultiHeadGaussianAdaptiveAttention``.
    """

    def __init__(self, config, norm_axis, num_heads, num_gaussians, padding_value=None, eps=1e-8):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.attn = MultiHeadGaussianAdaptiveAttention(norm_axis, num_heads, num_gaussians, padding_value, eps)
        # Feed-forward network: expand to 4x the hidden size, then project back.
        self.mlp = nn.Sequential(
            nn.Linear(config.hidden_size, 4 * config.hidden_size),
            nn.GELU(),
            nn.Linear(4 * config.hidden_size, config.hidden_size),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False, **kwargs):
        """Run the block on ``x`` and return a tuple of outputs.

        ``layer_past``, ``attention_mask``, ``head_mask`` and any extra keyword
        arguments are accepted for interface compatibility but are not used:
        the attention module only ever receives the normalized hidden states.

        Returns:
            ``(hidden_states,)``, followed by ``None`` when ``use_cache`` is
            true (caching is not supported) and by the attention weights, or
            ``None`` if the attention module did not return any, when
            ``output_attentions`` is true.
        """
        attn_outputs = self.attn(self.ln_1(x))

        # The attention module may return either a bare tensor or a tuple
        # (output, weights). Indexing a bare tensor with [0] would select the
        # first batch element and silently broadcast it over the whole batch,
        # so the two cases are told apart explicitly.
        if isinstance(attn_outputs, (tuple, list)):
            attn_out = attn_outputs[0]
            attn_weights = attn_outputs[1] if len(attn_outputs) > 1 else None
        else:
            attn_out = attn_outputs
            attn_weights = None

        # Residual connection around the attention sub-layer.
        x = x + attn_out
        # Residual connection around the feed-forward sub-layer.
        x = x + self.mlp(self.ln_2(x))

        # Build the tuple layout the caller expects.
        outputs = (x,)
        if use_cache:
            outputs += (None,)  # Placeholder: no key/value cache is produced.
        if output_attentions:
            outputs += (attn_weights,)
        return outputs


Define a new GPT-2 classification model in which every transformer block uses the Gaussian attention mechanism defined above.

In [None]:
class CustomGPT2ForSequenceClassification(GPT2ForSequenceClassification):
    """GPT-2 sequence classifier whose blocks use Gaussian adaptive attention.

    Every block of the underlying ``GPT2Model`` is replaced by a
    ``GPT2BlockWithGaussian`` and a bias-free linear layer maps the pooled
    hidden state to ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Built from the config only; no pretrained weights are loaded here.
        self.transformer = GPT2Model(config)

        # Replace all transformer blocks with our custom Gaussian blocks
        self.transformer.h = nn.ModuleList([
            GPT2BlockWithGaussian(config, norm_axis=2, num_heads=config.num_attention_heads,
                                  num_gaussians=10, padding_value=config.pad_token_id)
            for _ in range(config.num_hidden_layers)
        ])

        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Classify each sequence from the hidden state of its last real token.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask with non-zero entries for real
                tokens and zeros for padding, same layout as ``input_ids``.
                When given, the hidden state at the last non-zero position of
                each row is used; otherwise the final position is used.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed.

        Returns:
            A dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'``.
        """
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_state = outputs[0]  # Last layer hidden-state

        if attention_mask is None:
            pooled = hidden_state[:, -1]
        else:
            # With padded input the final position can be a padding token, so
            # pick the highest position whose mask entry is non-zero instead.
            positions = torch.arange(hidden_state.shape[1], device=hidden_state.device)
            is_token = (attention_mask != 0).to(device=hidden_state.device, dtype=positions.dtype)
            last_token = (positions * is_token).argmax(dim=-1)
            batch_index = torch.arange(hidden_state.shape[0], device=hidden_state.device)
            pooled = hidden_state[batch_index, last_token]

        logits = self.score(pooled)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return {'loss': loss, 'logits': logits}


In [None]:
from transformers import GPT2Config

# Reuse the 'gpt2' configuration, with the classifier set to 4 labels.
config = GPT2Config.from_pretrained('gpt2', num_labels=4)

# Build the Gaussian-attention classifier from the config. This rebinds
# `model`, so later cells use this model rather than the pretrained one.
model = CustomGPT2ForSequenceClassification(config)


In [None]:
# Show the module structure to check that the blocks were replaced.
print(model)

In [None]:
# Smoke test: one forward pass with random token ids and a dummy label.
input_ids = torch.randint(0, config.vocab_size, (1, 512))  # Random input ids
labels = torch.tensor([1]).unsqueeze(0)  # Example label

# Call `model`, the classifier built from `config`; the previously used name
# `gpt2_model` is not defined anywhere in the visible notebook.
outputs = model(input_ids=input_ids, labels=labels)
loss, logits = outputs['loss'], outputs['logits']
print(loss, logits)


tensor(1.2020, grad_fn=<NllLossBackward0>) tensor([[-0.2166,  0.4634,  0.1983,  0.5153]], grad_fn=<IndexBackward0>)


In [None]:
from transformers import Trainer, TrainingArguments

# Training hyperparameters; results are written to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    # One example per step on each device.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Train on the tokenized 'train' split and evaluate on the tokenized 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [None]:
# Name under which both the trained model and the tokenizer are saved.
model_name = "gpt2-ag_news-1percent"

trainer.train()
trainer.save_model(model_name)
# Save the tokenizer used by the model as well
tokenizer.save_pretrained(model_name)


  0%|          | 3/36000 [00:30<98:43:37,  9.87s/it] 

KeyboardInterrupt: 

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the tokenized 'test' split).
trainer.evaluate()

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model on the same device the inputs are moved to in predict_label.
model.to(device)

def predict_label(sentence):
    """Return the predicted class index for a single sentence.

    The sentence is tokenized with the module-level ``tokenizer``, moved to
    ``device`` and passed through the module-level ``model`` without gradient
    tracking. The function does not switch the model to evaluation mode; call
    ``model.eval()`` beforehand if that is required.

    Args:
        sentence: The text to classify.

    Returns:
        The index of the highest logit as a Python ``int``.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # The custom model's forward returns a plain dict, which has no
        # `.logits` attribute, so the logits are looked up by key.
        prediction = outputs["logits"].argmax(-1).item()

    return prediction

# Test the function with an example from the dataset
# (the second example of the full, un-reduced 'test' split).
text = dataset['test'][1]['text']
label = dataset['test'][1]['label']
print("Predicted label:", predict_label(text))
print("Expected label:", label)


In [None]:
from torch.utils.data import DataLoader

def calculate_accuracy(model, data):
    """Return the fraction of items in ``data`` that are classified correctly.

    Note that ``model`` is only switched to evaluation mode here; the
    predictions themselves come from ``predict_label``, which uses the
    module-level model.

    Args:
        model: Object with an ``eval()`` method, called once before predicting.
        data: Iterable of items with ``'text'`` and ``'label'`` entries.

    Returns:
        The accuracy as a float in [0, 1], or ``0.0`` when ``data`` is empty
        (instead of failing with a division by zero).
    """
    model.eval()

    with torch.no_grad():
        outcomes = [predict_label(item['text']) == item['label'] for item in data]

    if not outcomes:
        return 0.0
    return sum(outcomes) / len(outcomes)

# Measure accuracy on the full 'test' split, not on the reduced 1% subset.
data = dataset['test']

# Calculate accuracy
accuracy = calculate_accuracy(model, data)
print(f"Accuracy: {accuracy:.4f}")