# GPT2 as text classifier

[https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24](https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24)

In [1]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m307.2/542.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.

## Load Dataset

TODO: Also import title, now it is only labels and text.

In [2]:
from datasets import load_dataset
dataset = load_dataset('ag_news')


Downloading readme:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Reduce the size of the dataset whilst keeping it balanced.

In [3]:
from datasets import Dataset, DatasetDict
import pandas as pd

def take_a_percentage_of_data(dataset, percentage=0.1, shuffle=True, random_state=None):
    df = pd.DataFrame(dataset)
    df_sorted = df.sort_values(by='label')
    grouped = df_sorted.groupby('label')

    # ensure that proportions of the groups remains the same
    filtered_dfs = []
    for label, group in grouped:
        num_samples_to_keep = int(len(group) * percentage)
        filtered_group = group.head(num_samples_to_keep)
        filtered_dfs.append(filtered_group)

    filtered_df = pd.concat(filtered_dfs)
    if shuffle:
        filtered_df = filtered_df.sample(frac=1, random_state=random_state)

    # filtered_df = pd.concat(filtered_dfs)
    filtered_df.reset_index(drop=True, inplace=True)
    filtered_dict = filtered_df.to_dict(orient='list')
    filtered_dataset = Dataset.from_dict(filtered_dict)
    return filtered_dataset

dataset_train_1percent = take_a_percentage_of_data(dataset['train'], percentage=0.01)
dataset_test_1percent = take_a_percentage_of_data(dataset['test'], percentage=0.01)

combined_dataset_1percent = DatasetDict({
    'train': dataset_train_1percent,
    'test': dataset_test_1percent
})

Tokenize the dataset in the exact same way as the GPT-2 model.

In [4]:
from transformers import GPT2Tokenizer

training_data = combined_dataset_1percent

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = training_data.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Load the GPT2-Model for sequence classification.

In [5]:
from transformers import GPT2ForSequenceClassification

gpt2_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=4)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import GPT2ForSequenceClassification

gpt2_base_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=4)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from gaussian_adaptive_attention import MultiHeadGaussianAdaptiveAttention

model = gpt2_model

class MultiHeadGaussianAdaptiveAttentionWrapper(nn.Module):
    def __init__(self, config, num_gaussians=10):
        super().__init__()
        self.attention = MultiHeadGaussianAdaptiveAttention(
            norm_axis=2,
            num_heads=config.num_attention_heads,
            num_gaussians=num_gaussians,
            padding_value=config.pad_token_id,
            eps=config.layer_norm_epsilon
        )


    def forward(self, hidden_states, **kwargs):
        # Pass arguments using **kwargs to the underlying attention mechanism
        attention_output = self.attention(hidden_states)
        return (hidden_states,) + tuple(attention_output)  # Ensure the return value is a tuple


# Replace the attention mechanism in each transformer block
for block in model.transformer.h:
    block.attn = MultiHeadGaussianAdaptiveAttentionWrapper(model.config)


Download the package provided by the paper.

In [8]:
!pip3 install gaussian-adaptive-attention

Collecting gaussian-adaptive-attention
  Downloading gaussian_adaptive_attention-0.1.5-py3-none-any.whl (8.7 kB)
Installing collected packages: gaussian-adaptive-attention
Successfully installed gaussian-adaptive-attention-0.1.5


Define a GPT-2 transformer block that uses a Gaussian attention mechanism rather than an attention function that is based on matrix multiplications

In [25]:
print(model)
print(gpt2_base_model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadGaussianAdaptiveAttentionWrapper(
          (attention): MultiHeadGaussianAdaptiveAttention(
            (attention_heads): ModuleList(
              (0-11): 12 x GaussianAdaptiveAttention()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=4, bias=False)
)
GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (

In [26]:
for param in model.transformer.parameters():
    param.requires_grad = False

for param in model.score.parameters():
    param.requires_grad = True

for block in model.transformer.h:
  for param in block.attn.parameters():
      param.requires_grad = True


In [27]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)

print(f"Trainable Parameters: {trainable_params}")
print(f"Frozen Parameters: {frozen_params}")

Trainable Parameters: 5952
Frozen Parameters: 96091392


In [30]:
input_ids = torch.randint(0, config.vocab_size, (1, 512))
labels = torch.tensor([1]).unsqueeze(0)

outputs = gpt2_model(input_ids=input_ids, labels=labels)
loss, logits = outputs['loss'], outputs['logits']
print(loss, logits)


tensor(1.3229, grad_fn=<NllLossBackward0>) tensor([[-0.8242,  0.4077,  0.3528,  0.8237]], grad_fn=<IndexBackward0>)


In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [16]:
model_name = "gpt2-ag_news-1percent"

trainer.train()
trainer.save_model(model_name)
# Save the tokenizer used by the model as well
tokenizer.save_pretrained(model_name)


Step,Training Loss
500,1.4097
1000,1.4009
1500,1.4079
2000,1.3978
2500,1.401
3000,1.3954
3500,1.3934
4000,1.39
4500,1.3954
5000,1.3844


('gpt2-ag_news-1percent/tokenizer_config.json',
 'gpt2-ag_news-1percent/special_tokens_map.json',
 'gpt2-ag_news-1percent/vocab.json',
 'gpt2-ag_news-1percent/merges.txt',
 'gpt2-ag_news-1percent/added_tokens.json')

In [17]:
trainer.evaluate()

{'eval_loss': 1.3872193098068237,
 'eval_runtime': 24.2315,
 'eval_samples_per_second': 3.136,
 'eval_steps_per_second': 3.136,
 'epoch': 5.0}

In [18]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

def predict_label(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        prediction = outputs.logits.argmax(-1).item()

    return prediction

# Test the function with an example from the dataset
text = dataset['test'][1]['text']
label = dataset['test'][1]['label']
print("Predicted label:", predict_label(text))
print("Expected label:", label)


Predicted label: 1
Expected label: 3


In [21]:
from torch.utils.data import DataLoader

def calculate_accuracy(model, data):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for item in data:
            prediction = predict_label(item['text'])
            label = item['label']
            total += 1
            if label == prediction:
              correct += 1

    return correct / total

data = combined_dataset_1percent['test']

# Calculate accuracy
accuracy = calculate_accuracy(model, data)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.2632
