# GPT2 as text classifier

[https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24](https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24)

In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip3 install gaussian-adaptive-attention

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-an

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m276.5/297.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.29.3
Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.0
    Uninstalling transformers-4.40.0:
      Successfully uninstalled transformers-4.40.0
Successfully installed transformer

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer
from gaussian_adaptive_attention import MultiHeadGaussianAdaptiveAttention, GaussianAdaptiveAttention
import torch
import torch.nn as nn
from datasets import load_dataset


## Load Dataset

TODO: Also import title, now it is only labels and text.

In [25]:
dataset = load_dataset('ag_news')

Reduce the size of the dataset whilst keeping it balanced.

In [36]:

def take_a_percentage_of_data(dataset, percentage=0.1, shuffle=True, random_state=None):
    df = pd.DataFrame(dataset)
    df_sorted = df.sort_values(by='label')
    grouped = df_sorted.groupby('label')

    # ensure that proportions of the groups remains the same
    filtered_dfs = []
    for label, group in grouped:
        num_samples_to_keep = int(len(group) * percentage)
        filtered_group = group.head(num_samples_to_keep)
        filtered_dfs.append(filtered_group)

    filtered_df = pd.concat(filtered_dfs)
    if shuffle:
        filtered_df = filtered_df.sample(frac=1, random_state=random_state)

    # filtered_df = pd.concat(filtered_dfs)
    filtered_df.reset_index(drop=True, inplace=True)
    filtered_dict = filtered_df.to_dict(orient='list')
    filtered_dataset = Dataset.from_dict(filtered_dict)
    return filtered_dataset

dataset_train_1percent = take_a_percentage_of_data(dataset['train'], percentage=0.01)
dataset_test_1percent = take_a_percentage_of_data(dataset['test'], percentage=0.01)

combined_dataset_1percent = DatasetDict({
    'train': dataset_train_1percent,
    'test': dataset_test_1percent
})

Tokenize the dataset in the exact same way as the GPT-2 model.

In [37]:

training_data = combined_dataset_1percent

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = training_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Load the GPT2-Model for sequence classification.

In [28]:
from transformers import GPT2ForSequenceClassification

gpt2_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=4)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:

gpt2_base_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=4)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:


model = gpt2_model

class MultiHeadGaussianAdaptiveAttentionWrapper(nn.Module):
    def __init__(self, config, num_gaussians=10):
        super().__init__()
        self.attention = MultiHeadGaussianAdaptiveAttention(
            norm_axis=2,
            num_heads=8,
            num_gaussians=num_gaussians,
            padding_value=config.pad_token_id,
            eps=config.layer_norm_epsilon
        )



    def forward(self, hidden_states, **kwargs):
        # Pass arguments using **kwargs to the underlying attention mechanism
        attention_output = self.attention(hidden_states)
        return (hidden_states,) + tuple(attention_output)  # Ensure the return value is a tuple


# Replace the attention mechanism in each transformer block
for block in model.transformer.h:
    block.attn = MultiHeadGaussianAdaptiveAttentionWrapper(model.config)


In [31]:
def init_weights(module):
    if isinstance(module, GaussianAdaptiveAttention):
        # Assuming mean_offsets and c are the names of the parameters in your GaussianAdaptiveAttention implementation
        nn.init.constant_(module.mean_offsets, 0)  # Initialize mean_offsets to 0
        nn.init.constant_(module.c, 2)             # Initialize c to 2

model.apply(init_weights)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadGaussianAdaptiveAttentionWrapper(
          (attention): MultiHeadGaussianAdaptiveAttention(
            (attention_heads): ModuleList(
              (0-7): 8 x GaussianAdaptiveAttention()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=4, bias=False)
)

Download the package provided by the paper.

Define a GPT-2 transformer block that uses a Gaussian attention mechanism rather than an attention function that is based on matrix multiplications

In [32]:
for param in model.transformer.parameters():
    param.requires_grad = False

for param in model.score.parameters():
    param.requires_grad = True

for block in model.transformer.h:
  for param in block.attn.parameters():
      param.requires_grad = True


In [None]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)

print(f"Trainable Parameters: {trainable_params}")
print(f"Frozen Parameters: {frozen_params}")

Trainable Parameters: 4992
Frozen Parameters: 96091392


In [33]:
print(model)
print(gpt2_base_model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadGaussianAdaptiveAttentionWrapper(
          (attention): MultiHeadGaussianAdaptiveAttention(
            (attention_heads): ModuleList(
              (0-7): 8 x GaussianAdaptiveAttention()
            )
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=4, bias=False)
)
GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wt

In [None]:
input_ids = torch.randint(0, config.vocab_size, (1, 512))
labels = torch.tensor([1]).unsqueeze(0)

outputs = gpt2_model(input_ids=input_ids, labels=labels)
loss, logits = outputs['loss'], outputs['logits']
print(loss, logits)


NameError: name 'config' is not defined

In [38]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=35,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [None]:
model_name = "gpt2-ag_news-1percent"

trainer.train()
trainer.save_model(model_name)
# Save the tokenizer used by the model as well
tokenizer.save_pretrained(model_name)


Step,Training Loss
500,1.4029
1000,1.3951
1500,1.4031
2000,1.3987
2500,1.3938
3000,1.3858
3500,1.3934
4000,1.3951
4500,1.399
5000,1.4013


In [None]:
trainer.evaluate()

{'eval_loss': 1.3863459825515747,
 'eval_runtime': 16.4819,
 'eval_samples_per_second': 4.611,
 'eval_steps_per_second': 4.611,
 'epoch': 5.0}

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

def predict_label(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        prediction = outputs.logits.argmax(-1).item()

    return prediction

# Test the function with an example from the dataset
text = dataset['test'][1]['text']
label = dataset['test'][1]['label']
print("Predicted label:", predict_label(text))
print("Expected label:", label)


Predicted label: 3
Expected label: 3


In [None]:
from torch.utils.data import DataLoader

def calculate_accuracy(model, data):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for item in data:
            prediction = predict_label(item['text'])
            label = item['label']
            total += 1
            if label == prediction:
              correct += 1

    return correct / total

data = combined_dataset_1percent['test']

# Calculate accuracy
accuracy = calculate_accuracy(model, data)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.2447
