In [4]:
import torch

from transformers import GPT2Model, GPT2Config, GPT2Tokenizer

class GPT2WithIntermediateOutputs(GPT2Model):

    def __init__(self, config):

        super().__init__(config)

    def forward(self, input_ids, attention_mask=None):

        # Get embeddings from input

        input_shape = input_ids.size()

        input_ids = input_ids.view(-1, input_shape[-1])

        device = input_ids.device

        if attention_mask is None:

            attention_mask = torch.ones(input_shape, device=device)

        # Prepare attention mask

        if attention_mask.dim() == 3:

            extended_attention_mask = attention_mask[:, None, :, :]

        elif attention_mask.dim() == 2:

            extended_attention_mask = attention_mask[:, None, None, :]

        # Prepare head mask if needed

        head_mask = self.get_head_mask(None, self.config.n_layer)

        # Transformer layers

        hidden_states = self.wte(input_ids) + self.wpe(torch.arange(0, input_shape[-1], device=device))

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        all_hidden_states = torch.empty((*output_shape, self.config.n_layer + 1), device=device)

        all_hidden_states[..., 0] = hidden_states

        for i, (block, layer_past) in enumerate(zip(self.h, [None]*len(self.h))):

            outputs = block(hidden_states, layer_past=layer_past, attention_mask=extended_attention_mask, head_mask=head_mask[i])

            hidden_states = outputs[0]

            all_hidden_states[..., i+1] = hidden_states

        # Concatenate all hidden states

        concatenated_outputs = all_hidden_states.permute(2, 0, 1, 3).reshape(self.config.n_layer + 1, -1, hidden_states.size(-1))

        return concatenated_outputs
 

# Example usage:

config = GPT2Config.from_pretrained("gpt2")

model = GPT2WithIntermediateOutputs(config)

print(model)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

input_ids = torch.tensor([tokenizer.encode("if the vocabulary is known, then the sequence length is correct")])

# Output will have dimensions: [num_layers + 1, batch_size, seq_length, features] (discard element 0 , that's the embedding initial layer)

outputs = model(input_ids)


print(outputs.shape)  # Output dimensions

GPT2WithIntermediateOutputs(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
torch.Size([13, 12, 768])


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer

class GPT2WithIntermediateOutputsAndClassifier(GPT2Model):
    def __init__(self, config, num_classes):
        super().__init__(config)
        self.num_classes = num_classes
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=config.n_embd, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=512, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU()
        )
        self.classifier = nn.Linear(256, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get GPT-2 outputs
        outputs = super().forward(input_ids, attention_mask=attention_mask).last_hidden_state

        # Apply convolutional layers
        conv_output = self.conv(outputs.permute(0, 2, 1))

        # Mean pooling and linear classifier
        pooled_output = conv_output.mean(dim=-1)
        logits = self.classifier(pooled_output)

        # Calculate loss if labels are provided
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return loss, logits
        return logits

# Example usage:
config = GPT2Config.from_pretrained("gpt2")
num_classes = 4

model = GPT2WithIntermediateOutputsAndClassifier(config, num_classes)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [11]:
from datasets import load_dataset
dataset = load_dataset('ag_news')

In [15]:
import pandas as pd
from datasets import Dataset, DatasetDict


In [16]:

def take_a_percentage_of_data(dataset, percentage=0.1, shuffle=True, random_state=None):
    df = pd.DataFrame(dataset)
    df_sorted = df.sort_values(by='label')
    grouped = df_sorted.groupby('label')

    # ensure that proportions of the groups remains the same
    filtered_dfs = []
    for label, group in grouped:
        num_samples_to_keep = int(len(group) * percentage)
        filtered_group = group.head(num_samples_to_keep)
        filtered_dfs.append(filtered_group)

    filtered_df = pd.concat(filtered_dfs)
    if shuffle:
        filtered_df = filtered_df.sample(frac=1, random_state=random_state)

    # filtered_df = pd.concat(filtered_dfs)
    filtered_df.reset_index(drop=True, inplace=True)
    filtered_dict = filtered_df.to_dict(orient='list')
    filtered_dataset = Dataset.from_dict(filtered_dict)
    return filtered_dataset

dataset_train_1percent = take_a_percentage_of_data(dataset['train'], percentage=0.01)
dataset_test_1percent = take_a_percentage_of_data(dataset['test'], percentage=0.01)

combined_dataset_1percent = DatasetDict({
    'train': dataset_train_1percent,
    'test': dataset_test_1percent
})

In [17]:

training_data = combined_dataset_1percent

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = training_data.map(tokenize_function, batched=True)

Map: 100%|██████████| 1200/1200 [00:01<00:00, 849.93 examples/s]
Map: 100%|██████████| 76/76 [00:00<00:00, 896.93 examples/s]


In [23]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",  # Specify the directory for logs
    logging_steps=10,  # Log every 10 steps
    save_steps=100,  # Save the model every 100 steps
    evaluation_strategy="steps",  # Evaluate at each logging step
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [24]:
model_name = "gpt2-ag_news-1percent"

trainer.train()
trainer.save_model(model_name)
# Save the tokenizer used by the model as well
tokenizer.save_pretrained(model_name)


  0%|          | 0/6000 [02:38<?, ?it/s]
                                                    
  0%|          | 10/6000 [01:55<19:19:51, 11.62s/it]

{'loss': 1.5962, 'grad_norm': 15.322352409362793, 'learning_rate': 1.9966666666666666e-05, 'epoch': 0.01}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

In [7]:
import torch
from transformers import GPT2Model, GPT2Config
from gaussian_adaptive_attention import MultiHeadGaussianAdaptiveAttention

class GPT2WithGAAM(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        self.gaam_attention = MultiHeadGaussianAdaptiveAttention(
            norm_axis=-1,  # Adjust based on how you want to normalize
            num_heads=config.n_head,
            num_gaussians=3,  # Adjust based on your needs
            padding_value=config.pad_token_id,
        )
import torch.nn as nn

class ClassifierWithGAAM(nn.Module):
    def __init__(self, num_layers, num_classes, hidden_dim, num_gaussians):
        super().__init__()
        self.gaam = MultiHeadGaussianAdaptiveAttention(
            norm_axis=2,  # Adjust based on requirements
            num_heads=8,  # Example head count
            num_gaussians=num_gaussians,
            padding_value=0
        )
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1))
        self.fc = nn.Linear(num_layers * hidden_dim * 64, num_classes)

    def forward(self, x):
        x = self.gaam(x)
        x = self.conv1(x.unsqueeze(1))
        x = nn.ReLU()(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Example usage
num_layers, num_classes, hidden_dim, num_gaussians = 12, 2, 768, 3
classifier = ClassifierWithGAAM(num_layers, num_classes, hidden_dim, num_gaussians)



In [8]:
print(model)

GPT2WithIntermediateOutputs(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [2]:

# Assuming `preprocessed_data` is loaded from the pickle file and is a list of (tensor, label) tuples
for x, label in preprocessed_data:
    output = classifier(x)
    print(output)

    def forward(self, input_ids, attention_mask=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Apply multi-head GAAM
        gaam_output = self.gaam_attention(last_hidden_state)

        return gaam_output

# Example usage
config = GPT2Config.from_pretrained("gpt2")
model = GPT2WithGAAM(config)

# The rest of the code remains the same as in your earlier usage

GPT2WithGAAM(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (gaam_attention): MultiHeadGaussianAdaptiveAttention(
    (attention_heads): ModuleList(
      (0-11): 12 x GaussianAdaptiveAttention()
    )
  )
)


In [5]:
# Run model
outputs_2 = model(outputs)

# Output the resulting tensor (or process further)
print(outputs_2)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)