In [2]:
import torch

from transformers import GPT2Model, GPT2Config, GPT2Tokenizer

class GPT2WithIntermediateOutputs(GPT2Model):

    def __init__(self, config):

        super().__init__(config)

    def forward(self, input_ids, attention_mask=None):

        # Get embeddings from input

        input_shape = input_ids.size()

        input_ids = input_ids.view(-1, input_shape[-1])

        device = input_ids.device

        if attention_mask is None:

            attention_mask = torch.ones(input_shape, device=device)

        # Prepare attention mask

        if attention_mask.dim() == 3:

            extended_attention_mask = attention_mask[:, None, :, :]

        elif attention_mask.dim() == 2:

            extended_attention_mask = attention_mask[:, None, None, :]

        # Prepare head mask if needed

        head_mask = self.get_head_mask(None, self.config.n_layer)

        # Transformer layers

        hidden_states = self.wte(input_ids) + self.wpe(torch.arange(0, input_shape[-1], device=device))

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        all_hidden_states = torch.empty((*output_shape, self.config.n_layer + 1), device=device)

        all_hidden_states[..., 0] = hidden_states

        for i, (block, layer_past) in enumerate(zip(self.h, [None]*len(self.h))):

            outputs = block(hidden_states, layer_past=layer_past, attention_mask=extended_attention_mask, head_mask=head_mask[i])

            hidden_states = outputs[0]

            all_hidden_states[..., i+1] = hidden_states

        # Concatenate all hidden states

        concatenated_outputs = all_hidden_states.permute(2, 0, 1, 3).reshape(self.config.n_layer + 1, -1, hidden_states.size(-1))

        return concatenated_outputs
 

# Example usage:

config = GPT2Config.from_pretrained("gpt2")

model = GPT2WithIntermediateOutputs(config)

print(model)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

input_ids = torch.tensor([tokenizer.encode("if the vocabulary is known, then the sequence length is correct")])

# Output will have dimensions: [num_layers + 1, batch_size, seq_length, features] (discard element 0 , that's the embedding initial layer)

outputs = model(input_ids)


print(outputs.shape)  # Output dimensions

  from .autonotebook import tqdm as notebook_tqdm


GPT2WithIntermediateOutputs(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
torch.Size([13, 12, 768])


In [49]:
import torch
import pickle
from transformers import GPT2Tokenizer

# Example dataset with labels
data = [("Hello, world!", 1), ("This is a test.", 0)]

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2WithIntermediateOutputs.from_pretrained("gpt2")

preprocessed_data = []

for text, label in data:
    input_ids = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        intermediate_outputs = gpt2_model(input_ids)
    # Ensure the outputs align with the classifier
    preprocessed_data.append((intermediate_outputs, label))

# Save preprocessed data
with open("preprocessed_data_with_labels.pkl", "wb") as f:
    pickle.dump(preprocessed_data, f)

import torch.nn as nn
from gaussian_adaptive_attention import MultiHeadGaussianAdaptiveAttention

import torch.nn as nn
from gaussian_adaptive_attention import MultiHeadGaussianAdaptiveAttention

class ClassifierWithGAAM(nn.Module):
    def __init__(self, num_classes, num_gaussians):
        super().__init__()
        self.gaam = MultiHeadGaussianAdaptiveAttention(
            norm_axis=-1,
            num_heads=5,
            num_gaussians=num_gaussians,
            padding_value=0
        )
        
        # Adjusted convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        
        # Set a placeholder linear layer; dynamically update in forward
        self.fc = None

    def forward(self, x):
        print("Input shape:", x.shape)
        x = self.gaam(x)
        print("GAAM output shape:", x.shape)
        
        # Add a channel dimension if GAAM output is 3D
        if x.dim() == 3:
            x = x.unsqueeze(1)
        print("After unsqueeze:", x.shape)

        x = self.conv1(x)
        print("After conv1:", x.shape)

        x = nn.ReLU()(x)
        print("After ReLU1:", x.shape)

        x = self.conv2(x)
        print("After conv2:", x.shape)

        x = nn.ReLU()(x)
        print("After ReLU2:", x.shape)

        x = x.view(x.size(0), -1)
        print("After flattening:", x.shape)

        flattened_features = x.size(1)
        print("Flattened features:", flattened_features)

        self.fc = nn.Linear(flattened_features, num_classes)
        
        return self.fc(x)


# Initialize the classifier with appropriate dimensions
num_layers, num_classes, hidden_dim, num_gaussians = 12, 2, 768, 3
classifier = ClassifierWithGAAM(num_classes, num_gaussians)

# Load preprocessed data with labels
with open("preprocessed_data_with_labels.pkl", "rb") as f:
    preprocessed_data = pickle.load(f)

# Training classifier
for x, label in preprocessed_data:
    output = classifier(x)
    print(f"Predicted: {torch.argmax(output)}, Actual: {label}")


Predicted: 12, Actual: 1


RuntimeError: mat1 and mat2 shapes cannot be multiplied (13x24576 and 12288x1)