In [1]:
!pip install -U datasets



In [2]:
# Requisite imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
# Import transformers
from transformers import AutoTokenizer, AutoModel
import json
import pickle
from tqdm import tqdm
from datasets import load_dataset
import torch

In [3]:
# Load the tokenizer and the bare transformer for the Pythia-70M checkpoint
MODEL_NAME = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [4]:
# Move the model to the GPU when one is available; otherwise stay on the CPU
# so the notebook still runs (slowly) on machines without CUDA.
# The call is left as the cell's last expression so the model summary is shown.
model.to("cuda" if torch.cuda.is_available() else "cpu")

GPTNeoXModel(
  (embed_in): Embedding(50304, 512)
  (emb_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-5): 6 x GPTNeoXLayer(
      (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (post_attention_dropout): Dropout(p=0.0, inplace=False)
      (post_mlp_dropout): Dropout(p=0.0, inplace=False)
      (attention): GPTNeoXAttention(
        (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
        (dense): Linear(in_features=512, out_features=512, bias=True)
      )
      (mlp): GPTNeoXMLP(
        (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
        (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        (act): GELUActivation()
      )
    )
  )
  (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (rotary_emb): GPTNeoXRotaryEmbedding()
)

In [5]:
# Dataset used for the exploratory cells: the test split of the raw
# WikiText-2 corpus, loaded in full (not streamed).
wikitext = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="test",
)


In [6]:
# Pull one arbitrary row out of the dataset to see what a record looks like
SAMPLE_INDEX = 625
sentence = wikitext[SAMPLE_INDEX]
print(sentence)

{'text': ' = = = Torpedoes = = = \n'}


In [7]:
# Count how many tokens the tokenizer produces for the sampled text
sample_text = sentence["text"]
tokens = tokenizer.tokenize(sample_text)
print(len(tokens))

10


In [8]:
# Find the first sample with at least 25 tokens, run it through the model with
# attention outputs enabled, and print the attention shapes.
# `outputs` is intentionally left in scope: the following cells read it.
for sample in wikitext:
    text = sample["text"]
    tokens = tokenizer.tokenize(text)

    # Skip rows that are too short to be interesting (headings, blank lines)
    if len(tokens) < 25:
        continue
    print(len(tokens))

    # Place the inputs on whatever device the model currently lives on
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # Inference only: do not build an autograd graph for the forward pass
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    # Print the shape of the first layer's attentions, then of every layer
    print(outputs.attentions[0].shape)
    for layer in outputs.attentions:
        print(layer.shape)
    break



Setting `attention_type` to `eager` because `sdpa` does not support `output_attentions=True` or `head_mask`.


185
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])
torch.Size([1, 8, 185, 185])


In [9]:
# Keep the attention maps returned by the forward pass above for later analysis.
# NOTE(review): going by the shapes printed above, each entry appears to be
# (batch, heads, seq, seq) — confirm against the model documentation.
attention_weights = outputs.attentions

In [10]:
### These are the attention weights after softmax

In [11]:
# Next, compute a per-token importance score from the attention weights (e.g. column-wise means)

In [12]:
# Strip the leading batch axis from each layer's attention tensor
attention_weights = [layer_attn.squeeze(0) for layer_attn in attention_weights]

# Work on the first layer only: collapse the head axis by averaging ...
first_layer_attn = attention_weights[0]
aggregate_attn_weights = torch.mean(first_layer_attn, dim=0)

# ... then average down the rows, leaving one score per column
column_wise_mean = torch.mean(aggregate_attn_weights, dim=0)
print(column_wise_mean.shape)

# Sorting these scores yields an ordering over positions




torch.Size([185])


In [13]:
# Show the per-column mean attention scores computed in the previous cell
print(column_wise_mean)

tensor([0.0142, 0.0081, 0.0067, 0.0046, 0.0178, 0.0189, 0.0054, 0.0095, 0.0087,
        0.0076, 0.0157, 0.0082, 0.0100, 0.0036, 0.0097, 0.0074, 0.0140, 0.0034,
        0.0073, 0.0109, 0.0063, 0.0146, 0.0059, 0.0120, 0.0106, 0.0048, 0.0041,
        0.0125, 0.0036, 0.0137, 0.0035, 0.0027, 0.0072, 0.0100, 0.0050, 0.0105,
        0.0114, 0.0120, 0.0052, 0.0121, 0.0093, 0.0038, 0.0040, 0.0021, 0.0037,
        0.0093, 0.0034, 0.0025, 0.0057, 0.0071, 0.0090, 0.0053, 0.0123, 0.0035,
        0.0076, 0.0094, 0.0050, 0.0058, 0.0059, 0.0023, 0.0062, 0.0054, 0.0103,
        0.0022, 0.0043, 0.0119, 0.0081, 0.0042, 0.0033, 0.0051, 0.0040, 0.0039,
        0.0042, 0.0115, 0.0037, 0.0020, 0.0043, 0.0029, 0.0030, 0.0023, 0.0017,
        0.0049, 0.0101, 0.0036, 0.0031, 0.0079, 0.0023, 0.0077, 0.0089, 0.0079,
        0.0061, 0.0072, 0.0038, 0.0040, 0.0024, 0.0044, 0.0072, 0.0139, 0.0075,
        0.0037, 0.0026, 0.0097, 0.0018, 0.0026, 0.0024, 0.0042, 0.0075, 0.0036,
        0.0044, 0.0029, 0.0027, 0.0081, 

In [14]:
# Rank positions from highest to lowest mean attention score
ordering = torch.argsort(
    column_wise_mean,
    dim=-1,
    descending=True,
)
print(ordering)

tensor([  5,   4,  10,  21,   0,  16,  97,  29,  27, 124,  52,  39,  37,  23,
         65,  73,  36,  19,  24,  35,  62,  82,  12,  33,  14, 101,   7,  55,
         40,  45, 119,  50,  88,   8, 140,  11,  66, 111,   1,  89,  85, 120,
         87,   9,  54, 142, 106,  98,  15,  18,  96,  91,  32, 137,  49, 148,
          2, 125, 158, 117,  20,  60,  90, 143, 134,  58,  22,  57,  48, 154,
        136,   6, 152,  61,  51, 133,  38,  69,  56,  34,  81,  25, 116,   3,
        112,  95, 123, 108,  76,  64,  72, 105,  67,  26,  93,  42,  70,  71,
        122,  92, 132, 151,  41,  44,  74,  99,  28,  83,  13, 162, 107,  30,
         53,  46, 113,  17, 159, 139, 127,  68,  84,  78, 146, 135, 109, 126,
        155,  77, 149, 114, 179, 141, 165, 121, 110, 169,  31, 163, 100, 166,
        103,  47, 118, 161, 144, 104,  94, 167,  79,  86, 174,  59, 153, 180,
        170, 157, 129,  63,  43, 172, 130,  75, 176, 115, 173, 131, 156, 181,
        102, 177, 128, 150,  80, 138, 164, 160, 145, 178, 147, 1

In [15]:


# For each WikiText-2 split, run every sufficiently long sample through the
# model and record, per layer, the ordering of token positions by their mean
# attention score (averaged over heads, then over rows of the attention
# matrix). Results are pickled to "orderings_<split>.pkl".
splits = [
    'test', 'train'
]

for split in splits:
    wikitext = load_dataset("wikitext", "wikitext-2-raw-v1", split=split)
    output_path = "orderings_" + split + ".pkl"

    # Read the layer count from the loaded model instead of hard-coding it
    num_layers = model.config.num_hidden_layers
    print("num layers", num_layers)

    # all_orderings[layer] collects one index array per processed sample
    all_orderings = [[] for _ in range(num_layers)]
    # Number of samples processed successfully so far
    i = 0
    for sample in wikitext:
        text = sample["text"]
        # Skip short rows; note the threshold is in characters, not tokens
        if len(text) < 125:
            continue

        # Place the inputs on whatever device the model currently lives on
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        try:
            # Inference only: do not build an autograd graph
            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True)
        except Exception as err:
            # Best effort: report the failing sample and keep going, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("error for sample", sample, repr(err))
            continue
        # Count only successful samples so checkpoints are not skipped
        i += 1

        # Extract the attentions (one tensor per layer)
        attention_weights = outputs.attentions
        for layer in range(num_layers):
            # Aggregate across heads
            aggregate_attn_weights = torch.mean(attention_weights[layer], dim=1)
            # Average over rows to get one score per column
            column_wise_mean = torch.mean(aggregate_attn_weights, dim=1)

            # Remove the batch dimension
            column_wise_mean = column_wise_mean.squeeze(0)
            # Positions sorted from highest to lowest score
            ordering = torch.argsort(column_wise_mean, descending=True).cpu().numpy()
            all_orderings[layer].append(ordering)

        # Checkpoint to disk every 50 processed samples
        if i % 50 == 0:
            with open(output_path, "wb") as f:
                pickle.dump(all_orderings, f)

    # Final write so the samples after the last checkpoint are not lost
    with open(output_path, "wb") as f:
        pickle.dump(all_orderings, f)


num layers 6
num layers 6
