In [1]:
try:
    import google.colab

    IN_COLAB = True
    print("Running as a Colab notebook")

    import subprocess # to install graphviz dependencies
    command = ['apt-get', 'install', 'graphviz-dev']
    subprocess.run(command, check=True)

    import os # make images folder
    os.mkdir("ims/")

    from IPython import get_ipython
    ipython = get_ipython()

    ipython.run_line_magic( # install ACDC
        "pip",
        "install git+https://github.com/ArthurConmy/Automatic-Circuit-Discovery.git@2cc2d6d71416bddd3a88f287ffccfc0863ac8ddc",
    )

except Exception as e:
    IN_COLAB = False
    print("Running as a outside of colab")

    import numpy # crucial to not get cursed error
    import plotly

    plotly.io.renderers.default = "colab"  # added by Arthur so running as a .py notebook with #%% generates .ipynb notebooks that display in colab
    # disable this option when developing rather than generating notebook outputs

    import os # make images folder
    if not os.path.exists("ims/"):
        os.mkdir("ims/")

    from IPython import get_ipython

    ipython = get_ipython()
    if ipython is not None:
        print("Running as a notebook")
        ipython.run_line_magic("load_ext", "autoreload")  # type: ignore
        ipython.run_line_magic("autoreload", "2")  # type: ignore
    else:
        print("Running as a script")

Running as a outside of colab
Running as a notebook


In [35]:
import wandb
import IPython
from IPython.display import Image, display
import torch
import gc
from tqdm import tqdm
import networkx as nx
import os
import torch
from torch import Tensor
import huggingface_hub
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from tqdm import tqdm
import yaml
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from jaxtyping import Float, Int, Bool

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import transformer_lens
from transformer_lens.hook_points import HookedRootModule, HookPoint
from transformer_lens.HookedTransformer import (
    HookedTransformer,
)



In [16]:

num_examples = 100
# things = get_all_ioi_things(
#     num_examples=num_examples, device=DEVICE, metric_name=args.metric
# )

import acdc
tl_model = acdc.ioi.utils.get_gpt2_small(device=DEVICE)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cuda


### Testing how to do dropout...

In [17]:
import transformers
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model_gpt2_small = GPT2LMHeadModel.from_pretrained('gpt2')

In [18]:
model_gpt2_small.config.attn_pdrop

0.1

In [19]:
# definitely dropout here in normal gpt2 small
model_gpt2_small.transformer.h[0]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [20]:
type(tl_model.blocks[0])

transformer_lens.components.TransformerBlock

In [21]:
transformer_lens.components.TransformerBlock

NameError: name 'transformer_lens' is not defined

# Implementing dropout using hooks

Plan from Callum:
- Use permanent hooks
- Write a hook for each kind of dropout: attention, resid, MLP
- Dropout hooks have a higher level
- Add a command line argument for dropout
- Change self.update_cur_metric

TODO now:
- Figure out the attention, resid, and MLP dropout hooks

In [22]:
tl_model = HookedTransformer.from_pretrained('gpt2')

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2 into HookedTransformer


In [23]:
def dropout_hook(input: Float[Tensor, "batch ..."], hook: HookPoint,
                   p: float = 0.1) -> Float[Tensor, "batch ..."]:
    """
    Same function used in torch implementation of t.nn.Dropout
    """
    return torch.nn.functional.dropout(input, p)

def dropout_name_fn(name: str) -> bool:
    ret = "pattern" in name or "mlp_out" in name or "attn_out" in name or name == "blocks.0.hook_resid_pre"
    # print(ret, name)
    return ret


In [24]:
# def tl_logit_diff(seed=1234):
torch.manual_seed(1234)
input_tl = "hello my name is"
tl_model.reset_hooks()
logits_tl_eval = tl_model(input_tl)
tl_model.add_hook(dropout_name_fn, dropout_hook)
logits_tl_train = tl_model(input_tl)
# logits_tl_train.shape
logit_diff_tl = logits_tl_train - logits_tl_eval
(logit_diff_tl**2).mean()

tensor(2.0821, device='cuda:0')

In [25]:
# def hf_logit_diff(seed=1234):
torch.manual_seed(1234)
tokenizer.add_bos_token = True
input = tokenizer.encode("hello my name is", return_tensors="pt", add_special_tokens=True)
model_gpt2_small.train()
logits_train = model_gpt2_small(input).logits.cuda()
logits_train = logits_train - logits_train.mean(dim=2).unsqueeze(-1)

model_gpt2_small.eval()
logits_eval = model_gpt2_small(input).logits.cuda()
logits_eval = logits_eval - logits_eval.mean(dim=2).unsqueeze(-1)
logit_diff = logits_train - logits_eval
# print(logit_diff.shape)
(logit_diff**2).mean()

tensor(1.5261, device='cuda:0')

In [26]:
# Confirm that logits are very similar without dropout
# normalize mean of HF model
tl_vs_hf = logits_tl_eval - logits_eval

(tl_vs_hf**2).mean()

tensor(7.9908e-11, device='cuda:0')

In [27]:
# Are logits same with dropout? It could just be random seed...
# normalize mean of HF model
tl_vs_hf_train = logits_tl_train - logits_train

(tl_vs_hf_train**2).mean()

tensor(1.7458, device='cuda:0')

In [None]:
# Apply a hook on model_gpt2_small to save layer 0 resid_out to a global variable

In [28]:
generated_text_samples = model_gpt2_small.generate(input)

for i, beam in enumerate(generated_text_samples):
    print(f"{i}: {tokenizer.decode(beam, skip_special_tokens=True)}")
    print()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Using `max_length`'s default (20) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.



0: hello my name is my name is my name is my name is my name is my name is



In [29]:
tl_model

HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0): TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_mlp_out): HookPoint()
      (hook_resid_pre): HookPoint()
      (hook_re

In [30]:
model_gpt2_small.transformer

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwis

In [31]:
type(model_gpt2_small.transformer.h[0])

transformers.models.gpt2.modeling_gpt2.GPT2Block

In [32]:
transformers.models.gpt2.modeling_gpt2.GPT2Block

transformers.models.gpt2.modeling_gpt2.GPT2Block

In [33]:
model_gpt2_small.transformer.h[0].train()

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [34]:
tl_model.eval()

HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0): TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_mlp_out): HookPoint()
      (hook_resid_pre): HookPoint()
      (hook_re