## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np


In [3]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from safetensors.torch import load_file

from quelle.approx_unrolling.utils import TensorDict

import os

In [14]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

model_str = "EleutherAI/pythia-14m"
step = 5000
model = GPTNeoXForCausalLM.from_pretrained(
    model_str,
    revision=f"step{step}",
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_str,
    revision=f"step{step}",
)

## Debugging gradient covariance

In [6]:
path_gradient = "/root/quelle/quelle/approx_unrolling/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half/gradient_covariance.safetensors"
path_activation = "/root/quelle/quelle/approx_unrolling/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half/activation_covariance.safetensors"
path_lambda_matrix = "/root/quelle/quelle/approx_unrolling/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half/lambda_matrix.safetensors"

In [8]:
for path in [path_gradient, path_activation, path_lambda_matrix]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
gradient_covariance = TensorDict(load_file(path_gradient))
activation_covariance = TensorDict(load_file(path_activation))
lambda_matrix = TensorDict(load_file(path_lambda_matrix))


In [18]:
lambda_matrix.keys()

dict_keys(['gpt_neox.layers.0.attention.dense', 'gpt_neox.layers.0.attention.query_key_value', 'gpt_neox.layers.0.mlp.dense_4h_to_h', 'gpt_neox.layers.0.mlp.dense_h_to_4h', 'gpt_neox.layers.1.attention.dense', 'gpt_neox.layers.1.attention.query_key_value', 'gpt_neox.layers.1.mlp.dense_4h_to_h', 'gpt_neox.layers.1.mlp.dense_h_to_4h', 'gpt_neox.layers.2.attention.dense', 'gpt_neox.layers.2.attention.query_key_value', 'gpt_neox.layers.2.mlp.dense_4h_to_h', 'gpt_neox.layers.2.mlp.dense_h_to_4h', 'gpt_neox.layers.3.attention.dense', 'gpt_neox.layers.3.attention.query_key_value', 'gpt_neox.layers.3.mlp.dense_4h_to_h', 'gpt_neox.layers.3.mlp.dense_h_to_4h', 'gpt_neox.layers.4.attention.dense', 'gpt_neox.layers.4.attention.query_key_value', 'gpt_neox.layers.4.mlp.dense_4h_to_h', 'gpt_neox.layers.4.mlp.dense_h_to_4h', 'gpt_neox.layers.5.attention.dense', 'gpt_neox.layers.5.attention.query_key_value', 'gpt_neox.layers.5.mlp.dense_4h_to_h', 'gpt_neox.layers.5.mlp.dense_h_to_4h'])

In [26]:
for name in model.named_modules():
    if name[0] in lambda_matrix.keys():
        print(
            "model",
            name[1].weight.shape,
            "gradient",
            gradient_covariance[name[0]].shape,
            "activation",
            activation_covariance[name[0]].shape,
            "lambda",
            lambda_matrix[name[0]].shape,
        )

        print("-" * 50)


model torch.Size([384, 128]) gradient torch.Size([384, 384]) activation torch.Size([129, 129]) lambda torch.Size([384, 129])
--------------------------------------------------
model torch.Size([128, 128]) gradient torch.Size([128, 128]) activation torch.Size([129, 129]) lambda torch.Size([128, 129])
--------------------------------------------------
model torch.Size([512, 128]) gradient torch.Size([512, 512]) activation torch.Size([129, 129]) lambda torch.Size([512, 129])
--------------------------------------------------
model torch.Size([128, 512]) gradient torch.Size([128, 128]) activation torch.Size([513, 513]) lambda torch.Size([128, 513])
--------------------------------------------------
model torch.Size([384, 128]) gradient torch.Size([384, 384]) activation torch.Size([129, 129]) lambda torch.Size([384, 129])
--------------------------------------------------
model torch.Size([128, 128]) gradient torch.Size([128, 128]) activation torch.Size([129, 129]) lambda torch.Size([128, 1

In [11]:
gradient_covariance.size()


TensorDict({'gpt_neox.layers.0.attention.dense': torch.Size([128, 128]), 'gpt_neox.layers.0.attention.query_key_value': torch.Size([384, 384]), 'gpt_neox.layers.0.mlp.dense_4h_to_h': torch.Size([128, 128]), 'gpt_neox.layers.0.mlp.dense_h_to_4h': torch.Size([512, 512]), 'gpt_neox.layers.1.attention.dense': torch.Size([128, 128]), 'gpt_neox.layers.1.attention.query_key_value': torch.Size([384, 384]), 'gpt_neox.layers.1.mlp.dense_4h_to_h': torch.Size([128, 128]), 'gpt_neox.layers.1.mlp.dense_h_to_4h': torch.Size([512, 512]), 'gpt_neox.layers.2.attention.dense': torch.Size([128, 128]), 'gpt_neox.layers.2.attention.query_key_value': torch.Size([384, 384]), 'gpt_neox.layers.2.mlp.dense_4h_to_h': torch.Size([128, 128]), 'gpt_neox.layers.2.mlp.dense_h_to_4h': torch.Size([512, 512]), 'gpt_neox.layers.3.attention.dense': torch.Size([128, 128]), 'gpt_neox.layers.3.attention.query_key_value': torch.Size([384, 384]), 'gpt_neox.layers.3.mlp.dense_4h_to_h': torch.Size([128, 128]), 'gpt_neox.layers.3.

In [12]:
activation_covariance.size()


TensorDict({'gpt_neox.layers.0.attention.dense': torch.Size([129, 129]), 'gpt_neox.layers.0.attention.query_key_value': torch.Size([129, 129]), 'gpt_neox.layers.0.mlp.dense_4h_to_h': torch.Size([513, 513]), 'gpt_neox.layers.0.mlp.dense_h_to_4h': torch.Size([129, 129]), 'gpt_neox.layers.1.attention.dense': torch.Size([129, 129]), 'gpt_neox.layers.1.attention.query_key_value': torch.Size([129, 129]), 'gpt_neox.layers.1.mlp.dense_4h_to_h': torch.Size([513, 513]), 'gpt_neox.layers.1.mlp.dense_h_to_4h': torch.Size([129, 129]), 'gpt_neox.layers.2.attention.dense': torch.Size([129, 129]), 'gpt_neox.layers.2.attention.query_key_value': torch.Size([129, 129]), 'gpt_neox.layers.2.mlp.dense_4h_to_h': torch.Size([513, 513]), 'gpt_neox.layers.2.mlp.dense_h_to_4h': torch.Size([129, 129]), 'gpt_neox.layers.3.attention.dense': torch.Size([129, 129]), 'gpt_neox.layers.3.attention.query_key_value': torch.Size([129, 129]), 'gpt_neox.layers.3.mlp.dense_4h_to_h': torch.Size([513, 513]), 'gpt_neox.layers.3.

In [13]:
lambda_matrix.size()


TensorDict({'gpt_neox.layers.0.attention.dense': torch.Size([128, 129]), 'gpt_neox.layers.0.attention.query_key_value': torch.Size([384, 129]), 'gpt_neox.layers.0.mlp.dense_4h_to_h': torch.Size([128, 513]), 'gpt_neox.layers.0.mlp.dense_h_to_4h': torch.Size([512, 129]), 'gpt_neox.layers.1.attention.dense': torch.Size([128, 129]), 'gpt_neox.layers.1.attention.query_key_value': torch.Size([384, 129]), 'gpt_neox.layers.1.mlp.dense_4h_to_h': torch.Size([128, 513]), 'gpt_neox.layers.1.mlp.dense_h_to_4h': torch.Size([512, 129]), 'gpt_neox.layers.2.attention.dense': torch.Size([128, 129]), 'gpt_neox.layers.2.attention.query_key_value': torch.Size([384, 129]), 'gpt_neox.layers.2.mlp.dense_4h_to_h': torch.Size([128, 513]), 'gpt_neox.layers.2.mlp.dense_h_to_4h': torch.Size([512, 129]), 'gpt_neox.layers.3.attention.dense': torch.Size([128, 129]), 'gpt_neox.layers.3.attention.query_key_value': torch.Size([384, 129]), 'gpt_neox.layers.3.mlp.dense_4h_to_h': torch.Size([128, 513]), 'gpt_neox.layers.3.

In [105]:
for i in range(104):
    loss_1 = torch.load(os.path.join(path_1, f"loss_{i}.pt"))
    loss_2 = torch.load(os.path.join(path_2, f"loss_{i}.pt"))
    diff = loss_1 - loss_2
    print(f"Loss {i} diff: {diff.abs().mean()}")

Loss 0 diff: 0.0
Loss 1 diff: 0.0
Loss 2 diff: 0.0
Loss 3 diff: 0.0
Loss 4 diff: 0.0
Loss 5 diff: 0.0
Loss 6 diff: 0.0
Loss 7 diff: 0.0
Loss 8 diff: 0.0
Loss 9 diff: 0.0
Loss 10 diff: 0.0
Loss 11 diff: 0.0
Loss 12 diff: 0.0
Loss 13 diff: 0.0
Loss 14 diff: 0.0
Loss 15 diff: 0.0
Loss 16 diff: 0.0
Loss 17 diff: 0.0
Loss 18 diff: 0.0
Loss 19 diff: 0.0
Loss 20 diff: 0.0
Loss 21 diff: 0.0
Loss 22 diff: 0.0
Loss 23 diff: 0.0
Loss 24 diff: 0.0
Loss 25 diff: 0.0
Loss 26 diff: 0.0
Loss 27 diff: 0.0
Loss 28 diff: 0.0
Loss 29 diff: 0.0
Loss 30 diff: 0.0
Loss 31 diff: 0.0
Loss 32 diff: 0.0
Loss 33 diff: 0.0
Loss 34 diff: 0.0
Loss 35 diff: 0.0
Loss 36 diff: 0.0
Loss 37 diff: 0.0
Loss 38 diff: 0.0
Loss 39 diff: 0.0
Loss 40 diff: 0.0
Loss 41 diff: 0.0
Loss 42 diff: 0.0
Loss 43 diff: 0.0
Loss 44 diff: 0.0
Loss 45 diff: 0.0
Loss 46 diff: 0.0
Loss 47 diff: 0.0
Loss 48 diff: 0.0
Loss 49 diff: 0.0
Loss 50 diff: 0.0
Loss 51 diff: 0.0
Loss 52 diff: 0.0
Loss 53 diff: 0.0
Loss 54 diff: 0.0
Loss 55 diff: 0.0
Lo

## 1. Exammining the matrices

In [128]:
def test_comparison(path_1, path_2):
    files_1 = os.listdir(path_1)
    files_2 = os.listdir(path_2)

    for file_1 in files_1:
        if file_1 in files_2:
            if file_1.endswith(".safetensors"):
                tensor_1 = TensorDict(
                    load_file(
                        os.path.join(path_1, file_1),
                        device="cuda",
                    )
                )
                tensor_2 = TensorDict(
                    load_file(
                        os.path.join(path_2, file_1),
                        device="cuda",
                    )
                )
                diff = tensor_1 - tensor_2
                all_close = tensor_1.allclose(tensor_2, rtol=1e-5, atol=1e-5)
                all_close_values = all(all_close.values())
                if not all_close_values:
                    print(file_1)
                    print("Differences found:")
                    print(diff.max())
                # check if all_close has any key that is False


In [129]:
path_1 = "/root/quelle/tests/caches/cache_1/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"
path_2 = "/root/quelle/tests/caches/cache_2/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"

In [130]:
test_comparison(path_2, path_1)

gradient_covariance.safetensors
Differences found:
TensorDict({'gpt_neox.layers.0.attention.dense': tensor(8192., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.attention.query_key_value': tensor(128., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.mlp.dense_4h_to_h': tensor(8192., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.mlp.dense_h_to_4h': tensor(1024., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.attention.dense': tensor(4096., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.attention.query_key_value': tensor(128., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.mlp.dense_4h_to_h': tensor(4096., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.mlp.dense_h_to_4h': tensor(512., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.2.attention.dense': tensor(2048., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.2.attention.query_key_value': tensor(256., device='cuda:0', dtype=torch.bflo

In [127]:
test_comparison(path_2, path_1)

gradient_covariance.safetensors
Differences found:
TensorDict({'gpt_neox.layers.0.attention.dense': tensor(32768., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.attention.query_key_value': tensor(128., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.mlp.dense_4h_to_h': tensor(32768., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.0.mlp.dense_h_to_4h': tensor(1024., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.attention.dense': tensor(4096., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.attention.query_key_value': tensor(128., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.mlp.dense_4h_to_h': tensor(4096., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.1.mlp.dense_h_to_4h': tensor(512., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.2.attention.dense': tensor(2048., device='cuda:0', dtype=torch.bfloat16), 'gpt_neox.layers.2.attention.query_key_value': tensor(1024., device='cuda:0', dtype=torch.b

In [84]:
tensor_1 = TensorDict(
    load_file(os.path.join(path_1, "gradient_covariance.safetensors"), device="cuda")
)
tensor_2 = TensorDict(
    load_file(os.path.join(path_2, "gradient_covariance.safetensors"), device="cuda")
)

In [89]:
(tensor_1 - tensor_2).argmax()

TensorDict({'gpt_neox.layers.0.attention.dense': tensor(774, device='cuda:0'), 'gpt_neox.layers.0.attention.query_key_value': tensor(38502, device='cuda:0'), 'gpt_neox.layers.0.mlp.dense_4h_to_h': tensor(774, device='cuda:0'), 'gpt_neox.layers.0.mlp.dense_h_to_4h': tensor(254961, device='cuda:0'), 'gpt_neox.layers.1.attention.dense': tensor(7358, device='cuda:0'), 'gpt_neox.layers.1.attention.query_key_value': tensor(13118, device='cuda:0'), 'gpt_neox.layers.1.mlp.dense_4h_to_h': tensor(7358, device='cuda:0'), 'gpt_neox.layers.1.mlp.dense_h_to_4h': tensor(215460, device='cuda:0'), 'gpt_neox.layers.2.attention.dense': tensor(1382, device='cuda:0'), 'gpt_neox.layers.2.attention.query_key_value': tensor(50052, device='cuda:0'), 'gpt_neox.layers.2.mlp.dense_4h_to_h': tensor(1382, device='cuda:0'), 'gpt_neox.layers.2.mlp.dense_h_to_4h': tensor(10443, device='cuda:0'), 'gpt_neox.layers.3.attention.dense': tensor(2395, device='cuda:0'), 'gpt_neox.layers.3.attention.query_key_value': tensor(68

In [93]:
for k, v in (tensor_1 - tensor_2).items():
    print(k, v.max().item(), v.min().item(), v.mean().item(), v.std().item())

gpt_neox.layers.0.attention.dense 65536.0 -32768.0 4.15625 1640.0
gpt_neox.layers.0.attention.query_key_value 64.0 -64.0 0.0032806396484375 1.15625
gpt_neox.layers.0.mlp.dense_4h_to_h 65536.0 -32768.0 4.15625 1640.0
gpt_neox.layers.0.mlp.dense_h_to_4h 4096.0 -2048.0 0.134765625 59.0
gpt_neox.layers.1.attention.dense 16384.0 -8192.0 -1.3984375 584.0
gpt_neox.layers.1.attention.query_key_value 128.0 -128.0 0.0040283203125 1.375
gpt_neox.layers.1.mlp.dense_4h_to_h 16384.0 -8192.0 -1.3984375 584.0
gpt_neox.layers.1.mlp.dense_h_to_4h 2048.0 -2048.0 -0.0198974609375 24.0
gpt_neox.layers.2.attention.dense 2048.0 -4096.0 0.5859375 147.0
gpt_neox.layers.2.attention.query_key_value 2048.0 -512.0 0.02783203125 8.9375
gpt_neox.layers.2.mlp.dense_4h_to_h 2048.0 -4096.0 0.5859375 147.0
gpt_neox.layers.2.mlp.dense_h_to_4h 256.0 -512.0 -0.0172119140625 7.75
gpt_neox.layers.3.attention.dense 512.0 -1024.0 -0.001220703125 17.375
gpt_neox.layers.3.attention.query_key_value 128.0 -32.0 0.00026321411132812

In [8]:
d = TensorDict(
    load_file(
        "/root/quelle/quelle/approx_unrolling/.models/EleutherAI/pythia-14m/segment_0/influence_results/factors_ekfac_half/average_gradient_covariance.safetensors",
        device="cuda",
    )
)


d_2 = TensorDict(
    load_file(
        "/root/quelle/.models/EleutherAI/influence_results/factors_ekfac_half/gradient_covariance.safetensors",
        device="cuda",
    )
)

diff = d - d_2

for k, v in diff.items():
    if v.max() < 1e-5:
        continue
    print(k)
    print(v.max())
    print("----" * 10)

gpt_neox.layers.0.attention.dense
tensor(196608., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.0.attention.query_key_value
tensor(768., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.0.mlp.dense_4h_to_h
tensor(196608., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.0.mlp.dense_h_to_4h
tensor(12288., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.1.attention.dense
tensor(163840., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.1.attention.query_key_value
tensor(512., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.1.mlp.dense_4h_to_h
tensor(163840., device='cuda:0', dtype=torch.bfloat16)
----------------------------------------
gpt_neox.layers.1.mlp.dense_h_to_4h
tensor(8192., device='cuda:0', dtyp

In [12]:
number = 15335424
# determine prime decomposition of number
number / 7488


2048.0

In [5]:
from quelle.approx_unrolling.utils import TensorDict


d = TensorDict(d)

In [7]:
test_list = [d, d]

In [8]:
sum(test_list)

TypeError: unsupported operand type(s) for +: 'int' and 'TensorDict'

In [42]:
path = "/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

# list of all subfolders or files
import os

subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    for dirname in dirnames:
        subfolders.append(os.path.join(dirpath, dirname))
    for filename in filenames:
        subfolders.append(os.path.join(dirpath, filename))


'/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half'

In [6]:
filenames
filenames_json = [f for f in subfolders if f.endswith(".json")]
filesnames_safetensors = [f for f in subfolders if f.endswith(".safetensors")]

In [7]:
filenames_json

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/factor_arguments.json',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/covariance_dataset_metadata.json']

In [8]:
import json

json_file = filenames_json[0]
with open(json_file, "r") as f:
    data = json.load(f)
data

{'strategy': 'ekfac',
 'use_empirical_fisher': False,
 'amp_dtype': 'torch.bfloat16',
 'amp_scale': 65536.0,
 'has_shared_parameters': False,
 'covariance_max_examples': 100000,
 'covariance_data_partitions': 1,
 'covariance_module_partitions': 1,
 'activation_covariance_dtype': 'torch.bfloat16',
 'gradient_covariance_dtype': 'torch.bfloat16',
 'eigendecomposition_dtype': 'torch.float64',
 'lambda_max_examples': 100000,
 'lambda_data_partitions': 1,
 'lambda_module_partitions': 1,
 'use_iterative_lambda_aggregation': False,
 'offload_activations_to_cpu': False,
 'per_sample_gradient_dtype': 'torch.bfloat16',
 'lambda_dtype': 'torch.bfloat16'}

In [9]:
filesnames_safetensors

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/activation_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/gradient_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_activation_covariance_processed.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_gradient_covariance_processed.safetensors']

In [10]:
from safetensors import safe_open

for i in range(len(filesnames_safetensors)):
    path = filesnames_safetensors[i]
    tensors = {}
    with safe_open(path, framework="pt", device=0) as f:
        for k in f.keys():
            tensors[k] = f.get_tensor(k)
    print(tensors.keys())

dict_keys(['lm_head', 'transformer.h.0.attn.c_attn', 'transformer.h.0.attn.c_proj', 'transformer.h.0.mlp.c_fc', 'transformer.h.0.mlp.c_proj', 'transformer.h.1.attn.c_attn', 'transformer.h.1.attn.c_proj', 'transformer.h.1.mlp.c_fc', 'transformer.h.1.mlp.c_proj', 'transformer.h.10.attn.c_attn', 'transformer.h.10.attn.c_proj', 'transformer.h.10.mlp.c_fc', 'transformer.h.10.mlp.c_proj', 'transformer.h.11.attn.c_attn', 'transformer.h.11.attn.c_proj', 'transformer.h.11.mlp.c_fc', 'transformer.h.11.mlp.c_proj', 'transformer.h.2.attn.c_attn', 'transformer.h.2.attn.c_proj', 'transformer.h.2.mlp.c_fc', 'transformer.h.2.mlp.c_proj', 'transformer.h.3.attn.c_attn', 'transformer.h.3.attn.c_proj', 'transformer.h.3.mlp.c_fc', 'transformer.h.3.mlp.c_proj', 'transformer.h.4.attn.c_attn', 'transformer.h.4.attn.c_proj', 'transformer.h.4.mlp.c_fc', 'transformer.h.4.mlp.c_proj', 'transformer.h.5.attn.c_attn', 'transformer.h.5.attn.c_proj', 'transformer.h.5.mlp.c_fc', 'transformer.h.5.mlp.c_proj', 'transform

In [11]:
path = filesnames_safetensors[-2]
with safe_open(path, framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [29]:
gpt_2 = torch.load("/home/louis/quelle/quelle/approx_unrolling/checkpoints/model.pth")

In [6]:
all_weights = model.state_dict()

all_mlps = {k: v for k, v in all_weights.items() if "mlp" in k}


In [7]:
len(all_mlps)

24

In [22]:
type(model.named_modules())

generator

In [31]:
module_keys = [m[0] for m in model.named_modules()]

In [48]:
track_attention = True
track_mlp = True
total_modules = []
for m in module_keys:
    if "dropout" in m.lower() or "layernorm" in m.lower():
        continue

    if "attention" in m.lower() and track_attention:
        total_modules.append(m)
    if "mlp" in m.lower() and track_mlp:
        total_modules.append(m)


ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [4]:
from examples.wikitext.pipeline import get_wikitext_dataset

train_dataset = get_wikitext_dataset(
    split="eval_train",
)

In [7]:
# sample from train_dataset
sample = train_dataset[0]
