## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from tqdm.notebook import tqdm


In [3]:
# Load the model and its tokenizer from the same Hub revision.
# `GPTNeoXForCausalLM` and `AutoTokenizer` come from the imports cell at the
# top of the notebook, so they are not imported a second time here.
model_str = "EleutherAI/pythia-14m"
step = 143000
# Build the revision tag once so model and tokenizer cannot drift apart.
revision = f"step{step}"

model = GPTNeoXForCausalLM.from_pretrained(
    model_str,
    revision=revision,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_str,
    revision=revision,
)

## 1. Examining the matrices

In [61]:
import os

# Root directory that holds the saved factor files.
path = "/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

# Gather the full path of every directory AND file below `path`
# (despite its name, `subfolders` contains files as well).
subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    # Per visited folder: sub-directories first, then files.
    subfolders.extend(os.path.join(dirpath, entry) for entry in dirnames + filenames)


In [6]:
# Split the discovered paths by file extension.
# NOTE: `filesnames_safetensors` (sic) keeps its spelling because later cells
# refer to it by exactly that name.
filenames_json = [f for f in subfolders if f.endswith(".json")]
filesnames_safetensors = [f for f in subfolders if f.endswith(".safetensors")]

In [7]:
# Show the discovered paths that end in ".json".
filenames_json

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/factor_arguments.json',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/covariance_dataset_metadata.json']

In [8]:
import json

# Parse the first JSON file in the list and display its contents.
# (In the saved run this is the factor-arguments file.)
json_file = filenames_json[0]
with open(json_file) as json_handle:
    data = json.loads(json_handle.read())
data

{'strategy': 'ekfac',
 'use_empirical_fisher': False,
 'amp_dtype': 'torch.bfloat16',
 'amp_scale': 65536.0,
 'has_shared_parameters': False,
 'covariance_max_examples': 100000,
 'covariance_data_partitions': 1,
 'covariance_module_partitions': 1,
 'activation_covariance_dtype': 'torch.bfloat16',
 'gradient_covariance_dtype': 'torch.bfloat16',
 'eigendecomposition_dtype': 'torch.float64',
 'lambda_max_examples': 100000,
 'lambda_data_partitions': 1,
 'lambda_module_partitions': 1,
 'use_iterative_lambda_aggregation': False,
 'offload_activations_to_cpu': False,
 'per_sample_gradient_dtype': 'torch.bfloat16',
 'lambda_dtype': 'torch.bfloat16'}

In [9]:
# Show the discovered paths that end in ".safetensors".
filesnames_safetensors

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/activation_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/gradient_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_activation_covariance_processed.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_gradient_covariance_processed.safetensors']

In [10]:
from safetensors import safe_open

# Open each safetensors file in turn and print the tensor names it stores.
# Note that this rebinds the notebook-level `path` (previously the factors
# directory) to each file path, and `tensors` ends up holding only the
# contents of the last file.
for path in filesnames_safetensors:
    # device=0: presumably the first CUDA device — confirm a GPU is available.
    with safe_open(path, framework="pt", device=0) as f:
        tensors = {k: f.get_tensor(k) for k in f.keys()}
    print(tensors.keys())

dict_keys(['lm_head', 'transformer.h.0.attn.c_attn', 'transformer.h.0.attn.c_proj', 'transformer.h.0.mlp.c_fc', 'transformer.h.0.mlp.c_proj', 'transformer.h.1.attn.c_attn', 'transformer.h.1.attn.c_proj', 'transformer.h.1.mlp.c_fc', 'transformer.h.1.mlp.c_proj', 'transformer.h.10.attn.c_attn', 'transformer.h.10.attn.c_proj', 'transformer.h.10.mlp.c_fc', 'transformer.h.10.mlp.c_proj', 'transformer.h.11.attn.c_attn', 'transformer.h.11.attn.c_proj', 'transformer.h.11.mlp.c_fc', 'transformer.h.11.mlp.c_proj', 'transformer.h.2.attn.c_attn', 'transformer.h.2.attn.c_proj', 'transformer.h.2.mlp.c_fc', 'transformer.h.2.mlp.c_proj', 'transformer.h.3.attn.c_attn', 'transformer.h.3.attn.c_proj', 'transformer.h.3.mlp.c_fc', 'transformer.h.3.mlp.c_proj', 'transformer.h.4.attn.c_attn', 'transformer.h.4.attn.c_proj', 'transformer.h.4.mlp.c_fc', 'transformer.h.4.mlp.c_proj', 'transformer.h.5.attn.c_attn', 'transformer.h.5.attn.c_proj', 'transformer.h.5.mlp.c_fc', 'transformer.h.5.mlp.c_proj', 'transform

In [11]:
# Load one specific factor file into its own dictionary.
# In the saved listing, index -2 is `num_activation_covariance_processed.safetensors`;
# the position depends on the order in which the files were discovered.
path = filesnames_safetensors[-2]
# Start from an empty dict so the result does not depend on (or silently merge
# into) whatever `tensors` a previously executed cell left behind.
tensors = {}
with safe_open(path, framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [29]:
# Load the saved checkpoint object from disk (presumably the GPT-2 model or
# its state dict — confirm what `model.pth` contains).
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source.
gpt_2 = torch.load("/home/louis/quelle/quelle/approx_unrolling/checkpoints/model.pth")

In [34]:
# Module names for 12 transformer blocks, using the `transformer.h.<i>...`
# naming scheme: all attention projections first, then all MLP projections.
total_modules = [
    name
    for i in range(12)
    for name in (f"transformer.h.{i}.attn.c_attn", f"transformer.h.{i}.attn.c_proj")
]
total_modules += [
    name
    for i in range(12)
    for name in (f"transformer.h.{i}.mlp.c_fc", f"transformer.h.{i}.mlp.c_proj")
]


In [2]:
# NOTE(review): the saved output of this cell is a NameError for
# `total_modules`; presumably it was run on a fresh kernel before any cell that
# defines the list — confirm the notebook runs top to bottom.
total_modules

NameError: name 'total_modules' is not defined

In [6]:
# Full state dict of the loaded model.
all_weights = model.state_dict()

# Keep only the entries whose key mentions "mlp".
all_mlps = {name: all_weights[name] for name in all_weights if "mlp" in name}


In [7]:
# Count the state-dict entries whose key contains "mlp".
len(all_mlps)

24

In [22]:
# Check what kind of object named_modules() returns (the saved output reports a generator).
type(model.named_modules())

generator

In [31]:
# Keep just the dotted name of every (name, module) pair the model exposes.
module_keys = [name for name, _module in model.named_modules()]

In [48]:
# Toggles for which module families to collect.
track_attention = True
track_mlp = True

# Select module names mentioning "attention" and/or "mlp" (case-insensitive),
# ignoring anything that is a dropout or layernorm module. A name matching
# both families is appended twice, once per family.
total_modules = []
for module_key in module_keys:
    lowered = module_key.lower()
    if any(skipped in lowered for skipped in ("dropout", "layernorm")):
        continue

    if track_attention and "attention" in lowered:
        total_modules.append(module_key)
    if track_mlp and "mlp" in lowered:
        total_modules.append(module_key)


In [55]:
# Show the collected module names (the saved output lists attention and MLP
# modules of the GPT-NeoX model, including container and activation modules).
total_modules

['gpt_neox.layers.0.attention',
 'gpt_neox.layers.0.attention.query_key_value',
 'gpt_neox.layers.0.attention.dense',
 'gpt_neox.layers.0.mlp',
 'gpt_neox.layers.0.mlp.dense_h_to_4h',
 'gpt_neox.layers.0.mlp.dense_4h_to_h',
 'gpt_neox.layers.0.mlp.act',
 'gpt_neox.layers.1.attention',
 'gpt_neox.layers.1.attention.query_key_value',
 'gpt_neox.layers.1.attention.dense',
 'gpt_neox.layers.1.mlp',
 'gpt_neox.layers.1.mlp.dense_h_to_4h',
 'gpt_neox.layers.1.mlp.dense_4h_to_h',
 'gpt_neox.layers.1.mlp.act',
 'gpt_neox.layers.2.attention',
 'gpt_neox.layers.2.attention.query_key_value',
 'gpt_neox.layers.2.attention.dense',
 'gpt_neox.layers.2.mlp',
 'gpt_neox.layers.2.mlp.dense_h_to_4h',
 'gpt_neox.layers.2.mlp.dense_4h_to_h',
 'gpt_neox.layers.2.mlp.act',
 'gpt_neox.layers.3.attention',
 'gpt_neox.layers.3.attention.query_key_value',
 'gpt_neox.layers.3.attention.dense',
 'gpt_neox.layers.3.mlp',
 'gpt_neox.layers.3.mlp.dense_h_to_4h',
 'gpt_neox.layers.3.mlp.dense_4h_to_h',
 'gpt_neox.lay

In [23]:
# Print the name and definition of every module whose name mentions
# "attention", skipping dropout and layernorm modules (case-sensitive match).
for name, module in model.named_modules():
    if any(skipped in name for skipped in ("dropout", "layernorm")):
        continue
    if "attention" not in name:
        continue
    print(type(name), type(module))
    print(name, module)
    print()

<class 'str'> <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention'>
gpt_neox.layers.0.attention GPTNeoXAttention(
  (query_key_value): Linear(in_features=128, out_features=384, bias=True)
  (dense): Linear(in_features=128, out_features=128, bias=True)
)

<class 'str'> <class 'torch.nn.modules.linear.Linear'>
gpt_neox.layers.0.attention.query_key_value Linear(in_features=128, out_features=384, bias=True)

<class 'str'> <class 'torch.nn.modules.linear.Linear'>
gpt_neox.layers.0.attention.dense Linear(in_features=128, out_features=128, bias=True)

<class 'str'> <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention'>
gpt_neox.layers.1.attention GPTNeoXAttention(
  (query_key_value): Linear(in_features=128, out_features=384, bias=True)
  (dense): Linear(in_features=128, out_features=128, bias=True)
)

<class 'str'> <class 'torch.nn.modules.linear.Linear'>
gpt_neox.layers.1.attention.query_key_value Linear(in_features=128, out_features=384, bias=True)

<

In [13]:
# List every key in the loaded model's state dict.
all_weights.keys()

odict_keys(['gpt_neox.embed_in.weight', 'gpt_neox.layers.0.input_layernorm.weight', 'gpt_neox.layers.0.input_layernorm.bias', 'gpt_neox.layers.0.post_attention_layernorm.weight', 'gpt_neox.layers.0.post_attention_layernorm.bias', 'gpt_neox.layers.0.attention.query_key_value.weight', 'gpt_neox.layers.0.attention.query_key_value.bias', 'gpt_neox.layers.0.attention.dense.weight', 'gpt_neox.layers.0.attention.dense.bias', 'gpt_neox.layers.0.mlp.dense_h_to_4h.weight', 'gpt_neox.layers.0.mlp.dense_h_to_4h.bias', 'gpt_neox.layers.0.mlp.dense_4h_to_h.weight', 'gpt_neox.layers.0.mlp.dense_4h_to_h.bias', 'gpt_neox.layers.1.input_layernorm.weight', 'gpt_neox.layers.1.input_layernorm.bias', 'gpt_neox.layers.1.post_attention_layernorm.weight', 'gpt_neox.layers.1.post_attention_layernorm.bias', 'gpt_neox.layers.1.attention.query_key_value.weight', 'gpt_neox.layers.1.attention.query_key_value.bias', 'gpt_neox.layers.1.attention.dense.weight', 'gpt_neox.layers.1.attention.dense.bias', 'gpt_neox.layers

In [26]:
from examples.wikitext.pipeline import construct_gpt2

# Build the GPT-2 model via the project helper and keep only its state dict.
# NOTE(review): the output saved with this cell lists state-dict keys even
# though the cell ends with an assignment — presumably the cell was edited
# after it last ran; re-run to confirm.
model_state_dict = construct_gpt2().state_dict()


odict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.

In [30]:
# Short alias for `model_state_dict`; both names refer to the same object.
d = model_state_dict


In [41]:
# List the keys of the aliased state dict `d`.
d.keys()

odict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.

In [34]:
# Collect the state-dict keys that belong to attention and MLP blocks.
# The keys of `model_state_dict` name the attention block "attn"
# (e.g. "transformer.h.0.attn.c_attn.weight"), so filtering on "attention"
# alone selects nothing; match both spellings so either naming scheme works.
# Note these are parameter keys (they end in ".weight" / ".bias"), not bare
# module names.
total_modules = []
total_modules += [
    k for k in model_state_dict.keys() if "attn" in k or "attention" in k
]
total_modules += [k for k in model_state_dict.keys() if "mlp" in k]

In [50]:
# Bare call: this displays only the generator object's repr, not the modules
# themselves (see the saved output).
model.named_modules()

<generator object Module.named_modules at 0x7149cc173450>

In [54]:
# Dump every (name, module) pair, each followed by a dashed separator line.
separator = "--" * 20
for named_module in model.named_modules():
    print(named_module)
    print(separator)

('', GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 128)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=128, out_features=384, bias=True)
          (dense): Linear(in_features=128, out_features=128, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=128, out_features=512, bias=True)
          (dense_4h_to_h): Linear(in_features=512, out_features=128, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((128,), eps=1e-05, elementwi

In [58]:
# Rebuild the list of module names for 12 transformer blocks: the attention
# projections of every block come first, followed by the MLP projections.
total_modules = []

for i in range(12):
    total_modules.extend(
        (f"transformer.h.{i}.attn.c_attn", f"transformer.h.{i}.attn.c_proj")
    )

for i in range(12):
    total_modules += [f"transformer.h.{i}.mlp.c_fc", f"transformer.h.{i}.mlp.c_proj"]

In [None]:
from kronfluence.module.tracked_module import TrackedModule
from kronfluence.utils.exceptions import IllegalTaskConfigurationError
from quelle.approx_unrolling.language_task import LanguageModelingTask

# Wrap every supported leaf module of `model` in a kronfluence `TrackedModule`,
# restricted to the module names the task asks to track. Raises
# `IllegalTaskConfigurationError` when a requested module is missing or when
# nothing could be wrapped.

task = LanguageModelingTask()
# Plain `None`, not the one-element tuple `(None,)` (a stray trailing comma).
# Assumes `TrackedModule` falls back to default arguments when given `None` —
# TODO confirm against the installed kronfluence version.
factor_args = None
score_args = None

tracked_module_names = (
    task.get_influence_tracked_modules() if task is not None else None
)
# Tracks, per requested module name, whether it was actually found and wrapped.
tracked_module_exists_dict = None
if tracked_module_names is not None:
    tracked_module_exists_dict = {name: False for name in tracked_module_names}
per_sample_gradient_process_fnc = None
if task is not None and task.enable_post_process_per_sample_gradient:
    per_sample_gradient_process_fnc = task.post_process_per_sample_gradient

# Snapshot the module list up front: modules are swapped out with `setattr`
# inside the loop, so do not iterate the live generator while mutating.
named_modules = list(model.named_modules())
for module_name, module in named_modules:
    # Only leaf modules (no children) are candidates for wrapping.
    if len(list(module.children())) > 0:
        continue

    # Filters modules based on the task's `get_influence_tracked_modules` if specified.
    if tracked_module_names is not None and module_name not in tracked_module_names:
        continue

    # Wraps the module if it is currently supported (e.g., nn.Linear & nn.Conv2d).
    if isinstance(module, tuple(TrackedModule.SUPPORTED_MODULES)):
        tracked_module = TrackedModule.SUPPORTED_MODULES[type(module)](
            name=module_name,
            original_module=module,
            per_sample_gradient_process_fnc=per_sample_gradient_process_fnc,
            factor_args=factor_args,
            score_args=score_args,
        )
        # Split "a.b.c" into parent path "a.b" and attribute "c"; an empty
        # parent path makes `get_submodule` return the model itself.
        parent_name, target_name = module_name.rpartition(".")[::2]
        parent = model.get_submodule(parent_name)
        setattr(parent, target_name, tracked_module)

        if tracked_module_exists_dict is not None:
            tracked_module_exists_dict[module_name] = True

if tracked_module_exists_dict is not None and not all(
    tracked_module_exists_dict.values()
):
    error_msg = f"Some provided tracked modules were not found. The current mapping: `{tracked_module_exists_dict}`."
    raise IllegalTaskConfigurationError(error_msg)

if not any(isinstance(module, TrackedModule) for module in model.modules()):
    supported_modules = ", ".join(
        module.__name__ for module in TrackedModule.SUPPORTED_MODULES
    )
    raise IllegalTaskConfigurationError(
        f"No supported modules found. Kronfluence supports: {supported_modules}. "
        "Consider rewriting your model or subclassing `TrackedModule` for custom layers.\n"
        f"Current Model:\n{model}"
    )


In [3]:
# Shape of the random test matrix: 1e8 rows x 100 columns of float32,
# i.e. about 4e10 bytes (~40 GB) allocated on the GPU.
n_rows = int(1e8)
n_cols = 100

with torch.no_grad():
    R = torch.randn((n_rows, n_cols), dtype=torch.float32, device="cuda")
    # NOTE(review): torch.svd_lowrank is documented to return (U, S, V) with
    # R ≈ U diag(S) V^H, so the third result is V rather than V^H despite being
    # bound to `Vh` — confirm before using it as a transposed factor.
    U, S, Vh = torch.svd_lowrank(R)

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

ValueError: Default process group has not been initialized, please make sure to call init_process_group.