## Imports

In [1]:
# Load IPython's autoreload extension and select mode 2, so edited project
# modules are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from tqdm.notebook import tqdm


In [3]:
# Checkpoint to inspect. `GPTNeoXForCausalLM` and `AutoTokenizer` are imported
# in the imports cell at the top of the notebook.
model_str = "EleutherAI/pythia-14m"
step = 5000

# Build the revision name once so the model and tokenizer are always pinned
# to the same training step.
revision = f"step{step}"

model = GPTNeoXForCausalLM.from_pretrained(
    model_str,
    revision=revision,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_str, revision=revision)

In [85]:
# Reload the tokenizer pinned to the step-5000 revision of `model_str`.
# The revision is a fixed string, so no f-string interpolation is needed.
tokenizer = AutoTokenizer.from_pretrained(model_str, revision="step5000")

In [46]:
# Display the padding token configured on the tokenizer.
tokenizer.pad_token

In [92]:
# Replace `model` with the step-2000 revision of the same Pythia checkpoint.
# The revision is a fixed string, so no f-string interpolation is needed.
model_name = "EleutherAI/pythia-14m"
model = GPTNeoXForCausalLM.from_pretrained(
    model_name,
    revision="step2000",
    device_map="auto",
)

## 1. Examining the matrices

In [42]:
import os

# Directory whose contents are enumerated below.
path = "/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

# Gather the full path of every directory and every file found under `path`.
# Despite its name, `subfolders` holds both kinds of entry; for each visited
# directory the sub-directories come first, then the files.
subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    subfolders.extend(
        os.path.join(dirpath, entry) for entry in (*dirnames, *filenames)
    )



In [48]:
# Display the factors directory that was walked above.
path

'/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half'

In [6]:
# Split the collected paths by file extension.
# NOTE: `filesnames_safetensors` is misspelled, but later cells reference that
# exact name, so it is kept as-is.
filenames_json = [entry for entry in subfolders if entry.endswith(".json")]
filesnames_safetensors = [
    entry for entry in subfolders if entry.endswith(".safetensors")
]

In [7]:
# Display the JSON files found in the factors directory.
filenames_json

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/factor_arguments.json',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/covariance_dataset_metadata.json']

In [8]:
import json

# Parse the first JSON file found and display its contents.
json_file = filenames_json[0]
# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
data

{'strategy': 'ekfac',
 'use_empirical_fisher': False,
 'amp_dtype': 'torch.bfloat16',
 'amp_scale': 65536.0,
 'has_shared_parameters': False,
 'covariance_max_examples': 100000,
 'covariance_data_partitions': 1,
 'covariance_module_partitions': 1,
 'activation_covariance_dtype': 'torch.bfloat16',
 'gradient_covariance_dtype': 'torch.bfloat16',
 'eigendecomposition_dtype': 'torch.float64',
 'lambda_max_examples': 100000,
 'lambda_data_partitions': 1,
 'lambda_module_partitions': 1,
 'use_iterative_lambda_aggregation': False,
 'offload_activations_to_cpu': False,
 'per_sample_gradient_dtype': 'torch.bfloat16',
 'lambda_dtype': 'torch.bfloat16'}

In [9]:
# Display the .safetensors files found in the factors directory.
filesnames_safetensors

['/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/activation_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/gradient_covariance.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_activation_covariance_processed.safetensors',
 '/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half/num_gradient_covariance_processed.safetensors']

In [10]:
from safetensors import safe_open

# Open each .safetensors file in turn, read every stored tensor, and print
# the key names the file contains. `tensors` starts empty for each file, so
# after the loop it holds the contents of the last file only.
for path in filesnames_safetensors:
    tensors = {}
    with safe_open(path, framework="pt", device=0) as f:
        tensors.update((key, f.get_tensor(key)) for key in f.keys())
    print(tensors.keys())

dict_keys(['lm_head', 'transformer.h.0.attn.c_attn', 'transformer.h.0.attn.c_proj', 'transformer.h.0.mlp.c_fc', 'transformer.h.0.mlp.c_proj', 'transformer.h.1.attn.c_attn', 'transformer.h.1.attn.c_proj', 'transformer.h.1.mlp.c_fc', 'transformer.h.1.mlp.c_proj', 'transformer.h.10.attn.c_attn', 'transformer.h.10.attn.c_proj', 'transformer.h.10.mlp.c_fc', 'transformer.h.10.mlp.c_proj', 'transformer.h.11.attn.c_attn', 'transformer.h.11.attn.c_proj', 'transformer.h.11.mlp.c_fc', 'transformer.h.11.mlp.c_proj', 'transformer.h.2.attn.c_attn', 'transformer.h.2.attn.c_proj', 'transformer.h.2.mlp.c_fc', 'transformer.h.2.mlp.c_proj', 'transformer.h.3.attn.c_attn', 'transformer.h.3.attn.c_proj', 'transformer.h.3.mlp.c_fc', 'transformer.h.3.mlp.c_proj', 'transformer.h.4.attn.c_attn', 'transformer.h.4.attn.c_proj', 'transformer.h.4.mlp.c_fc', 'transformer.h.4.mlp.c_proj', 'transformer.h.5.attn.c_attn', 'transformer.h.5.attn.c_proj', 'transformer.h.5.mlp.c_fc', 'transformer.h.5.mlp.c_proj', 'transform

In [11]:
# Read the second-to-last .safetensors file into the existing `tensors` dict.
# Entries already present under the same key are overwritten; other entries
# left over from an earlier cell are kept.
path = filesnames_safetensors[-2]
with safe_open(path, framework="pt", device=0) as f:
    tensors.update((key, f.get_tensor(key)) for key in f.keys())

In [29]:
# Load a locally saved checkpoint from disk.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only use it on trusted checkpoints. If this file holds just a state dict,
# consider passing weights_only=True — TODO confirm what model.pth contains.
gpt_2 = torch.load("/home/louis/quelle/quelle/approx_unrolling/checkpoints/model.pth")

In [6]:
# Snapshot the model's state dict, then keep only the entries whose key
# contains the substring "mlp".
all_weights = model.state_dict()

all_mlps = {}
for weight_name, weight in all_weights.items():
    if "mlp" in weight_name:
        all_mlps[weight_name] = weight


In [7]:
# Number of state-dict entries whose key contains "mlp".
len(all_mlps)

24

In [22]:
# Check what kind of object `named_modules()` returns.
type(model.named_modules())

generator

In [31]:
# Names of every module in the model; `named_modules()` yields
# (name, module) pairs and only the name is kept.
module_keys = [module_name for module_name, _ in model.named_modules()]

In [48]:
# Toggles for which module families to keep.
track_attention = True
track_mlp = True

# Select the module names to track: skip anything whose name mentions dropout
# or layernorm, then keep names mentioning "attention" and/or "mlp" according
# to the toggles above. Matching is case-insensitive.
total_modules = []
for m in module_keys:
    lowered = m.lower()  # lower-case once instead of once per check
    if "dropout" in lowered or "layernorm" in lowered:
        continue

    keep_attention = track_attention and "attention" in lowered
    keep_mlp = track_mlp and "mlp" in lowered
    # A single combined test ensures a name matching both patterns is added
    # only once rather than twice.
    if keep_attention or keep_mlp:
        total_modules.append(m)



ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [4]:
from examples.wikitext.pipeline import get_wikitext_dataset

# Load the "eval_train" split through the project's wikitext pipeline helper.
train_dataset = get_wikitext_dataset(split="eval_train")

In [7]:
# Take the first example (index 0) of train_dataset — a fixed item, not a random draw.
sample = train_dataset[0]
