In [1]:
import torch
import os
import subprocess

from huggingface_hub import login, snapshot_download

In [2]:
print("CUDA Available:", torch.cuda.is_available())
print("Using Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA Available: True
Using Device: NVIDIA A100 80GB PCIe


In [3]:
torch.backends.cuda.matmul.allow_tf32 = True

In [4]:
login(token="")

# !python -m huggingface-cli login

## Evaluating base models (before pruning)
- meta-llama/Llama-3.2-1B-Instruct
- facebook/layerskip-llama3.2-1B

In [8]:
!yes | python -m lm_eval \
    --model hf \
    --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \
    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
    --device cuda:0 \
    --output_path ./LLM-Pruner/my_evaluations/llama3.2_1b_base_eval

  warn(
2025-03-18:23:09:55,598 INFO     [__main__:379] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'openbookqa', 'piqa', 'winogrande']
2025-03-18:23:09:55,602 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-03-18:23:09:55,602 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': 'meta-llama/Llama-3.2-1B-Instruct'}
2025-03-18:23:09:55,670 INFO     [lm_eval.models.huggingface:136] Using device 'cuda:0'
config.json: 100%|█████████████████████████████| 877/877 [00:00<00:00, 9.15MB/s]
tokenizer_config.json: 100%|███████████████| 54.5k/54.5k [00:00<00:00, 52.1MB/s]
tokenizer.json: 100%|██████████████████████| 9.09M/9.09M [00:00<00:00, 37.6MB/s]
special_tokens_map.json: 100%|█████████████████| 296/296 [00:00<00:00, 2.64MB/s]
2025-03-18:23:09:57,338 INFO     [lm_eval.models.huggingface:376] Model parallel was set to Fals

In [5]:
!yes | python -m lm_eval \
    --model hf \
    --model_args pretrained=facebook/layerskip-llama3.2-1B \
    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
    --device cuda:0 \
    --output_path ./LLM-Pruner/my_evaluations/llama3.2_1b_layerskip_eval.json

  warn(
2025-03-20:08:41:40,369 INFO     [__main__:379] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'openbookqa', 'piqa', 'winogrande']
2025-03-20:08:41:40,389 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-03-20:08:41:40,389 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': 'facebook/layerskip-llama3.2-1B'}
2025-03-20:08:41:40,451 INFO     [lm_eval.models.huggingface:136] Using device 'cuda:0'
config.json: 100%|█████████████████████████████| 843/843 [00:00<00:00, 14.3MB/s]
tokenizer_config.json: 100%|███████████████| 50.5k/50.5k [00:00<00:00, 62.5MB/s]
tokenizer.json: 100%|██████████████████████| 17.2M/17.2M [00:00<00:00, 79.1MB/s]
special_tokens_map.json: 100%|█████████████████| 296/296 [00:00<00:00, 6.21MB/s]
2025-03-20:08:41:42,974 INFO     [lm_eval.models.huggingface:376] Model parallel was set to False,

## Evaluating pruned + tuned models
- ./LLM-Pruner/prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin
- ./LLM-Pruner/prune_log/layerskip_1b_prune_0.25/pytorch_model.bin

In [12]:
!pip install -r LLM-Pruner/requirement.txt

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentencepiece (from -r LLM-Pruner/requirement.txt (line 4))
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/a6/27/33019685023221ca8ed98e8ceb7ae5e166032686fa3662c68f1f1edf334e/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting wandb (from -r LLM-Pruner/requirement.txt (line 6))
  Obtaining dependency information for wandb from https://files.pythonhosted.org/packages/e0/71/7b7050ecab7288782ae0c7560f1ca06f4cf854a5ae08abeaf643785af1a0/wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading wandb-0.19.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting ptflo

In [5]:
snapshot_download(repo_id="Neooooo/cs7643_models", local_dir="./LLM-Pruner")

Fetching 133 files:   0%|          | 0/133 [00:00<?, ?it/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

train.sh:   0%|          | 0.00/397 [00:00<?, ?B/s]

description.txt:   0%|          | 0.00/703 [00:00<?, ?B/s]

training.log:   0%|          | 0.00/25.9k [00:00<?, ?B/s]

description.txt:   0%|          | 0.00/709 [00:00<?, ?B/s]

description.txt:   0%|          | 0.00/703 [00:00<?, ?B/s]

description.txt:   0%|          | 0.00/709 [00:00<?, ?B/s]

train.sh:   0%|          | 0.00/403 [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

training.log:   0%|          | 0.00/38.1k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.28G [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/428 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/25.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/21.1k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/29.3k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/32.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/4.91k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/8.98k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/13.0k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/17.1k [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/21.1k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/25.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/29.2k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/32.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/4.89k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/8.95k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/13.0k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

'/storage/ice1/0/3/sdo36/LLM-Pruner'

In [6]:
tasks = ["openbookqa", "arc_easy", "winogrande", "hellaswag", "arc_challenge", "piqa", "boolq"]
tasks = ",".join(tasks)

In [12]:
# pruned + tuned llama
path_to_pruned_weights = "./LLM-Pruner/prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin"
path_to_tuned_weights = "./LLM-Pruner/tune_log/llama3_1b_0.25_tune"
original_model = "meta-llama/Llama-3.2-1B-Instruct"
report_name = "llama3.2_1b_0.25_eval_jd.json"

In [9]:
# pruned + tuned layerskip
path_to_pruned_weights = "./LLM-Pruner/prune_log/layerskip_1b_prune_0.25/pytorch_model.bin"
path_to_tuned_weights = "./LLM-Pruner/tune_log/layerskip_1b_0.25_tune"
original_model = "facebook/layerskip-llama3.2-1B"
report_name = "llama3.2_1b_0.25_layerskip_eval.json"

In [10]:
os.environ['PATH_TO_PRUNED_WEIGHTS'] = path_to_pruned_weights
os.environ['PATH_TO_TUNED_WEIGHTS'] = path_to_tuned_weights
os.environ['ORIGINAL_MODEL'] = original_model
os.environ['TASKS'] = tasks
os.environ['REPORT_NAME'] = report_name

os.environ["PYTHONPATH"] = "./LLM-Pruner"
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "true"

In [14]:
# %%bash
# export PYTHONPATH='./LLM-Pruner'
# export HF_DATASETS_TRUST_REMOTE_CODE=true
# python LLM-Pruner/lm-evaluation-harness/main.py --model hf-causal-experimental \
#        --model_args "checkpoint=$PATH_TO_PRUNED_WEIGHTS,peft=$PATH_TO_TUNED_WEIGHTS,config_pretrained=$ORIGINAL_MODEL" \
#        --tasks "$TASKS" \
#        --device cuda:0 --no_cache \
#        --output_path "../my_evaluations/$REPORT_NAME"
    
command = [
    "python", "LLM-Pruner/lm-evaluation-harness/main.py",
    "--model", "hf-causal-experimental",
    "--model_args", f"checkpoint={os.getenv('PATH_TO_PRUNED_WEIGHTS')},peft={os.getenv('PATH_TO_TUNED_WEIGHTS')},config_pretrained={os.getenv('ORIGINAL_MODEL')}",
    "--tasks", os.getenv("TASKS"),
    "--device", "cuda:0",
    "--no_cache",
    "--output_path", f"../my_evaluations/{os.getenv('REPORT_NAME')}"
]

subprocess.run(command)

  warn(


Selected Tasks: ['arc_easy', 'openbookqa', 'piqa', 'arc_challenge', 'boolq', 'hellaswag', 'winogrande']
Load from Pruned Model: ./LLM-Pruner/prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin
Config:  LoraConfig(peft_type='LORA', base_model_name_or_path='meta-llama/Llama-3.2-1B-Instruct', task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)
Load from adapter: adapter_model.bin


Generating train split: 100%|██████████| 2251/2251 [00:00<00:00, 31214.47 examples/s]
Generating test split: 100%|██████████| 2376/2376 [00:00<00:00, 154279.22 examples/s]
Generating validation split: 100%|██████████| 570/570 [00:00<00:00, 97933.53 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Generating train split: 100%|██████████| 1119/1119 [00:00<00:00, 109833.99 examples/s]
Generating test split: 100%|██████████| 1172/1172 [00:00<00:00, 115087.31 examples/s]
Generating validation split: 100%|██████████| 299/299 [00:00<00:00, 62839.95 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remot

Running loglikelihood requests


100%|██████████| 69078/69078 [51:15<00:00, 22.46it/s] 


{
  "results": {
    "arc_easy": {
      "acc": 0.5989057239057239,
      "acc_stderr": 0.010057051106534367,
      "acc_norm": 0.5492424242424242,
      "acc_norm_stderr": 0.010209906101011107
    },
    "openbookqa": {
      "acc": 0.21,
      "acc_stderr": 0.018233620865305916,
      "acc_norm": 0.348,
      "acc_norm_stderr": 0.021323728632807504
    },
    "piqa": {
      "acc": 0.705114254624592,
      "acc_stderr": 0.010639030620157003,
      "acc_norm": 0.7002176278563657,
      "acc_norm_stderr": 0.01068968696713809
    },
    "arc_challenge": {
      "acc": 0.29180887372013653,
      "acc_stderr": 0.013284525292403503,
      "acc_norm": 0.30802047781569963,
      "acc_norm_stderr": 0.01349142951729204
    },
    "boolq": {
      "acc": 0.6155963302752293,
      "acc_stderr": 0.008508133844703916
    },
    "hellaswag": {
      "acc": 0.3889663413662617,
      "acc_stderr": 0.004865193237024056,
      "acc_norm": 0.49950209121688904,
      "acc_norm_stderr": 0.0049897789373803

Traceback (most recent call last):
  File "/storage/ice1/0/3/sdo36/LLM-Pruner/lm-evaluation-harness/main.py", line 113, in <module>
    main()
  File "/storage/ice1/0/3/sdo36/LLM-Pruner/lm-evaluation-harness/main.py", line 100, in main
    os.makedirs(directory_path)
  File "/usr/lib/python3.10/os.py", line 225, in makedirs
    mkdir(name, mode)
PermissionError: [Errno 13] Permission denied: '../my_evaluations'


CompletedProcess(args=['python', 'LLM-Pruner/lm-evaluation-harness/main.py', '--model', 'hf-causal-experimental', '--model_args', 'checkpoint=./LLM-Pruner/prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin,peft=./LLM-Pruner/tune_log/llama3_1b_0.25_tune,config_pretrained=meta-llama/Llama-3.2-1B-Instruct', '--tasks', 'openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq', '--device', 'cuda:0', '--no_cache', '--output_path', '../my_evaluations/llama3.2_1b_0.25_eval_jd.json'], returncode=1)

In [12]:
command = [
    "python", "LLM-Pruner/lm-evaluation-harness/main.py",
    "--model", "hf-causal-experimental",
    "--model_args", f"checkpoint={os.getenv('PATH_TO_PRUNED_WEIGHTS')},peft={os.getenv('PATH_TO_TUNED_WEIGHTS')},config_pretrained={os.getenv('ORIGINAL_MODEL')}",
    "--tasks", os.getenv("TASKS"),
    "--device", "cuda:0",
    "--no_cache",
    "--output_path", f"./LLM-Pruner/my_evaluations/{os.getenv('REPORT_NAME')}"
]

subprocess.run(command)

  warn(


Selected Tasks: ['arc_easy', 'hellaswag', 'arc_challenge', 'boolq', 'openbookqa', 'winogrande', 'piqa']
Load from Pruned Model: ./LLM-Pruner/prune_log/layerskip_1b_prune_0.25/pytorch_model.bin
Config:  LoraConfig(peft_type='LORA', base_model_name_or_path='facebook/layerskip-llama3.2-1B', task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)
Load from adapter: adapter_model.bin


Generating train split: 100%|██████████| 2251/2251 [00:00<00:00, 207020.53 examples/s]
Generating test split: 100%|██████████| 2376/2376 [00:00<00:00, 254038.24 examples/s]
Generating validation split: 100%|██████████| 570/570 [00:00<00:00, 122987.46 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Generating train split: 100%|██████████| 1119/1119 [00:00<00:00, 84764.79 examples/s]
Generating test split: 100%|██████████| 1172/1172 [00:00<00:00, 97625.25 examples/s]
Generating validation split: 100%|██████████| 299/299 [00:00<00:00, 41457.75 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remot

Running loglikelihood requests


100%|██████████| 69078/69078 [39:15<00:00, 29.33it/s]


{
  "results": {
    "arc_easy": {
      "acc": 0.5795454545454546,
      "acc_stderr": 0.010129114278546531,
      "acc_norm": 0.51010101010101,
      "acc_norm_stderr": 0.010257689687458363
    },
    "hellaswag": {
      "acc": 0.40201155148376816,
      "acc_stderr": 0.004893022130229098,
      "acc_norm": 0.5118502290380402,
      "acc_norm_stderr": 0.004988379805261168
    },
    "arc_challenge": {
      "acc": 0.26535836177474403,
      "acc_stderr": 0.012902554762313966,
      "acc_norm": 0.2883959044368601,
      "acc_norm_stderr": 0.013238394422428176
    },
    "boolq": {
      "acc": 0.5648318042813456,
      "acc_stderr": 0.008671229580582113
    },
    "openbookqa": {
      "acc": 0.21,
      "acc_stderr": 0.018233620865305916,
      "acc_norm": 0.344,
      "acc_norm_stderr": 0.02126575803797874
    },
    "winogrande": {
      "acc": 0.5351223362273086,
      "acc_stderr": 0.014017773120881595
    },
    "piqa": {
      "acc": 0.705658324265506,
      "acc_stderr": 0.01

CompletedProcess(args=['python', 'LLM-Pruner/lm-evaluation-harness/main.py', '--model', 'hf-causal-experimental', '--model_args', 'checkpoint=./LLM-Pruner/prune_log/layerskip_1b_prune_0.25/pytorch_model.bin,peft=./LLM-Pruner/tune_log/layerskip_1b_0.25_tune,config_pretrained=facebook/layerskip-llama3.2-1B', '--tasks', 'openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq', '--device', 'cuda:0', '--no_cache', '--output_path', './LLM-Pruner/my_evaluations/llama3.2_1b_0.25_layerskip_eval.json'], returncode=0)

## Testing MACs, Params and Memory
- base models
- pruned models

In [11]:
# modified test_speedup.py
!python LLM-Pruner/test_speedup.py --model_type pretrain --base_model meta-llama/Llama-3.2-1B-Instruct

  warn(
LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layer

In [16]:
!python LLM-Pruner/test_speedup.py --model_type pruneLLM --ckpt ./LLM-Pruner/prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin

  warn(
LlamaForCausalLM(
  1068.5 M, 99.994% Params, 68.39 GMac, 99.990% MACs, 
  (model): LlamaModel(
    805.83 M, 75.412% Params, 51.58 GMac, 75.412% MACs, 
    (embed_tokens): Embedding(0, 0.000% Params, 0.0 Mac, 0.000% MACs, 128256, 2048)
    (layers): ModuleList(
      (0-1): 2 x LlamaDecoderLayer(
        60.82 M, 5.691% Params, 3.89 GMac, 5.691% MACs, 
        (self_attn): LlamaAttention(
          10.49 M, 0.981% Params, 671.09 MMac, 0.981% MACs, 
          (q_proj): Linear(4.19 M, 0.393% Params, 268.44 MMac, 0.392% MACs, in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(1.05 M, 0.098% Params, 67.11 MMac, 0.098% MACs, in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(1.05 M, 0.098% Params, 67.11 MMac, 0.098% MACs, in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(4.19 M, 0.393% Params, 268.44 MMac, 0.392% MACs, in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
       

In [17]:
!python LLM-Pruner/test_speedup.py --model_type pretrain --base_model facebook/layerskip-llama3.2-1B

  warn(
LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at facebook/layerskip-llama3.2-1B and are newly initialized: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.

In [19]:
!python LLM-Pruner/test_speedup.py --model_type pruneLLM --ckpt ./LLM-Pruner/prune_log/layerskip_1b_prune_0.25/pytorch_model.bin

  warn(
LlamaForCausalLM(
  1068.5 M, 99.994% Params, 68.39 GMac, 99.990% MACs, 
  (model): LlamaModel(
    805.83 M, 75.412% Params, 51.58 GMac, 75.412% MACs, 
    (embed_tokens): Embedding(0, 0.000% Params, 0.0 Mac, 0.000% MACs, 128256, 2048)
    (layers): ModuleList(
      (0-1): 2 x LlamaDecoderLayer(
        60.82 M, 5.691% Params, 3.89 GMac, 5.691% MACs, 
        (self_attn): LlamaAttention(
          10.49 M, 0.981% Params, 671.09 MMac, 0.981% MACs, 
          (q_proj): Linear(4.19 M, 0.393% Params, 268.44 MMac, 0.392% MACs, in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(1.05 M, 0.098% Params, 67.11 MMac, 0.098% MACs, in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(1.05 M, 0.098% Params, 67.11 MMac, 0.098% MACs, in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(4.19 M, 0.393% Params, 268.44 MMac, 0.392% MACs, in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
       

In [11]:
# !rm -rf ~/.cache/huggingface ~/.cache/torch /tmp/* __pycache__