In [5]:
import os

# Expose only GPUs 0 and 1 to this process. This assignment is deliberately
# placed ahead of `import torch` so the variable is already set by the time
# CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import gc

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from fusion_bench.method.pruning.wanda_utils.eval import eval_ppl
from fusion_bench.models.modeling_losparse_llama import LoSparseLlamaForCausalLM
from fusion_bench.utils import print_parameters

In [6]:
from fusion_bench.models.modeling_losparse_llama.modeling_losparse_llama import (
    LoSparseLinear,
    LoSparseLlamaForCausalLM,
)


def model_eval_ppl(model_path):
    """Load a causal-LM checkpoint and report its perplexity.

    The checkpoint is loaded in float16 with ``device_map="auto"``, its
    parameter counts are printed via ``print_parameters``, and ``eval_ppl``
    is run under ``torch.no_grad()``. The model is released again before
    the function returns (or propagates an exception) so that consecutive
    evaluations in the same kernel do not pile up on the GPUs.

    Args:
        model_path: Checkpoint location, passed to both
            ``AutoModelForCausalLM.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``eval_ppl`` returns; the runs recorded in this notebook
        show a float perplexity.
    """
    # Reclaim anything a previous evaluation left behind before loading.
    gc.collect()
    torch.cuda.empty_cache()

    model = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
        )
        print_parameters(model)
        # eval_ppl only receives the model and tokenizer, so the sequence
        # length is attached to the model here
        # (assumes eval_ppl reads `model.seqlen` -- TODO confirm).
        model.seqlen = model.config.max_position_embeddings
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

        with torch.no_grad():
            result = eval_ppl(model, tokenizer)

        print(f"PPL for {model_path}: {result}")
        return result
    finally:
        # Drop the local reference even when loading or evaluation raised:
        # otherwise the traceback's frame keeps the model alive and the next
        # call can run out of GPU memory. Then collect reference cycles and
        # hand the cached CUDA blocks back.
        del model
        gc.collect()
        torch.cuda.empty_cache()

# Dense

In [3]:
# Baseline: perplexity of the checkpoint stored under llama-13b/Llama-2-13b-hf.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/Llama-2-13b-hf"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/Llama-2-13b-hf: 4.573723793029785


4.573723793029785

# Magnitude

In [7]:
# Perplexity of the checkpoint stored under magnitude/unstructured/0.5.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.5"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


trainable params: 13.02B || all params: 13.02B || trainable%: 100.0000
evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.5: 5.9772844314575195


5.9772844314575195

In [3]:
# Perplexity of the checkpoint stored under magnitude/unstructured/0.6.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.6"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.6: 9.907220840454102


9.907220840454102

In [4]:
# Perplexity of the checkpoint stored under magnitude/unstructured/0.7.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.7"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/unstructured/0.7: 408.7518310546875


408.7518310546875

In [5]:
# Perplexity of the checkpoint stored under magnitude/semistructured/2_4.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/semistructured/2_4"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/semistructured/2_4: 8.319043159484863


8.319043159484863

In [6]:
# Perplexity of the checkpoint stored under magnitude/semistructured/4_8.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/semistructured/4_8"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/magnitude/semistructured/4_8: 6.7590107917785645


6.7590107917785645

# Sparselo

## Magnitude

In [7]:
# Perplexity of the checkpoint stored under sparselo/magnitude/unstructured/0.5.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.5"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.5: 5.726884841918945


5.726884841918945

In [5]:
# Perplexity of the checkpoint stored under sparselo/magnitude/unstructured/0.6.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.6"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.6: 8.883753776550293


8.883753776550293

In [6]:
# Perplexity of the checkpoint stored under sparselo/magnitude/unstructured/0.7.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.7"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/unstructured/0.7: 163.96058654785156


163.96058654785156

In [8]:
# Perplexity of the checkpoint stored under sparselo/magnitude/semistructured/2_4.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/semistructured/2_4"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/semistructured/2_4: 8.864727973937988


8.864727973937988

In [9]:
# Perplexity of the checkpoint stored under sparselo/magnitude/semistructured/4_8.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/semistructured/4_8"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/sparselo/magnitude/semistructured/4_8: 6.580754280090332


6.580754280090332

# Iterative Sparselo

## Magnitude

In [10]:
# Perplexity of the checkpoint stored under iterative_sparselo/magnitude/unstructured/0.5.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.5"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.5: 5.648659706115723


5.648659706115723

In [3]:
# Perplexity of the checkpoint stored under iterative_sparselo/magnitude/unstructured/0.6.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.6"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


trainable params: 13.52B || all params: 13.52B || trainable%: 100.0000
evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.6: 8.832003593444824


8.832003593444824

In [4]:
# Perplexity of the checkpoint stored under iterative_sparselo/magnitude/unstructured/0.7.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.7"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


trainable params: 13.52B || all params: 13.52B || trainable%: 100.0000
evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/unstructured/0.7: 99.2710189819336


99.2710189819336

In [3]:
# Perplexity of the checkpoint stored under iterative_sparselo/magnitude/semistructured/2_4.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/semistructured/2_4"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/semistructured/2_4: 7.764371395111084


7.764371395111084

In [4]:
# Perplexity of the checkpoint stored under iterative_sparselo/magnitude/semistructured/4_8.
model_eval_ppl(
    model_path="/data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/semistructured/4_8"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


evaluating on wikitext2


Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /data0/users/tanganke/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Aug 28 13:14:33 2024).


Generating samples:   0%|          | 0/128 [00:00<?, ?it/s]

nsamples 83
sample 0
sample 50
PPL for /data0/users/tanganke/projects/fusion_bench/outputs/llama-13b/iterative_sparselo/magnitude/semistructured/4_8: 6.39985466003418


6.39985466003418