In [2]:
import torch
import torch.nn as nn
import tqdm
import json
import os
from datasets import load_dataset
from transformers.models.opt.modeling_opt import (
    OPTAttention,
    OPTDecoderLayer,
    OPTForCausalLM,
)
from transformers import GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM
from smoothquant.smooth import smooth_lm
from smoothquant.fake_quant_2_bits import quantize_qwen2 as quantize_qwen2_2
from smoothquant.fake_quant_4_bits import quantize_qwen2 as quantize_qwen2_4
from smoothquant.fake_quant_6_bits import quantize_qwen2 as quantize_qwen2_6
from smoothquant.fake_quant_8_bits import quantize_qwen2 as quantize_qwen2_8

In [3]:
class Evaluator:
    def __init__(self, dataset, tokenizer, device, seqlen=2048):
        """
        Initializes the Evaluator with the dataset, tokenizer, device, and sequence length.

        Args:
            dataset: The dataset to evaluate (e.g., loaded via `load_dataset`).
            tokenizer: The tokenizer to encode the dataset text.
            device: The device to run the model on (e.g., 'cuda' or 'cpu').
            seqlen: The sequence length for processing (default is 2048).
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.device = device
        self.seqlen = seqlen

        # Concatenate all text entries into a single string separated by double newlines
        concatenated_text = "\n\n".join([example["text"] for example in self.dataset])

        # Tokenize the concatenated text
        encoded = self.tokenizer(
            concatenated_text,
            return_tensors="pt",
            add_special_tokens=False
        )
        self.input_ids = encoded["input_ids"].to(self.device)
        
        # Calculate the number of full samples based on the sequence length
        self.nsamples = self.input_ids.size(1) // self.seqlen

    @torch.no_grad()
    def evaluate(self, model, output_path=None):
        """
        Evaluates the model on the dataset and computes perplexity.

        Args:
            model: The language model to evaluate.
            output_path: Optional path to save the results as a JSON file.

        Returns:
            A dictionary containing the perplexity.
        """
        model.eval()
        model.seqlen = self.seqlen  # Optional: If your model uses `seqlen` as an attribute

        nlls = []
        for i in tqdm.tqdm(range(self.nsamples), desc="Evaluating Perplexity"):
            # Slice the input_ids to get the current batch
            start_idx = i * self.seqlen
            end_idx = (i + 1) * self.seqlen
            batch = self.input_ids[:, start_idx:end_idx]

            # Forward pass through the model
            outputs = model(batch)
            lm_logits = outputs.logits

            # Shift logits and labels for next-token prediction
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = batch[:, 1:].contiguous()

            # Compute cross-entropy loss
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )

            # Accumulate negative log-likelihood
            neg_log_likelihood = loss * self.seqlen
            nlls.append(neg_log_likelihood)

        # Compute perplexity
        total_nll = torch.stack(nlls).sum()
        ppl = torch.exp(total_nll / (self.nsamples * self.seqlen))
        print(f"Perplexity: {ppl.item()}")

        # Prepare results
        results = {"ppl": ppl.item()}

        # Optionally save the results to a JSON file
        if output_path is not None:
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, "w") as f:
                json.dump(results, f, indent=2)

        return results

In [4]:
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
# dataset = dataset.select(range(1000))  # 데이터셋에서 처음 1000개를 선택
# dataset = load_dataset("lambada", split="validation[:1000]")  # LAMBADA 데이터셋
evaluator = Evaluator(dataset, tokenizer, "cuda", seqlen=2048)


In [6]:
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b", torch_dtype=torch.float16, device_map="auto"
).to("cuda")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.


OutOfMemoryError: CUDA out of memory. Tried to allocate 394.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 5.25 MiB is free. Process 2090505 has 16.97 GiB memory in use. Including non-PyTorch memory, this process has 6.66 GiB memory in use. Of the allocated memory 6.28 GiB is allocated by PyTorch, and 3.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [20]:
opt_model = evaluator.evaluate(model)
print(f"Qwen2.5 0.5B model accuracy: {opt_model}")

Qwen2.5 0.5B model accuracy: 0.666


In [29]:
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B", torch_dtype=torch.float16, device_map="auto"
)

act_scales = torch.load("act_scales/Qwen2.5-0.5b.pt")
smooth_lm(model, act_scales, 0.85)
model_smoothquant = quantize_qwen2_2(model).to("cuda:0") 

  act_scales = torch.load("act_scales/Qwen2.5-0.5b.pt")


In [30]:
acc_smoothquant = evaluator.evaluate(model_smoothquant)
print(f"SmoothQuant W8A8 quantized model accuracy: {acc_smoothquant}")

SmoothQuant W8A8 quantized model accuracy: 0.0
