<a href="https://colab.research.google.com/github/vincentmin/llama-2-qlora/blob/main/llama_2_7b_best_of_n_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q scipy tensorboard datasets xformers \
  transformers \
  peft \
  accelerate \
  trl \
  bitsandbytes \
  einops \
  optimum \
  evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Authenticate against the Hugging Face Hub; in a notebook this renders an
# interactive login widget.
# NOTE(review): presumably needed because the base model is gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from typing import Any, Callable, List, Optional, Union
import torch
from transformers import (AutoModelForCausalLM,
                          AutoModelForSequenceClassification,AutoTokenizer,
                          GenerationConfig, PreTrainedModel,
                          PreTrainedTokenizer, PreTrainedTokenizerFast)
from peft import PeftConfig, PeftModel
from trl.core import set_seed

class BestOfNSampler(object):
    r"""
    Best-of-n sampling: for every query, draw `sample_size` completions from a
    generation model, score each full sequence with a reward model and keep the
    `n_candidates` highest-scoring ones.
    """

    def __init__(
        self,
        model: PreTrainedModel,
        reward_model: PreTrainedModel,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
        sample_size: int = 4,
        seed: Optional[int] = None,
        n_candidates: int = 1,
        generation_config: Optional[GenerationConfig] = None,
    ) -> None:
        r"""
        Initialize the sampler for best-of-n generation

        Args:
            model (`PreTrainedModel`):
                The pretrained model to use for generation
            reward_model (`PreTrainedModel`):
                The pretrained model to use for reward prediction
            tokenizer (`PreTrainedTokenizer` or `PreTrainedTokenizerFast`):
                Tokenizer associated with the pretrained models. We assume the same tokenizer is used for both the generation and reward model.
            sample_size (`int`):
                Number of samples to generate for each query. Must be at least 1.
            seed (`int`, *optional*):
                Random seed used to control generation. It is applied once, at construction time.
            n_candidates (`int`):
                Number of candidates to return for each query. Must satisfy `1 <= n_candidates <= sample_size`.
            generation_config (`GenerationConfig`, *optional*):
                Generation config passed to the underlying model's `generate` method.
                See `GenerationConfig` (https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig) for more details

        Raises:
            ValueError: if `tokenizer` has an unsupported type, or if `sample_size` / `n_candidates` are out of range.
        """
        if seed is not None:
            set_seed(seed)

        if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
            raise ValueError(
                f"tokenizer must be a PreTrainedTokenizer or PreTrainedTokenizerFast, got {type(tokenizer)}"
            )
        if sample_size < 1:
            raise ValueError(f"sample_size must be at least 1, got {sample_size}")
        # Fail early: `topk` in `generate` cannot return more candidates than were sampled.
        if not 1 <= n_candidates <= sample_size:
            raise ValueError(
                f"n_candidates must be between 1 and sample_size ({sample_size}), got {n_candidates}"
            )

        # TODO, since both model and reward_model share the same base model, we should be able to load the base model once instead of twice. This could significantly reduce memory consumption.
        self.model = model
        self.reward_model = reward_model
        self.tokenizer = tokenizer

        self.gen_config = generation_config
        self.sample_size = sample_size
        self.n_candidates = n_candidates

    @classmethod
    def from_pretrained(
        cls,
        peft_model_id: str,
        sample_size: int = 4,
        seed: Optional[int] = None,
        n_candidates: int = 1,
        generation_config: Optional[GenerationConfig] = None,
        load_in_4bit: bool=True,
        torch_dtype=torch.float16,
        use_auth_token: bool=True,
    ):
        r"""
        Build a sampler from a PEFT reward-model adapter.

        The adapter's config names the base checkpoint. That checkpoint is loaded twice:
        once as the causal LM used for generation, and once as a single-label sequence
        classifier onto which the adapter is applied to form the reward model. The
        tokenizer is loaded from the same base checkpoint.

        Args:
            peft_model_id (`str`):
                Identifier of the PEFT adapter of the reward model
            sample_size (`int`):
                Number of samples to generate for each query
            seed (`int`, *optional*):
                Random seed used to control generation
            n_candidates (`int`):
                Number of candidates to return for each query
            generation_config (`GenerationConfig`, *optional*):
                Generation config passed to the underlying model's `generate` method
            load_in_4bit (`bool`):
                Forwarded to both model `from_pretrained` calls
            torch_dtype:
                Forwarded to both model `from_pretrained` calls
            use_auth_token (`bool`):
                Forwarded to the model and tokenizer `from_pretrained` calls

        Returns:
            `BestOfNSampler`: the configured sampler
        """
        # TODO: here we assume the model and reward_model are specified by a peft_model_id. Should be more general than this.
        config = PeftConfig.from_pretrained(peft_model_id)
        base_model_id = config.base_model_name_or_path
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            load_in_4bit=load_in_4bit,
            torch_dtype=torch_dtype,
            use_auth_token=use_auth_token,
        )
        reward_model = AutoModelForSequenceClassification.from_pretrained(
            base_model_id,
            num_labels=1,
            load_in_4bit=load_in_4bit,
            torch_dtype=torch_dtype,
            use_auth_token=use_auth_token,
        )
        reward_model = PeftModel.from_pretrained(reward_model, peft_model_id)
        tokenizer = AutoTokenizer.from_pretrained(
            base_model_id,
            use_auth_token=use_auth_token,
        )
        return cls(
            model=model,
            reward_model=reward_model,
            tokenizer=tokenizer,
            sample_size=sample_size,
            seed=seed,
            n_candidates=n_candidates,
            generation_config=generation_config,
        )

    @staticmethod
    def _normalize_queries(tokenized_query) -> List[torch.Tensor]:
        r"""Convert any supported query input into a list of `(1, seq_len)` tensors, one per query."""
        if isinstance(tokenized_query, torch.Tensor):
            if tokenized_query.ndim == 1:
                return [tokenized_query.reshape((1, -1))]
            if tokenized_query.ndim == 2:
                # A 2-D tensor is a batch: one query per row (rows are used as-is, padding included).
                return [row.reshape((1, -1)) for row in tokenized_query]
            raise ValueError(
                f"tokenized_query tensor must be 1-D or 2-D, got {tokenized_query.ndim} dimensions"
            )
        if isinstance(tokenized_query, (list, tuple)):
            if len(tokenized_query) == 0:
                return []
            if isinstance(tokenized_query[0], int):
                # A flat list of token ids is a single query.
                return [torch.tensor(tokenized_query).reshape((1, -1))]
            return [torch.as_tensor(query).reshape((1, -1)) for query in tokenized_query]
        raise TypeError(
            f"tokenized_query must be a tensor or a list of token ids / tensors, got {type(tokenized_query)}"
        )

    def generate(
        self,
        tokenized_query: Union[List[int], torch.Tensor, List[torch.Tensor], List[List[int]]],
        skip_special_tokens: bool = True,
        max_new_tokens: int = 512,
        **generation_kwargs,
    ) -> List[List[dict]]:
        r"""
        Generate the best of n samples for input queries

        Args:
            tokenized_query (`List[int]` or `torch.Tensor` or `List[torch.Tensor]` or `List[List[int]]`):
                represents either a single tokenized query (a 1-D tensor or a list of integers) or a batch of tokenized queries (a 2-D tensor, a list of tensors or a list of lists of integers)
            skip_special_tokens (`bool`):
                Whether to remove the special tokens from the output
            max_new_tokens (`int`):
                Maximum number of new tokens to generate for each sample
            **generation_kwargs (`dict`, *optional*):
                Additional keyword arguments passed along to the underlying model's `generate` method.
                This is used to override generation config

        Returns:
            List[List[dict]]: one list per query holding the `n_candidates` best samples in
            descending score order. Each entry is `{"score": <0-dim tensor>, "text": <str>}`,
            where `text` is the decoded full sequence returned by the model's `generate`.

        Raises:
            TypeError: if `tokenized_query` is neither a tensor nor a list/tuple.
            ValueError: if `tokenized_query` is a tensor with more than 2 dimensions.
        """
        queries = self._normalize_queries(tokenized_query)

        result = []

        self.model.eval()
        self.reward_model.eval()
        with torch.inference_mode():
            for query in queries:
                # Repeat the single query so one `generate` call yields `sample_size` samples.
                batch = query.repeat((self.sample_size, 1))
                sequences = self.model.generate(
                    batch.to(self.model.device),
                    max_new_tokens=max_new_tokens,
                    generation_config=self.gen_config,
                    **generation_kwargs,
                )
                # Keep the batch dimension intact (no bare `.squeeze()`), so sample_size == 1 works too.
                # Only the trailing label dimension of the logits is dropped; assumes a single-label reward head.
                # NOTE(review): no attention mask is passed to the reward model, so padding appended
                # to shorter samples is scored as part of the sequence — confirm this is acceptable.
                scores = self.reward_model(sequences).logits.squeeze(-1)
                texts = self.tokenizer.batch_decode(sequences, skip_special_tokens=skip_special_tokens)

                best = scores.topk(self.n_candidates).indices.tolist()
                result.append([{"score": scores[i], "text": texts[i]} for i in best])

        return result

In [2]:
# Build the sampler from the reward-model adapter: draw 5 samples per query
# and keep the 4 highest-scoring ones.
best_of_n = BestOfNSampler.from_pretrained(
    "vincentmin/llama-2-7b-reward-oasst1",
    n_candidates=4,
    sample_size=5,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Chat-formatted prompt with an explicit BOS token, system block and user turn.
prompt = """<s>[INST] <<SYS>>
You are a helpful assistant
<</SYS>>

What do Llama's eat? [/INST]"""
# The prompt already spells out `<s>`, so do not let the tokenizer add its own special
# tokens on top; otherwise the sequence most likely starts with a duplicated BOS.
# NOTE(review): confirm by inspecting input_ids[0][:2].
input_ids = best_of_n.tokenizer([prompt], add_special_tokens=False)['input_ids']

In [4]:
%%time
best_of_n.generate(input_ids, max_new_tokens=128, temperature=1, do_sample=True)



CPU times: user 48.4 s, sys: 13.3 s, total: 1min 1s
Wall time: 1min 5s


[[{'score': tensor(2.8418, device='cuda:0', dtype=torch.float16),
   'text': "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nWhat do Llama's eat? [/INST]  Oh, that's a great question! Llamas are herbivores, which means they eat plants and plant-based foods. Their diet typically consists of a variety of grasses, leaves, and fruits, as well as hay and grains. In the wild, llamas will graze on the lush vegetation in their native habitats, such as the Andes mountains in South America.\nIn captivity, llamas are often fed a diet of hay, grains, and fresh vegetables. They may also be given supplemental nutrients, such as vitam"},
  {'score': tensor(1.8564, device='cuda:0', dtype=torch.float16),
   'text': "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nWhat do Llama's eat? [/INST]  Oh, that's a great question! Llamas are herbivores, which means they primarily eat plants and plant-based foods. Their diet typically consists of a variety of grasses, hay, and other vegetation.