In [12]:
import argparse
import torch
import json

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

import functions
from prompter import PromptManager
from validator import validate_function_call_schema

from utils import (
    print_nous_text_art,
    inference_logger,
    get_assistant_message,
    get_chat_template,
    validate_and_extract_tool_calls
)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# --- Configuration: everything a reader might tune lives at the top of the cell ---
model_path = 'NousResearch/Hermes-2-Pro-Mistral-7B'

# Kept as the strings "True"/"False" (not a bool) because the check below
# compares against the literal "True"; other cells may rely on the string form.
load_in_4bit = "False"

# Template name handed to `get_chat_template` when the tokenizer ships without
# a chat template. Previously this name was never assigned in the notebook, so
# the fallback branch below raised NameError on a fresh kernel.
# NOTE(review): "chatml" is assumed to be a name `utils.get_chat_template`
# understands (Hermes-2-Pro is a ChatML-format model) -- confirm against utils.
chat_template = "chatml"

prompter = PromptManager()

# Quantization stays disabled (None) unless 4-bit loading is explicitly requested.
bnb_config = None

if load_in_4bit == "True":
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

# --- Model ---
# `quantization_config=None` loads the unquantized weights in float16.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    return_dict=True,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Reuse the EOS token as the padding token and pad on the left side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Fall back to a project-supplied template only when the tokenizer has none.
if tokenizer.chat_template is None:
    print("No chat template defined, getting chat_template...")
    tokenizer.chat_template = get_chat_template(chat_template)

# Record the effective model / generation / tokenizer settings in the log.
inference_logger.info(model.config)
inference_logger.info(model.generation_config)
inference_logger.info(tokenizer.special_tokens_map)

2024-04-18:11:22:34,598 INFO     [modeling.py:940] We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.40s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-04-18:11:22:45,862 INFO     [2517625324.py:32] MistralConfig {
  "_name_or_path": "NousResearch/Hermes-2-Pro-Mistral-7B",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "