In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
modelpath = "/home/vbot/models/Mistral-7B-ascii-art-qlora"

model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.58s/it]


In [4]:
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)   

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
prompt = "Topic: Whale\nArt:"

input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [20]:
out = model.generate(input_ids, max_length=100, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
out_text = tokenizer.decode(out[0], skip_special_tokens=True)
print(out_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Topic: Whale
Art:  O. 0
   __
  /  \
  |    \
  |_____\
  \__  __)
   \_\/ /
      /
     /
jgs  /
    `
    `
    `
    `
    `
    `
    `
    `
    `
