In [1]:
import torch
import peft 
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from utils.dataset import template

#from bokeh.plotting import figure, show
#from bokeh.io import output_notebook

#output_notebook()

In [5]:
# Checkpoint identifier passed to the `from_pretrained` loaders for both the
# model and the tokenizer.
# NOTE(review): this cell's execution count (5) is later than that of the
# loading cell (3), which reads `model_id` — confirm the notebook still runs
# top to bottom on a fresh kernel.
model_id = 'GeneZC/MiniChat-3B'
# Device string that tokenized prompts are moved to before generation.
device = 'cuda'

In [3]:
# Quantization settings: 4-bit weights, "nf4" quant type, double quantization
# enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal LM with the quantization config above; device placement is
# delegated to the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=True,
)

# Tokenizer for the same checkpoint, requesting the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# When the tokenizer ships without a pad token, register '[PAD]' and resize
# the model's token embeddings to the new vocabulary length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Location of the LoRA adapter that is loaded on top of the base model.
lora_path = 'lora/disco-limbic-dialogue-512/'

# Switch the base model to evaluation mode (`eval()` returns the module
# itself, so no re-assignment is needed).
model.eval()

# Attach the adapter under the name 'loraTrained', not trainable.
# NOTE(review): later cells call `model.generate`, not `lora_model.generate`;
# presumably peft injects the adapter layers into `model` in place, so both
# handles behave the same — confirm.
lora_model = peft.PeftModel.from_pretrained(
    model,
    lora_path,
    is_trainable=False,
    adapter_name='loraTrained',
)

In [7]:
# Conversation turns fed to `template`, in order.
# NOTE(review): turns appear to alternate between a bracket-tagged speaker
# and the player — confirm against `utils.dataset.template`.
dialog = [
    "[Electrochemistry]: Whoa! In your hand: *pyrholidon* -- the double rainbow of synthetic hallucinogens. Rare and gritty, a product of the age of atomic power.",
    "Look at the little puck of liquid.",
    "[Electrochemistry]: What a funny little cap! Don't let the *scary* medical warnings throw you off. It's an inadequate antidote to radiation poisoning, but a *potent* antidote to *boredom*.",
    "Hmm... open the cap.",
]

# Prompt = templated history followed by the assistant marker.
query = template(dialog) + ' [|Assistant|] '

# Tokenize once and keep the attention mask alongside the ids so `generate`
# does not have to infer it.
encoded = tokenizer(query, return_tensors="pt", add_special_tokens=False,
                    return_attention_mask=True)
model_inputs = encoded.input_ids.to(device)
attention_mask = encoded.attention_mask.to(device)

# Sample up to 64 new tokens. `attention_mask` and `pad_token_id` are passed
# explicitly; the loading cell guarantees the tokenizer has a pad token.
generated_ids = model.generate(input_ids=model_inputs,
                               attention_mask=attention_mask,
                               pad_token_id=tokenizer.pad_token_id,
                               max_new_tokens=64,
                               do_sample=True,
                               temperature=0.7,
                               repetition_penalty=1.15)

# Decode the full returned sequence, keeping special tokens visible.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
print(output)



[Sight]: Go ahead and sniff it, see how hard it comes back? How long does the after-effect last?</s>


In [66]:
# Running conversation history: user inputs and model replies are appended
# in the order they occur.
dialog = []

# Sampling settings do not change between turns, so build them once.
# `pad_token_id` is passed explicitly; the loading cell guarantees the
# tokenizer has a pad token.
generation_kwargs = dict(
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.15,
    pad_token_id=tokenizer.pad_token_id,
)

# Interactive loop: 'q' quits; any other non-empty line becomes a new turn.
# An empty line adds no user turn, but the model is still asked for another
# reply on the existing history.
while True:
    inp = input('Input:')
    if inp == 'q':
        break
    if inp:
        dialog.append(inp)
        print(f'[You]: {inp}', end='\n\n')

    # Prompt = templated history followed by the assistant marker.
    query = template(dialog) + ' [|Assistant|] '

    # Tokenize and keep the attention mask so `generate` need not infer it.
    encoded = tokenizer(query, return_tensors="pt", add_special_tokens=False,
                        return_attention_mask=True)
    model_inputs = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    input_len = len(model_inputs[0])

    generated_ids = model.generate(input_ids=model_inputs,
                                   attention_mask=attention_mask,
                                   **generation_kwargs)

    # Drop the prompt tokens so only the newly generated reply is decoded.
    output = tokenizer.decode(generated_ids[0][input_len:], skip_special_tokens=True)
    dialog.append(output)
    print(output, end='\n\n')

    # Short pause before prompting again.
    # NOTE(review): presumably lets the printed reply render before the next
    # input box appears — confirm.
    time.sleep(0.5)

[Interfacing]: The pink and white ball slips out of your hand as it hits the ice -- what a *punk* moment...

[Visual Calculus]: And just look at that! A communist symbol in all its glory. It's like an army flag or something.
[Visual Calculus]: What is this crap? Are we really so lousy? Why isn't it anything other than a piece of shit?

