In [1]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, HfArgumentParser,
                          TrainingArguments, logging, pipeline)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-14 12:04:14,996] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Identifier of the checkpoint; passed to both AutoModelForCausalLM.from_pretrained
# and AutoTokenizer.from_pretrained further down.
model_name = "vibhorag101/llama-2-7b-chat-hf-phr_mental_therapy_v2"

In [3]:
# Placement map: the empty-string key covers the whole model, 0 is the device index.
device_map = {"": 0}

# Toggle for 4-bit loading; fed into the quantization config below.
use_4bit = True

# bitsandbytes quantization settings used when the model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    load_in_4bit=use_4bit,
)

## QLoRA Inference with adapter

In [4]:
# Load the quantized model and its tokenizer.
# `device_map` (defined in the config cell) is passed explicitly so the
# placement declared there is actually honoured instead of being left unused.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

`low_cpu_mem_usage` was None, now set to True since model is quantized.

BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64
Loading: libbitsandbytes_cuda118.so


  warn(
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.82s/it]


In [6]:
# Conversation history, seeded with the system prompt. User and assistant
# turns are appended to this list by chat_with_model().
# NOTE(review): there is no space between "content." and "Please" in the
# prompt text; it is reproduced verbatim here -- confirm whether it is intended.
conv = [
    {
        "content": (
            "You are a helpful and joyous mental therapy assistant. "
            "Always answer as helpfully and cheerfully as possible, while being safe. "
            "Your answers should not include any harmful, unethical, racist, sexist, "
            "toxic, dangerous, or illegal content."
            "Please ensure that your responses are socially unbiased and positive in nature."
            "\n\n"
            "If a question does not make any sense, or is not factually coherent, "
            "explain why instead of answering something not correct. "
            "If you don't know the answer to a question, please don't share false information."
        ),
        "role": "system",
    }
]

In [7]:
def generate_response(conv, model, tokenizer, max_new_tokens=1024):
    """Sample the assistant's next reply for a chat conversation.

    Args:
        conv: List of ``{"role": ..., "content": ...}`` message dicts; it is
            rendered with ``tokenizer.apply_chat_template``.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer belonging to ``model``; must provide a chat template.
        max_new_tokens: Upper bound on the number of tokens generated for the
            reply. Prompt tokens are not counted, so a long history does not
            shrink the reply budget.

    Returns:
        The decoded reply text, with the prompt tokens stripped and special
        tokens skipped.
    """
    # With tokenize=False only the chat template is applied: the result is a
    # string, not token ids.
    prompt = tokenizer.apply_chat_template(conv, tokenize=False)

    # The rendered prompt is expected to contain the template's own special
    # tokens already, so they must not be added a second time here
    # (add_special_tokens=False avoids a duplicated BOS token).
    # Calling the tokenizer instead of tokenizer.encode() also yields the
    # attention_mask, which is forwarded to generate().
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # Follow the model's device instead of hard-coding 'cuda'.
    target_device = model.device
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.95,
        top_k=60,
        temperature=1,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    reply_ids = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

In [8]:
def chat_with_model(user_input,conv,model,tokenizer):
    """Run one chat turn: send ``user_input``, print the reply, update ``conv``.

    Args:
        user_input: Text of the user's message.
        conv: Mutable list of message dicts holding the conversation so far.
            On success the user turn and the assistant turn are appended to it.
        model: Model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.

    The history is only modified after generation has succeeded. If
    ``generate_response`` raises (or the cell is interrupted), ``conv`` is left
    exactly as it was, so a retry does not stack a second user message on top
    of an unanswered one.
    """
    user_turn = {"content": user_input, "role": "user"}
    # Generate against a temporary copy that includes the new user turn.
    response = generate_response([*conv, user_turn], model, tokenizer)
    print("Model:", response)
    # Commit both turns together now that a reply exists.
    conv.extend([user_turn, {"content": response, "role": "assistant"}])

In [9]:
# First user turn; the reply is printed and both turns are stored in `conv`.
chat_with_model(
    user_input="Hi, can we please talk?",
    conv=conv,
    model=model,
    tokenizer=tokenizer,
)

Model: Hello, I'm here for you. What's been on your mind? 


In [8]:
# Decode a single token id; the surrounding dots make any leading or
# trailing whitespace in the decoded piece visible.
print(f".{tokenizer.decode(29879)}.")

.s.


In [5]:
# Show the token ids produced for "it's" as a tensor.
# NOTE(review): the printed ids start with 1, presumably the BOS token that
# encode() prepends by default -- confirm against the tokenizer config.
print(tokenizer.encode(text="it's", return_tensors="pt"))

tensor([[    1,   372, 29915, 29879]])
