In [3]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, HfArgumentParser,
                          TrainingArguments, logging, pipeline)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-13 10:08:15,947] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# Identifier of the base checkpoint that is quantized and loaded in the inference cell.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Identifier/path from which both the tokenizer and the PEFT adapter are loaded.
new_model_name = "llama-2-7b-chat-hf-phr_mental_therapy-2"

In [5]:
# Quantization settings; `bnb_config` is handed to `from_pretrained` through
# its `quantization_config` argument when the base model is loaded.
use_4bit=True
# NOTE(review): the empty-string key presumably maps the whole model onto CUDA
# device 0 (Hugging Face `device_map` convention) -- confirm.
device_map = {"": 0}
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,  # toggled by the `use_4bit` flag above
    bnb_4bit_quant_type="nf4",  # 4-bit quantization data type requested
    bnb_4bit_compute_dtype="float16",  # dtype name requested for computation
    bnb_4bit_use_double_quant=False,  # double (nested) quantization switched off
)

## QLoRA Inference with adapter

In [6]:
# Load the base checkpoint with the 4-bit bitsandbytes configuration.
# `device_map` (defined next to the quantization settings) is passed explicitly
# so the declared placement is actually applied rather than left to defaults.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
# The tokenizer comes from the fine-tuned model's location; the embedding matrix
# is resized to that tokenizer's vocabulary size before the adapter is attached.
tokenizer = AutoTokenizer.from_pretrained(new_model_name)
base_model.resize_token_embeddings(len(tokenizer))
# Wrap the quantized base model with the adapter weights stored under `new_model_name`.
model = PeftModel.from_pretrained(base_model, new_model_name)

`low_cpu_mem_usage` was None, now set to True since model is quantized.

BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64
Loading: libbitsandbytes_cuda118.so


  warn(
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.66s/it]


In [7]:
# Conversation history in chat-message format, seeded with the system prompt.
# `chat_with_model` appends to this list in place, so re-run this cell to start
# a fresh conversation.
conv = [ { "content": "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.", "role": "system" }]

In [8]:
def generate_response(conv, model, tokenizer, max_new_tokens=1024):
    """Sample one assistant reply for the given conversation.

    Args:
        conv: List of ``{"role": ..., "content": ...}`` chat messages.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``decode``.
        max_new_tokens: Upper bound on the number of *generated* tokens. This
            replaces the former ``max_length=1024`` cap, which also counted the
            prompt and therefore shrank (eventually to nothing) as the
            conversation history grew.

    Returns:
        The decoded reply text, without the prompt and without special tokens.
    """
    # Render the conversation to text and ask the template for the assistant
    # header (a no-op for templates that do not define a generation prompt).
    prompt = tokenizer.apply_chat_template(
        conv,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The chat template already emits the special tokens (e.g. BOS); encoding
    # with the default add_special_tokens=True would prepend them a second time.
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

    # Follow the model's own device instead of hard-coding 'cuda'.
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Keyword arguments only: wrapped models do not all accept positional
    # input_ids, and the explicit mask avoids generate() having to guess it.
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.95,
        top_k=60,
        temperature=0.9,
    )

    # Drop the echoed prompt tokens and decode only the newly generated tail.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

In [9]:
def chat_with_model(user_input, conv, model, tokenizer):
    """Run one chat turn: generate a reply, print it, and record both turns.

    Args:
        user_input: Text of the user's message.
        conv: Mutable conversation history (list of role/content dicts). It is
            extended in place with the user turn and the assistant reply.
        model: Model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.

    The history is only modified after generation has succeeded. If
    ``generate_response`` raises or is interrupted, ``conv`` is left exactly as
    it was, so no dangling user message remains to corrupt later turns.
    """
    user_turn = {"content": user_input, "role": "user"}
    # Generate from a temporary list so a failure cannot leave `conv` half-updated.
    response = generate_response([*conv, user_turn], model, tokenizer)
    print("Model:", response)
    conv.extend((user_turn, {"content": response, "role": "assistant"}))

In [10]:
# Send one user turn. The reply is sampled (do_sample=True in generate_response),
# so the printed text differs between runs; both turns are appended to `conv`.
chat_with_model("I am feeling sad",conv,model,tokenizer)

Model: Hi , I appreciate your trust in me. I'm here to listen and help you in any way I can. Can you tell me more about what makes you feel sad? 
