In [None]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, HfArgumentParser,
                          TrainingArguments, logging, pipeline)
from trl import SFTTrainer

In [None]:
# Hub identifiers used throughout the notebook: the dataset to fine-tune on,
# the base checkpoint to start from, and the name the fine-tuned weights are
# saved / pushed under.
dataset_name = "vibhorag101/phr-mental-therapy-dataset-openai-format"
base_model = "meta-llama/Llama-2-13b-chat-hf"
new_model = "llama-2-13b-chat-hf-phr_mental_therapy-2"

In [None]:
# Hyperparameters
# Sequence length cap handed to the SFT trainer.
max_seq_length = 1024
# Number of passes over the training split.
num_train_epochs = 3
# Same per-device batch size for the training and evaluation loops.
per_device_train_batch_size = per_device_eval_batch_size = 2

In [None]:
# Fetch the dataset by its hub id; later cells index the result with the
# "train" and "val" split names.
dataset = load_dataset(path=dataset_name)

In [None]:
# QLoRA parameters and bits and bytes
lora_r = 64            # forwarded to LoraConfig(r=...)
lora_alpha = 16        # forwarded to LoraConfig(lora_alpha=...)
lora_dropout = 0.1     # forwarded to LoraConfig(lora_dropout=...)
use_4bit = True        # forwarded to BitsAndBytesConfig(load_in_4bit=...)
bnb_4bit_compute_dtype = "float16"  # name of a torch dtype, resolved below
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
# Mixed-precision switches forwarded to TrainingArguments.
fp16 = False
bf16 = True

### Step Parameters ###
save_steps = 1000
logging_steps = 100
eval_steps = 1000

### Model Parameters ###
output_dir = "./results"
max_grad_norm = 0.3
learning_rate = 2e-5
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
device_map = {"": 0}  # passed as `device_map` when loading the quantized model
# Turn the dtype name into the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Suggest bf16 when the GPU supports it. The hint is only meaningful when
# bf16 is not already enabled, and the capability query is only attempted when
# a CUDA device is present so this configuration cell does not crash on a
# CUDA-less host.
if (
    compute_dtype == torch.float16
    and use_4bit
    and not bf16
    and torch.cuda.is_available()
):
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [None]:
# bitsandbytes quantization settings; this object is later passed as
# `quantization_config` when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

# LoRA adapter settings handed to the SFT trainer via `peft_config`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# Trainer configuration. Evaluation and checkpointing both run on a step
# schedule (every `eval_steps` / `save_steps` steps) and metrics are reported
# to Weights & Biases.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Forward the eval batch size from the hyperparameter cell; without this
    # argument the library default is used instead of the configured value.
    per_device_eval_batch_size=per_device_eval_batch_size,
    evaluation_strategy="steps",
    # Spelled out because load_best_model_at_end needs the save strategy to
    # match the evaluation strategy ("steps" is also the library default).
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    # NOTE(review): per the TrainingArguments documentation this field is not
    # consumed by Trainer itself; to actually resume, the value has to be
    # passed to trainer.train(resume_from_checkpoint=...). Confirm the intent.
    resume_from_checkpoint=True,
    load_best_model_at_end=True,
    optim=optim,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
)

In [None]:
# Load the tokenizer and the quantized base model for training. The two loads
# are independent of each other.
# NOTE(review): a later cell states the default tokenizer has no pad token;
# confirm the trainer copes with that during training.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=device_map,
    quantization_config=bnb_config,
)

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    max_seq_length=max_seq_length,
)

# Run the fine-tuning loop, then write the trained model to the `new_model`
# directory; the later cells load it from there with PeftModel.from_pretrained.
trainer.train()
trainer.model.save_pretrained(new_model)

In [None]:
# Reload model in FP16 and merge it with LoRA weights.
# The reloaded model gets its own name so that `base_model` keeps holding the
# checkpoint id string: the tokenizer load below needs that string, and so do
# re-runs of this cell and any later cell that loads from `base_model`.
base_model_fp16 = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",  # device map = "cpu", for merging on cpu
)
# Attach the adapter saved under `new_model`, then fold it into the base weights.
model = PeftModel.from_pretrained(base_model_fp16, new_model).merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model)


In [None]:
## for inference without merging qlora weights with original
# The quantized base model gets its own name so that `base_model` keeps holding
# the checkpoint id string, which the tokenizer load below (and any re-run of
# this cell) requires.
base_model_4bit = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map=device_map
)
# Wrap the quantized base model with the adapter saved under `new_model`.
model = PeftModel.from_pretrained(base_model_4bit, new_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)
# We need to add the pad token, as the default tokenizer does not have it.
# NOTE(review): this adds a token to the tokenizer without resizing the
# model's embeddings - confirm the pad id is never fed to the model, or reuse
# an existing token as the pad token instead.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
# Upload the model first and then the tokenizer, both under the `new_model`
# repository name.
for artifact in (model, tokenizer):
    artifact.push_to_hub(new_model, use_temp_dir=False)

In [None]:
# Silence everything below CRITICAL from the transformers logger.
logging.set_verbosity(logging.CRITICAL)

SYSTEM_PROMPT = "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.If you don't know the answer to a question, please don't share false information. Always try to be as cheerfull as possible"
prompt = "I am feeling suicidal.I have lost a lot of money in gambling. The money I used was taken as a loan"

# Run text generation pipeline with our next model
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1000,
    temperature=0.9,
)

# Wrap the system prompt and the user message in the instruction tags.
# NOTE(review): confirm this template matches the format used during training.
full_prompt = f"<s>[INST]<<SYS>>{SYSTEM_PROMPT}<</SYS>> {prompt} [/INST]"
result = pipe(full_prompt)

# The pipeline returns a list; show the text of its first entry.
generated_text = result[0]['generated_text']
print(generated_text)