In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git
!pip install -q scipy

In [2]:
import torch
# Sanity check that a CUDA device is visible: later cells load the model with
# load_in_8bit=True and move the tokenized inputs to 'cuda'.
torch.cuda.is_available()

True

In [3]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM



In [4]:
# Hub id of the base checkpoint; the model and its tokenizer are both loaded from it.
base_model_id = "NousResearch/Llama-2-7b-chat-hf"

# Load the causal LM with 8-bit weights; device_map='auto' leaves module
# placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map='auto',
    load_in_8bit=True,
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [5]:
# Freeze every weight of the loaded model so that only adapters added later
# can receive gradients. One-dimensional parameters (presumably norm weights
# and biases — verify) are additionally up-cast to float32.
for weight in model.parameters():
  weight.requires_grad = False
  is_one_dimensional = weight.ndim == 1
  if is_one_dimensional:
    weight.data = weight.data.to(torch.float32)

# Turn on gradient checkpointing, and make the input embeddings' output
# require grad so gradients can still flow although the base weights are frozen.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose forward output is converted to float32."""

  def forward(self, x):
    # Run the wrapped modules as usual, then up-cast the result.
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the output head so that whatever it produces is returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

In [6]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``, yielding ``(name, param)``
            pairs where each ``param`` has ``numel()`` and ``requires_grad``.

    Returns:
        None. One summary line is written to stdout.

    A model with no parameters reports ``trainable%: 0.0`` instead of raising
    ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [7]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-16 updates scaled by lora_alpha=32, with 5%
# dropout on the adapter path and no bias terms trained.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Attention projections to adapt, spelled out explicitly. The previously
    # commented-out "query_key_value" is not a Llama module name; q_proj and
    # v_proj are what PEFT picks for Llama when target_modules is omitted, so
    # this is expected to leave the trainable-parameter count unchanged.
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the frozen base model with the LoRA adapters and report how many
# weights will actually be trained.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165


In [8]:
import pandas as pd
from datasets import load_dataset, Dataset
# Read the IVR question CSV and wrap it as a Hugging Face Dataset.
# NOTE(review): the path is a hard-coded Kaggle input location; this cell only
# runs where that dataset is mounted.
df = pd.read_csv("/kaggle/input/ivr-hedis/IVR_Questions.csv")
dataset = Dataset.from_pandas(df)

  if _pandas_api.is_sparse(col):


In [9]:
# Show the dataset's column names and row count.
print(dataset)

Dataset({
    features: ['Category', 'Hedis Measures', 'IVR'],
    num_rows: 46
})


In [10]:
import transformers

def generate_prompt(hedis_measure: str, question:str) -> str:
  """Build the training prompt for one example.

  Args:
    hedis_measure: Text placed under the "### Hedis Measure:" heading.
    question: Text placed under the "### SMS:" heading.

  Returns:
    The fixed instruction header followed by the two filled-in sections.
  """
  instruction = (
      "### INSTRUCTION\n"
      "Below is the Hedis Measure and IVR survey questions for a customer. "
      "Please write an IVR message for informing customer about their hedis measure.\n\n"
  )
  return f"{instruction}### Hedis Measure:\n{hedis_measure}\n### SMS:\n{question}"

# Tokenize one formatted prompt per row. map() is called without batched=True,
# so the callable receives a single record at a time.
mapped_dataset = dataset.map(
    lambda row: tokenizer(generate_prompt(row['Hedis Measures'], row['IVR']))
)

  0%|          | 0/46 [00:00<?, ?ex/s]

In [11]:
# Hyper-parameters for the fine-tuning run.
# NOTE(review): warmup_steps=100 is larger than the 10 optimizer steps shown in
# this cell's training log, so the learning-rate schedule probably never leaves
# warm-up — confirm, and consider a shorter warm-up.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    report_to='tensorboard',
    logging_steps=1,
    num_train_epochs=10,
    max_steps=-1,
    per_device_train_batch_size=20,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    warmup_steps=100,
    fp16=True,
)

# mlm=False: collate plain causal-LM batches rather than masked-LM ones.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=mapped_dataset,
    data_collator=causal_lm_collator,
)

# Switch off the generation cache for training (it is typically incompatible
# with the gradient checkpointing enabled earlier).
model.config.use_cache = False
with torch.autocast("cuda"):
    trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.4574
2,0.0
3,1.6417
4,1.6512
5,0.8388
6,2.3943
7,1.5917
8,0.79
9,0.0
10,1.5666


In [12]:
# Save the trained model to a local directory; the next cell reloads it from there.
trainer.model.save_pretrained('./ivr_model_llma_final')

In [13]:
# NOTE(review): model_name is not referenced by any cell visible here; the
# adapter directory is taken from peft_model_id instead — confirm it is unused
# before removing it.
model_name = "ivr_model_llma_final"

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory the adapter was saved to by save_pretrained().
peft_model_id = "ivr_model_llma_final/"

# The saved adapter config records which base checkpoint it belongs to.
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Reload that base model in 8-bit, together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    return_dict=True,
    load_in_8bit=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Attach the saved LoRA weights on top of the base model.
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [14]:
from IPython.display import display, Markdown

def make_inference(hedis_measure, max_new_tokens=250):
    """Generate text for one HEDIS measure and render it as Markdown.

    Args:
        hedis_measure: Name of the HEDIS measure inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 250, the value that used to be hard-coded).

    Returns:
        None. The decoded sequence (prompt followed by the continuation) is
        displayed in the notebook.
    """
    # NOTE(review): this prompt does not follow the template built by
    # generate_prompt() that the training examples were formatted with
    # ("### INSTRUCTION ... ### Hedis Measure: ... ### SMS:"). Consider aligning
    # the two so the adapter sees the format it was tuned on.
    prompt = f"### Below is the Hedis Measure of a customer. Please generate three questions for the customer.\n\n### Hedis Measure:\n{hedis_measure}\n"
    batch = tokenizer(prompt, return_tensors='pt')
    # Send the inputs to the model's own device rather than assuming a bare 'cuda'.
    batch = batch.to(model.device)

    # torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast()
    # and matches the context manager used around trainer.train().
    with torch.autocast("cuda"):
        output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)
    display(Markdown(tokenizer.decode(output_tokens[0], skip_special_tokens=True)))

In [15]:
# Release unused cached GPU memory before generating.
torch.cuda.empty_cache()
# Alternative measure to try: "Cardiac Rehabilitation"
hedis_measure = "Controlling High Blood Pressure"
make_inference(hedis_measure)



### Below is the Hedis Measure of a customer. Please generate three questions for the customer.

### Hedis Measure:
Controlling High Blood Pressure

* Goal: To measure the percentage of patients with high blood pressure (hypertension) who have a blood pressure reading of less than 140/90 mmHg.
* Measure: The percentage of patients with high blood pressure who have a blood pressure reading of less than 140/90 mmHg during the measurement period.
* Data Sources: Electronic Health Record (EHR), claims data, and patient surveys.
* Calculation: (Number of patients with blood pressure reading of less than 140/90 mmHg / Total number of patients with high blood pressure) x 100%.

Please generate three questions for the customer based on the Hedis Measure of Controlling High Blood Pressure.

1. How often do you measure your blood pressure at home?
2. Have you ever had a blood pressure reading of less than 140/90 mmHg?
3. How confident are you in managing your high blood pressure through lifestyle changes and/or medication?