### LLaMA Supervised Fine-Tuning

This notebook takes GPT-4o's answers on the Kabatubare Medical Dataset and fine-tunes the LLaMA model on those answers.

The purpose of this exercise is to test whether fine-tuning LLaMA can distill the knowledge of GPT-4o and improve performance on open-ended question answering over a healthcare dataset.

In [None]:
import os

In [None]:
# Restrict this process to GPU 0. This cell runs before the torch/unsloth
# imports so the setting is already in the environment when CUDA initializes.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import pandas as pd
import json
import torch
import pickle
from unsloth import FastLanguageModel
from datasets import Dataset
from tqdm  import tqdm

#### Reading the Question and Answer Pairs from Test Dataset Phase 2

In [None]:
# Parallel lists, one entry per test example, filled from the JSONL file below.
ques_list = []
ans_list = []
llama_resp_list = []
gpt_resp_list = []

# Each non-empty line is one JSON object carrying the keys 'question',
# 'answer', 'llama_response_base' and 'gpt_response_base'.
# The file is read as bytes; json.loads accepts bytes and detects the encoding.
with open('phase2_data_kabatubare/test_kabatubare.jsonl', 'rb') as file:
    for line in file:
        # Tolerate blank / whitespace-only lines (e.g. a trailing empty line),
        # which would otherwise make json.loads raise JSONDecodeError.
        if not line.strip():
            continue
        json_object = json.loads(line)
        ques_list.append(json_object['question'])
        ans_list.append(json_object['answer'])
        llama_resp_list.append(json_object['llama_response_base'])
        gpt_resp_list.append(json_object['gpt_response_base'])

In [None]:
# Combine the parallel lists into a single table: one row per test question,
# one column per field read from the JSONL file.
test_data = pd.DataFrame(
    dict(
        question=ques_list,
        answer=ans_list,
        llama_response_base=llama_resp_list,
        gpt_response_base=gpt_resp_list,
    )
)
test_data

### Inference

In [None]:
# Checkpoint directory handed to FastLanguageModel.from_pretrained below.
# This run uses the LoRA (PEFT) adapter checkpoint; the alternative
# full fine-tuning checkpoint is kept here for reference:
#   full_model_path = "./llama32-sft-full-kabatubare"
peft_model_path = "./llama32-sft-peft-kabatubare"

In [None]:
# Load the fine-tuned checkpoint and its tokenizer through Unsloth.
# Both quantization flags are off, so no 4-bit / 8-bit quantization is requested.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=peft_model_path,
    max_seq_length=2048,
    load_in_4bit=False,     # 4-bit quantization disabled
    load_in_8bit=False,     # 8-bit quantization disabled
    full_finetuning=False,  # full fine-tuning mode disabled
    dtype=None,             # None = auto-detect (torch.bfloat16 or torch.float16)
    device_map="auto",
)

Inference is run sample by sample. (Batch inference does not work well with the fine-tuned model adapters: it produces degenerate responses such as `P P P P`.)

In [None]:
def get_llama_response_ft(question_input: str, max_new_tokens: int = 2048) -> str:
    """Generate the fine-tuned model's answer to a single question.

    Builds a two-message chat (fixed system prompt + the user's question),
    renders it with the tokenizer's chat template, runs ``model.generate``
    and returns only the newly generated assistant text.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Args:
        question_input: The question to send as the user message.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 2048, the value originally hard-coded here).

    Returns:
        The decoded assistant response, with special tokens removed and
        without the prompt text.
    """
    llama_input = [
        {"role": "system", "content": "You are a medical knowledge assistant trained to provide information and guidance on various health-related topics."},
        {"role": "user", "content": question_input},
    ]

    prompt = tokenizer.apply_chat_template(llama_input, tokenize=False, add_generation_prompt=True)

    # NOTE(review): the chat template may already emit a BOS token, in which case
    # tokenizing the rendered prompt adds a second one. Hugging Face recommends
    # add_special_tokens=False here -- confirm against the training setup before changing.
    inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )

    # The generated sequence starts with the prompt tokens. Drop them by token
    # count before decoding, instead of slicing the decoded string by the
    # decoded prompt's character length, which silently truncates or leaks
    # text whenever the two decodes do not line up exactly.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [None]:
# Implementing the Unsloth Fast Inference
FastLanguageModel.for_inference(model)

llama_responses_ft = []
for index, row in tqdm(test_data.iterrows(), total=len(test_data)):
    question_input = row['question']
    llama_resp = get_llama_response_ft(question_input)
    llama_responses_ft.append(llama_resp)

with open('phase2_kabatubare_medical/llama_responses_ft.pkl', 'wb') as file:
    pickle.dump(llama_responses_ft, file)

In [None]:
# Reload the saved fine-tuned responses from disk.
# pickle.load can execute arbitrary code, so only use it on files you trust
# (such as the one written by the inference cell of this notebook).
with open('phase2_kabatubare_medical/llama_responses_ft.pkl', 'rb') as saved_responses:
    llama_responses_ft = pickle.load(saved_responses)

### Saving the LLaMA Fine-Tuned Responses into the complete dataframe

In [None]:
# Attach the fine-tuned responses as a new column; the list must hold exactly
# one entry per row of test_data, in row order.
test_data.loc[:, 'llama_responses_ft'] = llama_responses_ft
test_data