In [2]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2024.12.4-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2024.11.8 (from unsloth)
  Downloading unsloth_zoo-2024.12.1-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.2-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloadi

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-7b-bnb-4bit", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [5]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.", # instruction
        "List types of cancer", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs)

['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.\n\n### Input:\nList types of cancer\n\n### Response:\nThe types of cancer are:\n\n* [Leukemia](https://en.wikipedia.org/wiki/Leukemia)\n* [Lymphoma](https://en.wikipedia.org/wiki/Lymphoma)\n* [Multiple myeloma](https://en.wikipedia.org/wiki/Multiple_myeloma)\n* [Myelodysplastic syndromes](https://en.wikipedia.org/wiki/Myelodysplastic_syndromes)\n* [Myeloproliferative neoplasms](https://en.wikipedia.org/wiki/Myeloproliferative_neoplasms)\n* [Myeloma](https://en.wikipedia.org/wiki/Myeloma)\n* [Myeloproliferative neoplasms](https://en.wikipedia.org/wiki/Myeloproliferat

FINE TUNING

FINE TUNING 2

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Define the Alpaca-style prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Define EOS token
EOS_TOKEN = tokenizer.eos_token

In [8]:
# Formatting function to structure the input and output
def formatting_prompts_func(examples):
    questions = examples["Question"]
    answers = examples["Answer"]
    texts = []
    for question, answer in zip(questions, answers):
        # Construct the Alpaca-style prompt
        text = alpaca_prompt.format("You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice. Answer the following medical question:", question, answer) + EOS_TOKEN
        texts.append(text)
    return { "text": texts }

In [9]:
# Load the dataset
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")

# Preprocess the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)

# Initialize the tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-2-7b-bnb-4bit")

README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [10]:
# Set training parameters
max_seq_length = 512  # or another appropriate value based on the model's capabilities

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/16407 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [11]:
# Start training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,407 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 39,976,960
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.3722
2,2.9205
3,2.3686
4,2.7478
5,2.3077
6,2.9258
7,2.5956
8,3.0125
9,2.8853
10,2.6477


In [13]:
model.save_pretrained("outputs")  # replace with your desired output directory
tokenizer.save_pretrained("outputs")

('outputs/tokenizer_config.json',
 'outputs/special_tokens_map.json',
 'outputs/tokenizer.model',
 'outputs/added_tokens.json',
 'outputs/tokenizer.json')

In [19]:
import shutil
save_directory = "outputs"
# Create a zip file of the saved model directory
shutil.make_archive("outputs", 'zip', save_directory)

# Download the zip file to your local machine
from google.colab import files
files.download("outputs.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Inference

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# from FastLanguageModel import FastLanguageModel  # If this is the right import for your setup

# Load your fine-tuned model and tokenizer from the saved directory
model_name = "outputs"  # Replace with your model's path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

In [4]:
# Define the prompt template for medical question-answering
medquad_prompt = """Below is an instruction that describes a medical question, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.

### Input:
{}

### Response:
{}"""
import torch
# Function to perform inference for the fine-tuned model
def generate_medical_answer(question: str, max_length: int = 256):
    # Format the input for the model
    formatted_input = medquad_prompt.format(question, "")  # The output is left empty for generation

    # Tokenize the input question
    inputs = tokenizer([formatted_input], return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")

    # Perform inference to generate the answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,  # Limit the length of the generated answer
            use_cache=True,
            num_beams=5,  # Optional, for beam search (higher value gives better quality but slower)
            top_p=0.95,   # Nucleus sampling
            temperature=0.7,  # Control randomness of the output
            pad_token_id=tokenizer.eos_token_id,  # Padding token
        )

    # Decode the output tokens to get the generated answer
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return answer.strip()

# Example usage
question = "What is the treatment for type 2 diabetes?"
answer = generate_medical_answer(question)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What is the treatment for type 2 diabetes?
Answer: Below is an instruction that describes a medical question, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.

### Input:
What is the treatment for type 2 diabetes?

### Response:
Treatment for type 2 diabetes is a combination of diet, exercise, and medication. The goal of treatment is to keep your blood glucose, also called blood sugar, levels as close to normal as possible. You and your health care team will work together to create a treatment plan that is right for you. Your treatment plan may change over time as your health care team learns more about your condition.
                
### Response Instructio

In [None]:
# Define the prompt template for medical question-answering
medquad_prompt = """Below is an instruction that describes a medical question, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. 
Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.

### Input:
{}

### Response:
{}"""
import torch
# Function to perform inference for the fine-tuned model
def generate_medical_answer(question: str, max_length: int = 256):
    # Format the input for the model
    formatted_input = medquad_prompt.format(question, "")  # The output is left empty for generation

    # Tokenize the input question
    inputs = tokenizer([formatted_input], return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")

    # Perform inference to generate the answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,  # Limit the length of the generated answer
            use_cache=True,
            num_beams=5,  # Optional, for beam search (higher value gives better quality but slower)
            top_p=0.95,   # Nucleus sampling
            temperature=0.7,  # Control randomness of the output
            pad_token_id=tokenizer.eos_token_id,  # Padding token
        )

    # Decode the output tokens to get the generated answer
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return answer.strip()

# Example usage
question = "List types of cancer?"
answer = generate_medical_answer(question)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: List types of cancer?
Answer: Below is an instruction that describes a medical question, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.

### Input:
List types of cancer?

### Response:
Cancer is a disease in which cells in the body grow out of control. Cancer cells can invade nearby tissue and can spread to other parts of the body through the lymph system or bloodstream. There are more than 100 different types of cancer, including breast cancer, skin cancer, lung cancer, colon cancer, and prostate cancer.


In [10]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
import evaluate
# Load the MedQuad dataset and take only the first 100 samples from the test split
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train[:50]")  # Load only first 100 samples

# Initialize the evaluation metrics
metric_em = evaluate.load("exact_match")
metric_bleu = evaluate.load("bleu")

# Function to generate answers from the model
def generate_answer(question: str, max_length: int = 256):
    # Format the input for the model
    formatted_input = f"""Below is an instruction that describes a medical question, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI-powered medical assistant trained to provide reliable, evidence-based health information. Your task is to assist users by answering questions related to common medical conditions, symptoms, treatments, and general health advice.

### Input:
{question}

### Response:
"""

    # Tokenize the input question
    inputs = tokenizer([formatted_input], return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")

    # Generate the answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            use_cache=True,
            num_beams=1,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the output tokens to get the generated answer
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return answer.strip()

# Evaluate the model on the first 100 samples
total_em = 0
total_bleu = 0
num_samples = 0

for example in dataset:
    question = example["Question"]
    ground_truth = example["Answer"]

    # Generate model's answer
    generated_answer = generate_answer(question)

    # Compute Exact Match (EM)
    em_score = metric_em.compute(predictions=[generated_answer], references=[ground_truth])
    total_em += em_score["exact_match"]

    # Compute BLEU score
    bleu_score = metric_bleu.compute(predictions=[generated_answer], references=[[ground_truth]])
    total_bleu += bleu_score["bleu"]

    num_samples += 1

# Calculate average metrics
avg_em = total_em / num_samples
avg_bleu = total_bleu / num_samples

print(f"Exact Match (EM) Score (for 100 samples): {avg_em:.4f}")
print(f"BLEU Score (for 100 samples): {avg_bleu:.4f}")

Exact Match (EM) Score (for 100 samples): 0.0000
BLEU Score (for 100 samples): 0.0125
