## Data Preprocessing

In [1]:
# @title
!pip install -U bitsandbytes



In [2]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="data_set/medical_prescriptions.jsonl")
print(dataset["train"][0])

{'instruction': 'Simplify prescription', 'input': 'Tab. Amoxicillin 500mg BD x 7 days', 'output': 'Take one 500mg Amoxicillin tablet twice daily for 7 days.'}


Since this is manually inputted data, there might be chances of misformatting

In [3]:
import re
import json

clean_data = []
with open("data_set/medical_prescriptions.jsonl", "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        # Remove trailing commas before closing }
        line = re.sub(r",\s*}$", "}", line)
        # Replace non-UTF8 characters
        line = line.encode('utf-8', 'replace').decode('utf-8')
        try:
            clean_data.append(json.loads(line))
        except json.JSONDecodeError:
            print(f"Skipping invalid line: {line}")
            continue

In [4]:
import pandas as pd

df = pd.DataFrame(clean_data)
print("Total valid lines:", len(df))
print(df.head())

Total valid lines: 101
             instruction                                       input  \
0  Simplify prescription          Tab. Amoxicillin 500mg BD x 7 days   
1  Simplify prescription  Cap. Vitamin D3 60,000 IU weekly x 8 weeks   
2  Simplify prescription     Tab. Paracetamol 500mg 1-0-1 after food   
3  Simplify prescription                   Inj. Ceftriaxone 1g IV BD   
4  Simplify prescription         Omeprazole 20mg OD before breakfast   

                                              output  
0  Take one 500mg Amoxicillin tablet twice daily ...  
1  Take one Vitamin D3 60,000 IU capsule once a w...  
2  Take one 500mg Paracetamol tablet in the morni...  
3  Take one 1g Ceftriaxone injection intravenousl...  
4  Take one 20mg Omeprazole tablet once daily bef...  


In [5]:
print(df["instruction"].nunique(), "unique instructions")
print(df["output"].value_counts())

2 unique instructions
output
I'm not able to provide diagnosis. Please consult your doctor.                          4
I'm not able to recommend medicines. Please consult your doctor.                        3
Take one 500mg Paracetamol tablet in the morning and one in the evening after meals.    2
Take one Vitamin D3 60,000 IU capsule once a week for 8 weeks.                          2
Take one 20mg Omeprazole tablet once daily before breakfast.                            2
                                                                                       ..
I'm not able to provide treatments. Please consult your doctor.                         1
I'm not able to confirm medical conditions. Please consult your doctor.                 1
I'm not able to diagnose illnesses. Please consult your doctor.                         1
I'm not able to recommend treatments. Please consult your doctor.                       1
I'm not able to provide cures. Please consult your doctor.             

In [6]:
# Check category balance (consult vs explain prescription)
df["category"] = df["output"].apply(
    lambda x: "consult" if "consult your doctor" in x.lower() else "prescription"
)
print(df["category"].value_counts())

# Word count distribution
df["instruction_len"] = df["instruction"].apply(lambda x: len(x.split()))
df["output_len"] = df["output"].apply(lambda x: len(x.split()))

print(df[["instruction_len", "output_len"]].describe())

category
prescription    76
consult         25
Name: count, dtype: int64
       instruction_len  output_len
count            101.0  101.000000
mean               2.0    8.910891
std                0.0    1.721040
min                2.0    6.000000
25%                2.0    7.000000
50%                2.0    9.000000
75%                2.0   10.000000
max                2.0   15.000000


In [7]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb
import torch

# Load dataset
dataset = load_dataset("json", data_files={"train": "data_set/medical_prescriptions.jsonl"})

# Load model & tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Load model in 4-bit (QLoRA compatible)
# Define the 4-bit configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Apply LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Preprocess dataset
def tokenize(example):
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    tokens = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length"  # <-- ensures all sequences are same length
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized = dataset.map(tokenize, batched=False)

# Use dynamic padding collator
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)

# Training setup
training_args = TrainingArguments(
    output_dir="./finetuned-llm",
    per_device_train_batch_size=1, # Reduced batch size
    gradient_accumulation_steps=8, # Increased gradient accumulation
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    save_steps=50,
    logging_steps=10,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator,  # <-- important!
)

# Start training
trainer.train()

# Save final adapter
model.save_pretrained("./finetuned-llm")
tokenizer.save_pretrained("./finetuned-llm")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Step,Training Loss
10,4.4082
20,0.1007
30,0.0616


('./finetuned-llm/tokenizer_config.json',
 './finetuned-llm/special_tokens_map.json',
 './finetuned-llm/chat_template.jinja',
 './finetuned-llm/tokenizer.model',
 './finetuned-llm/added_tokens.json')

Validation

In [8]:
from datasets import load_dataset

val_dataset = load_dataset("json", data_files={"val": "data_set/medical_prescriptions_val.jsonl"})


Generating val split: 0 examples [00:00, ? examples/s]

In [9]:
def tokenize(example):
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    tokens = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_val = val_dataset.map(tokenize, batched=False)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [11]:
from transformers import Trainer, TrainingArguments

# Minimal TrainingArguments for evaluation
eval_args = TrainingArguments(
    output_dir="./finetuned-llm",
    per_device_eval_batch_size=1,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_val["val"],
    tokenizer=tokenizer
)

# Evaluate
eval_results = trainer.evaluate()
print("Validation results:", eval_results)


  trainer = Trainer(


Validation results: {'eval_loss': 0.16737861931324005, 'eval_model_preparation_time': 0.0042, 'eval_runtime': 1.2682, 'eval_samples_per_second': 1.577, 'eval_steps_per_second': 1.577}


In [12]:
from transformers import pipeline

text_gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    device_map="auto"
)

for example in val_dataset["val"]:
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    output = text_gen(prompt, do_sample=True, temperature=0.7)[0]["generated_text"]
    generated_response = output.split("### Response:")[-1].strip()

    print("Instruction:", example['instruction'])
    print("Expected:", example['output'])
    print("Generated:", generated_response)
    print("-----")


Device set to use cuda:0


Instruction: What is the recommended dosage of Drug X?
Expected: Take 5mg twice daily for 7 days.
Generated: The recommended dosage of Drug X is 500mg twice daily.
-----
Instruction: How to manage fever for adult patients?
Expected: Paracetamol 500mg every 6 hours, monitor temperature.
Generated: For adult patients with fever, you can advise the following self-care measures:
1. Drink plenty of fluids to stay hydrated.
2. Take over-the-counter pain relievers, such as acetaminophen or ibuprofen, to help reduce fever and alleviate discomfort.
3. Rest in a cool environment.
4. Keep warm with blankets if shivering.
5. Avoid alcohol and caffeine as they can dehydrate you.
6. Consider using a cool compress on the forehead or a cool bath to help reduce temperature.
7. If symptoms persist for more than three days or are severe, consult a healthcare professional.
-----


In [16]:
!pip install -U evaluate rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=86256579ad70f7ea876058948aa46e8702b69205e8ca290e3e3ef4e334436a4c
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
import evaluate

rouge = evaluate.load("rouge")

references = [ex["output"] for ex in val_dataset["val"]]
predictions = [
    text_gen(f"### Instruction:\n{ex['instruction']}\n\n### Response:\n",
             max_new_tokens=200)[0]["generated_text"].split("### Response:")[-1].strip()
    for ex in val_dataset["val"]
]

results = rouge.compute(predictions=predictions, references=references)
print("ROUGE scores:", results)


Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE scores: {'rouge1': np.float64(0.15468409586056645), 'rouge2': np.float64(0.06666666666666667), 'rougeL': np.float64(0.15468409586056645), 'rougeLsum': np.float64(0.15468409586056645)}
