In [None]:
from google.colab import files

# Opens the Colab upload dialog and keeps whatever files.upload() returns in `uploaded`.
# Choose DiseaseAndSymptoms.csv and Disease precaution.csv.
# NOTE(review): only DiseaseAndSymptoms.csv is read by the cells visible here —
# confirm whether "Disease precaution.csv" is actually needed.
uploaded = files.upload()


In [None]:
import os

# Print the contents of the current working directory (presumably to confirm
# that the uploaded CSV files are present).
print(os.listdir())


In [None]:
import pandas as pd

# Load the symptom table from the working directory. Later cells read the
# "Disease" column and every column whose name starts with "Symptom_".
df = pd.read_csv("DiseaseAndSymptoms.csv")
# Bare last expression so the notebook renders a preview of the first rows.
df.head()


In [None]:
# Keep, in their original column order, the names of the columns that hold
# symptom values (identified by the "Symptom_" name prefix).
symptom_cols = list(
    filter(lambda column_name: column_name.startswith("Symptom_"), df.columns)
)
print(symptom_cols)


In [None]:
import json

def _build_entry(row, symptom_columns):
    """Turn one row of the symptom table into an instruction-tuning record.

    Args:
        row: A row of the DataFrame (as yielded by ``df.iterrows()``); must
            provide a "Disease" value and the given symptom columns.
        symptom_columns: Names of the columns that hold symptom values.

    Returns:
        A dict with "instruction", "input" (comma-separated symptoms) and
        "output" (disease line, explanation line and disclaimer) keys.
    """
    symptoms = []
    for col in symptom_columns:
        value = row[col]
        if pd.isna(value):
            continue
        # Underscores become spaces and surrounding whitespace is dropped.
        cleaned = str(value).replace("_", " ").strip()
        # A cell that is only whitespace would otherwise add an empty item.
        if cleaned:
            symptoms.append(cleaned)

    # Strip the label too, so stray whitespace in the CSV does not end up
    # inside the "Disease: ..." target line the model is trained on.
    disease = str(row["Disease"]).strip()

    return {
        "instruction": "Identify the disease pattern based on symptoms.",
        "input": ", ".join(symptoms),
        "output": (
            f"Disease: {disease}\n"
            f"Explanation: These symptoms frequently match {disease} patterns in the dataset.\n"
            "Note: This is not medical or diagnostic advice. Consult a real doctor for health concerns."
        ),
    }


# One training record per row of the table.
data = [_build_entry(row, symptom_cols) for _, row in df.iterrows()]

# Guarded so an empty table reports zero entries instead of raising IndexError.
if data:
    print("Example training entry:")
    print(json.dumps(data[0], indent=2))
print("Total entries created:", len(data))


In [None]:
import random

# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42

# Shuffle a copy instead of `data` itself: re-running this cell then always
# starts from the same order and therefore reproduces the same split.
shuffled = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled)

# 80% for training, 20% for testing
split_idx = int(0.8 * len(shuffled))
train = shuffled[:split_idx]
test = shuffled[split_idx:]

# NOTE(review): if the CSV contains duplicate rows, identical examples can end
# up in both splits and inflate test results — verify and deduplicate if so.
print("Train size:", len(train))
print("Test size:", len(test))


In [None]:
import json

# Write each split as JSON Lines: one JSON-encoded record per line,
# training split first, then the test split.
for jsonl_path, records in (("train.jsonl", train), ("test.jsonl", test)):
    with open(jsonl_path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

print("Saved files: train.jsonl and test.jsonl")


In [None]:
import os

# Print the working directory contents (presumably to confirm that
# train.jsonl and test.jsonl were written by the previous cell).
print(os.listdir())


In [None]:
!pip install -q transformers datasets peft bitsandbytes accelerate


In [None]:
from datasets import load_dataset

# Each JSONL file is loaded separately; in both cases the records are read
# from the "train" key of the object that load_dataset returns.
train_ds, test_ds = (
    load_dataset("json", data_files=jsonl_path)["train"]
    for jsonl_path in ("train.jsonl", "test.jsonl")
)

print(train_ds[0])
print("Train size:", len(train_ds))
print("Test size:", len(test_ds))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # public, no token required

# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization enabled and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenization cell pads every example, so a pad token must exist;
# fall back to the EOS token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM with the quantization config above; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Model loaded successfully.")


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# The base model was loaded with 4-bit quantization. PEFT's documented QLoRA
# recipe runs this helper on a quantized model before adapters are attached,
# so the model is set up for stable k-bit training.
model = prepare_model_for_kbit_training(model)

# LoRA hyperparameters: rank 16, alpha 32, 5% dropout, no bias training.
# No target_modules are given, so PEFT's default for this architecture applies.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# NOTE: re-running this cell wraps the already-wrapped model again; reload
# the base model first if the cell has to be executed a second time.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [None]:
from datasets import load_dataset

# NOTE(review): these two calls repeat, with the same arguments, the dataset
# loading done in an earlier cell; this cell looks redundant — confirm before removing.
train_ds = load_dataset("json", data_files="train.jsonl")["train"]
test_ds = load_dataset("json", data_files="test.jsonl")["train"]

# Report the split sizes and show the first training record.
print("Train size:", len(train_ds))
print("Test size:", len(test_ds))
print(train_ds[0])


In [None]:
def format_example(example):
    """Render one record as the single text string used for fine-tuning.

    Args:
        example: Mapping with "instruction", "input" and "output" entries.

    Returns:
        A dict with one key, "text", holding three newline-separated
        sections prefixed "Instruction: ", "Input: " and "Output: ".
    """
    sections = (
        ("Instruction", example["instruction"]),
        ("Input", example["input"]),
        ("Output", example["output"]),
    )
    rendered = "\n".join(f"{label}: {value}" for label, value in sections)
    return {"text": rendered}

# Apply format_example to every record of both splits; each record gains the
# "text" entry that format_example returns.
formatted_train = train_ds.map(format_example)
formatted_test = test_ds.map(format_example)

# Show one rendered training text as a sanity check.
print(formatted_train[0]["text"])


In [None]:
def tokenize_function(example):
    """Tokenize the "text" entry with the notebook-level `tokenizer`.

    Truncation is enabled, max_length is 512 and padding is "max_length".

    Args:
        example: Mapping with a "text" entry (the mapping cell calls this
            with batched=True).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    return tokenizer(example["text"], **tokenizer_options)

# Tokenize both splits, passing records to tokenize_function in batches.
tokenized_train = formatted_train.map(tokenize_function, batched=True)
tokenized_test = formatted_test.map(tokenize_function, batched=True)

# Remove the original text + fields we don't need for training
tokenized_train = tokenized_train.remove_columns(["instruction", "input", "output", "text"])
tokenized_test = tokenized_test.remove_columns(["instruction", "input", "output", "text"])

# NOTE(review): `torch` is imported here but not referenced in this cell.
import torch
# Request the "torch" output format; the collator defined later stacks the
# per-example "input_ids" / "attention_mask" values with torch.stack.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# Peek at the first 20 token ids of the first training example.
print(tokenized_train[0]["input_ids"][:20])


In [None]:
from transformers import TrainingArguments, Trainer
import torch   # needed in this cell for the CUDA check and inside data_collator

# Trainer hyperparameters: outputs go under ./disease-llm-qlora, training runs
# for 2 epochs, and evaluation and saving are both configured per epoch.
training_args = TrainingArguments(
    output_dir="./disease-llm-qlora",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # gradients accumulated over 4 batches of 2
    logging_steps=20,

    eval_strategy="epoch",
    save_strategy="epoch",

    fp16=torch.cuda.is_available(),  # fp16 only when a CUDA device is available
    report_to="none",
)

def data_collator(features):
    """Stack per-example tensors into a causal-LM training batch.

    Args:
        features: Sequence of mappings, each with equally sized 1-D
            "input_ids" and "attention_mask" tensors.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels" tensors.
        "labels" equals "input_ids", except that positions where the
        attention mask is 0 (padding) are set to -100.
    """
    input_ids = torch.stack([f["input_ids"] for f in features])
    attention_mask = torch.stack([f["attention_mask"] for f in features])

    labels = input_ids.clone()
    # Every example is padded to a fixed length, so most positions are padding.
    # -100 is the default ignore index of PyTorch's cross-entropy loss; without
    # it the loss is dominated by predicting pad tokens.
    # NOTE(review): if the pad token is the EOS token, padding no longer teaches
    # the model to stop; unless the tokenizer already appends EOS, add it to the
    # training text if stopping behaviour matters.
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Wire the model, the training arguments, both tokenized splits and the
# custom collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# Run fine-tuning with the configuration above.
trainer.train()


In [None]:
# Directory (relative to the working directory) that receives the saved files.
save_path = "disease_llm_adapter"

# `model` is the object returned by get_peft_model in an earlier cell; the
# tokenizer is saved to the same directory.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Saved fine-tuned adapter to:", save_path)


In [None]:
import torch

def generate_disease_response(symptom_text, max_new_tokens=80):
    """Generate the model's answer for a comma-separated symptom string.

    The prompt uses the same "Instruction / Input / Output" layout as the
    training text and ends right after "Output:".

    Args:
        symptom_text: Symptoms to place on the "Input:" line.
        max_new_tokens: Upper bound on generated tokens, passed to generate().

    Returns:
        The first generated sequence decoded with special tokens skipped
        (presumably the prompt followed by the continuation — the evaluation
        cell searches this text for "Disease:").
    """
    instruction = "Identify the disease pattern based on symptoms."
    prompt_lines = (
        f"Instruction: {instruction}",
        f"Input: {symptom_text}",
        "Output:",
    )
    prompt = "\n".join(prompt_lines)

    # Tokenize and move the tensors to the device the model lives on.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # No gradients are needed for generation.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
import torch

model.eval()  # set model to evaluation mode

# Number of test examples to score. Capped at the test-set size so a small
# dataset cannot make `select` fail on out-of-range indices.
EVAL_SAMPLE_SIZE = 30
small_eval = test_ds.select(range(min(EVAL_SAMPLE_SIZE, len(test_ds))))


def _extract_disease(text, default="Unknown"):
    """Return the text after the first "Disease:" marker, up to end of line.

    Args:
        text: Generated text to search.
        default: Value returned when the marker is missing or nothing follows it.

    Returns:
        The stripped disease name, or `default`.
    """
    if "Disease:" not in text:
        return default
    # The membership check above guarantees index 1 exists, so no try/except
    # is needed around this expression.
    return text.split("Disease:")[1].split("\n")[0].strip() or default


true_labels = []
pred_labels = []

for i, example in enumerate(small_eval):
    # ----- True disease from ground-truth -----
    gt_line = example["output"].split("\n")[0]   # first line: "Disease: XYZ"
    true_labels.append(gt_line.replace("Disease:", "").strip())

    # ----- Model prediction -----
    gen_text = generate_disease_response(example["input"], max_new_tokens=40)
    pred_labels.append(_extract_disease(gen_text))

    # Progress print so you know it's moving
    if (i + 1) % 5 == 0 or (i + 1) == len(small_eval):
        print(f"Processed {i+1}/{len(small_eval)} examples")

print("Done. Collected labels for:", len(true_labels), "examples.")


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Index the matrix by every label that occurs, predicted as well as actual.
# With only the actual labels, predictions outside that set (including the
# "Unknown" fallback) are left out of the matrix and errors become invisible.
classes = sorted(set(true_labels) | set(pred_labels))

cm = confusion_matrix(true_labels, pred_labels, labels=classes)

plt.figure(figsize=(10, 8))
im = plt.imshow(cm, interpolation="nearest")
plt.title("Disease Prediction Confusion Matrix")
plt.colorbar(im)

# One tick per class on both axes; x labels are rotated to stay readable.
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=90)
plt.yticks(tick_marks, classes)

plt.xlabel("Predicted Disease")
plt.ylabel("Actual Disease")
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("confusion_matrix.png", dpi=200)
plt.show()

print("Saved confusion_matrix.png")


In [None]:
# Free-text symptom list for a one-off demo query.
demo_symptoms = "Fever, headache, body pain"

# Later cells write `demo_output` to sample_output_demo_query.txt.
demo_output = generate_disease_response(demo_symptoms, max_new_tokens=80)
print(demo_output)


In [None]:
# Persist the demo query and the model's reply. The symptom line is taken from
# `demo_symptoms` so the file cannot drift from the query that was actually
# run, and UTF-8 is set explicitly because generated text may contain
# characters that the platform default encoding cannot represent.
with open("sample_output_demo_query.txt", "w", encoding="utf-8") as f:
    f.write(f"Symptoms: {demo_symptoms}\n\n")
    f.write("Model response:\n")
    f.write(demo_output)

print("Saved: sample_output_demo_query.txt")


In [None]:
# Hand-written symptom lists for a quick qualitative check of the model.
test_cases = [
    "fever, chills, cough, sore throat",                    # maybe flu / viral
    "abdominal pain, diarrhea, vomiting, loss of appetite", # gastro/intestinal
    "joint pain, swelling, stiffness, fatigue",             # arthritis/rheumatic
]

divider = "-" * 60
case_number = 0
for symptoms in test_cases:
    case_number += 1
    response = generate_disease_response(symptoms, max_new_tokens=80)
    print(f"\n=== Test case {case_number} ===")
    print(f"Symptoms: {symptoms}")
    print(f"Model response:\n {response}")
    print(divider)


In [None]:
# NOTE(review): an earlier cell writes this same file; this cell rewrites it,
# so it is probably redundant — confirm before removing.
# The symptom line is taken from `demo_symptoms` so the file cannot drift from
# the query that was actually run, and UTF-8 is set explicitly because
# generated text may contain characters the platform default cannot represent.
with open("sample_output_demo_query.txt", "w", encoding="utf-8") as f:
    f.write(f"Symptoms: {demo_symptoms}\n\n")
    f.write("Model response:\n")
    f.write(demo_output)

print("Saved: sample_output_demo_query.txt")


In [None]:
import os
# Print the working directory contents (presumably to check which output
# files exist before packaging them).
print(os.listdir())


In [None]:
import shutil

# Pure-Python replacement for `zip -r disease_llm_adapter.zip disease_llm_adapter`.
# The archive keeps the "disease_llm_adapter/" directory prefix, and it is
# rebuilt from scratch on each run, so stale entries from an earlier archive
# cannot survive.
archive_path = shutil.make_archive(
    "disease_llm_adapter", "zip", root_dir=".", base_dir="disease_llm_adapter"
)
print("Created:", archive_path)


In [None]:
import os
# Print the working directory contents (presumably to confirm that
# disease_llm_adapter.zip was created).
print(os.listdir())


In [None]:
from google.colab import files

# Trigger a browser download for each artifact produced by this notebook,
# in the order listed.
artifact_names = (
    "train.jsonl",
    "test.jsonl",
    "confusion_matrix.png",
    "sample_output_demo_query.txt",
    "disease_llm_adapter.zip",
)
for artifact_name in artifact_names:
    files.download(artifact_name)


In [None]:
# Reminder (not code): use Edit → Clear all outputs before sharing this notebook.


In [None]:
# Reminder (not code): Clear all outputs
