In [None]:
!pip install trl

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# **READING AND PREPARING DATA**

In [None]:
import pandas as pd
# Load the raw disease/symptom table from the Colab workspace.
df=pd.read_csv('/content/DiseaseAndSymptoms.csv')
# Summarise columns, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [None]:
# Every per-slot symptom column of `df` (named Symptom_<n>).
symptom_cols = [col for col in df.columns if col.startswith("Symptom_")]


def _clean_symptom(value):
    """Return one symptom label with underscores as spaces and whitespace normalised.

    split()/join trims both ends and collapses internal runs of whitespace, so a
    padded cell such as " skin_rash" becomes "skin rash" instead of leaking extra
    spaces into the joined text.
    """
    return " ".join(str(value).replace("_", " ").split())


# Merge the symptom slots of each row into one comma-separated string.
# NaN slots are dropped first; labels that are empty after cleaning are skipped.
df["Symptoms"] = df[symptom_cols].apply(
    lambda row: ", ".join(
        label for label in map(_clean_symptom, row.dropna()) if label
    ),
    axis=1,
)
df.Symptoms

Unnamed: 0,Symptoms
0,"itching, skin rash, nodal skin eruptions, d..."
1,"skin rash, nodal skin eruptions, dischromic..."
2,"itching, nodal skin eruptions, dischromic p..."
3,"itching, skin rash, dischromic patches"
4,"itching, skin rash, nodal skin eruptions"
...,...
4915,"vomiting, headache, nausea, spinning movem..."
4916,"skin rash, pus filled pimples, blackheads, ..."
4917,"burning micturition, bladder discomfort, fo..."
4918,"skin rash, joint pain, skin peeling, silve..."


In [None]:
# Remove the per-slot symptom columns now that they are merged into "Symptoms".
# Rebinding `df` (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to run twice: a second run finds the columns already gone and is a no-op
# instead of raising KeyError.
df = df.drop(columns=symptom_cols, errors="ignore")
df

Unnamed: 0,Disease,Symptoms
0,Fungal infection,"itching, skin rash, nodal skin eruptions, d..."
1,Fungal infection,"skin rash, nodal skin eruptions, dischromic..."
2,Fungal infection,"itching, nodal skin eruptions, dischromic p..."
3,Fungal infection,"itching, skin rash, dischromic patches"
4,Fungal infection,"itching, skin rash, nodal skin eruptions"
...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,"vomiting, headache, nausea, spinning movem..."
4916,Acne,"skin rash, pus filled pimples, blackheads, ..."
4917,Urinary tract infection,"burning micturition, bladder discomfort, fo..."
4918,Psoriasis,"skin rash, joint pain, skin peeling, silve..."


In [None]:
from sklearn.model_selection import train_test_split
import json
# Shuffle with a fixed seed. An unseeded shuffle here would change which rows land in
# train/test on every run, even though the split itself uses random_state=42.
df = df.sample(frac=1, random_state=42)
# Hold out 20% of the rows for testing.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def write_jsonl(df, filename):
    """Write one instruction-tuning record per row of `df` to `filename` as JSONL.

    Each row must provide "Symptoms" and "Disease". The record carries a fixed
    instruction, the symptom text as input, and a templated answer naming the
    disease as output. The file is overwritten if it already exists.
    """

    def _to_record(entry):
        # Build the three-field record for a single dataframe row.
        disease = entry["Disease"]
        answer = (
            f"Disease: {disease}\n"
            f"Explanation: These symptoms frequently match {disease} patterns in the dataset.\n"
            "Note: This is not medical advice."
        )
        return {
            "instruction": "Identify the disease pattern based on symptoms.",
            "input": entry["Symptoms"],
            "output": answer,
        }

    with open(filename, "w") as sink:
        # One JSON document per line, each terminated by a newline.
        sink.writelines(
            json.dumps(_to_record(entry)) + "\n" for _, entry in df.iterrows()
        )

# Serialise both splits next to the notebook, one JSONL file per split.
for split_frame, target in ((train_df, "train.jsonl"), (test_df, "test.jsonl")):
    write_jsonl(split_frame, target)

In [None]:
import json

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
        # rather than handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Read back the files produced by write_jsonl. The same relative names are used as on
# the write side, so this works from any working directory, not only /content.
train_raw = load_jsonl("train.jsonl")
test_raw  = load_jsonl("test.jsonl")

def _format_prompt(record):
    """Render the instruction and symptom list as the prompt text for one record.

    Both splits go through this helper so evaluation prompts are byte-for-byte
    in the format used for training, including the blank line that ends the prompt.
    """
    return f"{record['instruction']}\nSymptoms: {record['input']}\n\n"


# ---------- BUILD TRAIN PROMPTS ----------
# Training records pair each prompt with the reference answer as the completion.
train_data = [
    {"prompt": _format_prompt(row), "completion": row["output"]}
    for row in train_raw
]

# ---------- BUILD TEST PROMPTS ----------
# Test records keep an empty completion: the model is expected to generate it.
test_data = [
    {"prompt": _format_prompt(row), "completion": ""}
    for row in test_raw
]


In [None]:
from datasets import Dataset
# Tabulate the prompt/completion records, then wrap each table as a `datasets.Dataset`.
train, test = (pd.DataFrame(records) for records in (train_data, test_data))
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test)
)



# **LOADING MODEL - MISTRAL 7B**

In [None]:
# Log in to the Hugging Face Hub from the notebook shell. Presumably needed so the
# checkpoint download below is authorised — confirm whether the model is gated.
!huggingface-cli login


In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned.
model_name = "mistralai/Mistral-7B-v0.1"

# Fetch the tokenizer that matches the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# 4-bit loading relies on the bitsandbytes and accelerate packages (and peft for the
# LoRA step later); install them first if the runtime does not already provide them.

# Quantisation recipe: 4-bit NF4 weights, bfloat16 compute, no double quantisation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model quantised onto the first GPU.
# trust_remote_code is deliberately not set: it permits executing Python shipped in
# the Hub repository and should only be enabled for checkpoints that need custom code.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # dtype for the modules that are not quantised
    device_map="cuda:0",
)

# Disable the generation KV cache while training (it is not used for the training
# forward pass and is commonly turned off together with gradient checkpointing).
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-style config field — confirm it has
# any effect for this architecture.
model.config.pretraining_tp = 1
# Trade compute for memory by recomputing activations in the backward pass.
model.gradient_checkpointing_enable()


In [None]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


'cuda'

# **MODEL TRAINING - 2 EPOCHS**

In [None]:
# Reload the tokenizer for training. trust_remote_code is not passed: it would allow
# executing code from the Hub repository and is only needed for custom tokenizers.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token (presumably the checkpoint ships without a dedicated
# pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right. The original note cited an fp16 overflow workaround, although this
# run is configured for bf16.
tokenizer.padding_side = "right"

from peft import LoraConfig
from trl import SFTTrainer, SFTConfig # Import SFTConfig


# LoRA adapter settings for the causal-LM fine-tune.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapters are attached to the four attention projections and the MLP gate.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
    r=64,               # rank of the low-rank update matrices
    lora_alpha=16,      # LoRA scaling numerator
    lora_dropout=0.1,   # dropout applied on the adapter path
    bias="none",        # leave bias terms untouched
)

# Training hyper-parameters for the supervised fine-tuning run.
sft_config = SFTConfig(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=20,            # write a checkpoint every 20 optimizer steps
    logging_steps=100,        # log training metrics every 100 steps
    learning_rate=2e-4,
    weight_decay=0.001,
    bf16=True,                # train in bfloat16 ...
    fp16=False,               # ... and not in float16
    max_grad_norm=0.3,        # gradient clipping threshold
    max_steps=-1,             # no step cap; num_train_epochs decides the length
    warmup_ratio=0.03,
    group_by_length=True,     # batch samples of similar length together
    # The plain "constant" schedule ignores warmup_ratio entirely. With
    # "constant_with_warmup" the 3% warmup requested above is applied, after
    # which the learning rate is held flat.
    lr_scheduler_type="constant_with_warmup",
)

# Assemble the TRL trainer from the quantised model, the LoRA settings, the tokenizer
# and the training split. Only train_dataset is supplied; test_dataset is not passed.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    peft_config=peft_config,
    train_dataset=train_dataset,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()


# **SAVING THE FINE TUNED MODEL**

In [None]:
# Store the trained model and the tokenizer together in ./mistral_lora.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained("mistral_lora")
# Export the held-out rows (without the index column) for later evaluation.
test_df.to_csv("test.csv", index=False)

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); presumably so the adapter
# saved in the next cell outlives the Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Destination folder on the mounted Google Drive.
save_path = "/content/drive/MyDrive/mistral_lora_adapter"

# Write the trained model and the tokenizer into the same Drive folder.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"LoRA adapter saved to: {save_path}")


LoRA adapter saved to: /content/drive/MyDrive/mistral_lora_adapter
