In [1]:
import torch
assert torch.cuda.is_available(), "⚠️ Enable GPU: Runtime → Change runtime type → GPU"
print("🚀 Using GPU:", torch.cuda.get_device_name(0))

🚀 Using GPU: Tesla T4


In [2]:
# 🛠 Install compatible libraries
!pip install --upgrade datasets "numpy<2.0"
!pip install -q transformers datasets torch torchvision torchaudio kagglehub accelerate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m815.1 kB/s[0m eta [36m0:00:00[0m
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m827.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os, glob, zipfile
import torch
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
import kagglehub

In [2]:
# ✅ Step 1: Verify GPU availability
assert torch.cuda.is_available(), (
    "⚠️ No GPU detected! "
    "Go to Runtime → Change runtime type → set Hardware accelerator to GPU"
)
device = torch.device("cuda")
print("🚀 Training on GPU:", torch.cuda.get_device_name(0))  # e.g., Tesla T4


🚀 Training on GPU: Tesla T4


In [13]:
# ✅ Step 2: Download dataset
print("📥 Downloading dataset…")
download_path = kagglehub.dataset_download("niyarrbarman/symptom2disease")
print("Download path:", download_path)

# ✅ Step 3: Locate CSV
all_files = glob.glob(os.path.join(download_path, "**/*.*"), recursive=True)
csv_file = next((f for f in all_files if f.lower().endswith(".csv")), None)
if not csv_file:
    zip_file = next((f for f in all_files if f.endswith(".zip")), None)
    if zip_file:
        with zipfile.ZipFile(zip_file, 'r') as z:
            z.extractall(download_path)
        all_files = glob.glob(os.path.join(download_path, "**/*.*"), recursive=True)
        csv_file = next((f for f in all_files if f.lower().endswith(".csv")), None)
if not csv_file:
    raise RuntimeError(f"No CSV found in {download_path}. Files: {all_files}")

print("✅ CSV found:", csv_file)
df = pd.read_csv(csv_file)
print("✅ Loaded", len(df), "rows across", df['label'].nunique(), "disease classes")
print(df.head())

# ✅ Step 4: Label mapping
# ✅ Step 4: Label mapping (unchanged)
labels = sorted(df['label'].unique())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}
df['label_id'] = df['label'].map(label2id)

# 🆕 New disease sample
new_samples = [
    {
        "text": "jar and khoki jaaro",
        "label": "kaala jar"
    }
]

for sample in new_samples:
    lbl = sample["label"]
    if lbl not in label2id:
        new_id = max(label2id.values()) + 1
        label2id[lbl] = new_id
        id2label[new_id] = lbl

    new_row_df = pd.DataFrame([{
        "text": sample["text"],
        "label": sample["label"],
        "label_id": label2id[sample["label"]]
    }])
    df = pd.concat([df, new_row_df], ignore_index=True)

print("Updated rows:", len(df), "| disease classes:", df['label'].nunique())
# 🔧 FIX: Update labels list after adding new samples
labels = sorted(df['label'].unique())  # Recalculate labels list
print("Updated rows:", len(df), "| disease classes:", len(labels))
# ✅ Step 5: create HF dataset
hf = Dataset.from_pandas(
    df[['text','label_id']].rename(columns={'label_id':'labels'}),
    preserve_index=False
)


📥 Downloading dataset…
Download path: /kaggle/input/symptom2disease
✅ CSV found: /kaggle/input/symptom2disease/Symptom2Disease.csv
✅ Loaded 1200 rows across 24 disease classes
   Unnamed: 0      label                                               text
0           0  Psoriasis  I have been experiencing a skin rash on my arm...
1           1  Psoriasis  My skin has been peeling, especially on my kne...
2           2  Psoriasis  I have been experiencing joint pain in my fing...
3           3  Psoriasis  There is a silver like dusting on my skin, esp...
4           4  Psoriasis  My nails have small dents or pits in them, and...
Updated rows: 1201 | disease classes: 25
Updated rows: 1201 | disease classes: 25


In [14]:

# ✅ Step 6: Tokenization
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_fn(ex): return tokenizer(ex["text"], truncation=True, padding="max_length", max_length=512)
hf = hf.map(tokenize_fn, batched=True)
hf.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# ✅ Step 7: Load model onto GPU
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
)
model.to(device)

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# ✅ Step 8: Setup Trainer with GPU
assert torch.cuda.is_available(), (
    "⚠️ No GPU detected! "
    "Go to Runtime → Change runtime type → set Hardware accelerator to GPU"
)
device = torch.device("cuda")
print("🚀 Training on GPU:", torch.cuda.get_device_name(0))  # e.g., Tesla T4


🚀 Training on GPU: Tesla T4


In [28]:
training_args = TrainingArguments(
    output_dir="./pubmedbert-s2d",
    num_train_epochs=9,
    per_device_train_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="no", # Changed from evaluation_strateg
    save_strategy="epoch",
    # evaluation_strategy="epoch",
    # load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Mixed precision
    seed=42
)

In [29]:
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=hf,
    data_collator=DataCollatorWithPadding(tokenizer),
    tokenizer=tokenizer,
)

# ✅ Step 9: Train & evaluate
print("🏋️ Training started…")
trainer.train()
print("✅ Training completed")
# print("📊 Evaluation metrics:", trainer.evaluate())

  trainer = Trainer(


🏋️ Training started…


Step,Training Loss
10,0.0452
20,0.0116
30,0.0103
40,0.0092
50,0.0489
60,0.0075
70,0.0067
80,0.014
90,0.2149
100,0.0696


✅ Training completed


In [30]:

# ✅ Step 10: Inference helper
def predict(text:str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    for k in inputs: inputs[k] = inputs[k].to(device)
    model.eval()
    with torch.no_grad():
        probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
        idx = int(probs.argmax().item())
    return id2label[idx], float(probs[0, idx])

# 🧪 Test inference
# print(predict("I have a red, itchy, scaly rash on my elbows."))
print(predict("jar and khoki jaaro"))


('kaala jar', 0.9499890804290771)


In [31]:
predict("I've had a high fever, particularly at night. It's been quite unpleasant. There is a little headache, as well as constipation and diarrhea. I don't feel like eating anything.")

('Typhoid', 0.9998500347137451)

In [32]:
predict("I'm feeling fatigued and have no energy. I can barely keep my eyes open during the day, and I've been feeling lethargic and unable to motivate myself.")

('Chicken pox', 0.9997732043266296)

In [None]:
predict("Recently, my skin has been quite itchy, and I have a rash all over my body. My skin also has a few spots where the hue is altered and some lumps and knot-like pimples.")

In [26]:
print(predict("jar and khoki jaaro"))

('Jaundice', 0.11904729902744293)
