# import necessary libraries

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

os.getcwd()
os.listdir()



import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
import torch



# Read the Symptoms and Disease data

In [5]:

# Load data
Symp_Disease_data = pd.read_csv("/content/Train_data.csv")
Symp_Disease_data.shape



(1200, 3)

# Data Clean up

In [6]:

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    return text.strip()

# Apply cleaning - replace 'text_column' with the actual text feature name
Symp_Disease_data['cleaned_text'] = Symp_Disease_data['text'].apply(clean_text)


# Label Encoding of the Target variable - convert disease into numeric values

In [7]:
# Encode labels
label_encoder = LabelEncoder()
Symp_Disease_data["label_id"] = label_encoder.fit_transform(Symp_Disease_data["label"])


# Train-test split

In [9]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    Symp_Disease_data["cleaned_text"], Symp_Disease_data["label_id"], test_size=0.30, random_state=42, stratify=Symp_Disease_data["label_id"]
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.50, random_state=42, stratify=temp_labels
)


# Save vocab_size and num_classes for later use

print(f"Sample encoded text: {train_texts.shape}")
print(f"Encoded label: {train_labels.shape}")

Sample encoded text: (840,)
Encoded label: (840,)


# Tokenization

In [10]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=64)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=64)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=64)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Create PyTorch Dataset

In [11]:
import torch

class SymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {"labels": torch.tensor(self.labels[idx])}

train_dataset = SymptomDataset(train_encodings, train_labels.tolist())
val_dataset = SymptomDataset(val_encodings, val_labels.tolist())
test_dataset = SymptomDataset(test_encodings, test_labels.tolist())


# Model Definition (BERT + Classification Head)

### BertForSequenceClassification is a pre-trained BERT model fine-tuned for text classification tasks such as sentiment analysis, spam detection, or topic classification. It adds a classification head on top of the [CLS] token output from BERT's final layer.  This model is widely used due to its strong contextual understanding and high accuracy across NLP tasks.

In [12]:

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=24
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training Setup

In [13]:
# Load model & tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=24)

# Define TrainingArguments (no evaluation_strategy here)
training_args = TrainingArguments(
    output_dir="./results",
    save_steps=1000,          # save checkpoint every N steps
    save_total_limit=1,       # keep only last checkpoint
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    do_eval=True              # make sure evaluation is enabled
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # from your earlier code
    eval_dataset=val_dataset,     # from your earlier code
    tokenizer=tokenizer
)

# Train with manual evaluation each epoch
for epoch in range(int(training_args.num_train_epochs)):
    print(f"\n===== Epoch {epoch+1} / {training_args.num_train_epochs} =====")
    trainer.train()
    print("\n*** Running Evaluation ***")
    trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(



===== Epoch 1 / 3 =====


  return forward_call(*args, **kwargs)


Step,Training Loss
50,3.1122
100,2.4955
150,1.6816
200,1.1714
250,0.7772
300,0.626



*** Running Evaluation ***


  return forward_call(*args, **kwargs)



===== Epoch 2 / 3 =====


Step,Training Loss
50,0.4933
100,0.2631
150,0.1337
200,0.067
250,0.0404
300,0.0315



*** Running Evaluation ***


  return forward_call(*args, **kwargs)



===== Epoch 3 / 3 =====


Step,Training Loss
50,0.0231
100,0.0166
150,0.0134
200,0.0104
250,0.0093
300,0.0089



*** Running Evaluation ***


  return forward_call(*args, **kwargs)


# Train the Model

In [14]:
trainer.train()

Step,Training Loss
50,0.0079
100,0.0064
150,0.0056
200,0.005
250,0.0048
300,0.0046


TrainOutput(global_step=315, training_loss=0.005689582015786852, metrics={'train_runtime': 55.6622, 'train_samples_per_second': 45.273, 'train_steps_per_second': 5.659, 'total_flos': 81601098122880.0, 'train_loss': 0.005689582015786852, 'epoch': 3.0})

# Model Evaluation

In [15]:
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

from sklearn.metrics import classification_report

print(classification_report(test_labels, pred_labels, target_names=label_encoder.classes_))


  return forward_call(*args, **kwargs)


                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00         7
               Bronchial Asthma       1.00      1.00      1.00         7
           Cervical spondylosis       1.00      1.00      1.00         8
                    Chicken pox       0.89      1.00      0.94         8
                    Common Cold       1.00      1.00      1.00         7
                         Dengue       1.00      0.88      0.93         8
          Dimorphic Hemorrhoids       1.00      1.00      1.00         8
               Fungal infection       1.00      1.00      1.00         7
                   Hypertension       1.00      1.00      1.00         7
                       Impetigo       1.00      1.00      1.00         8
                       Jaundice       1.00      1.00      1.00         7
                        Malaria       1.00      1.

# Deployment-Ready Inference Function

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # ensure model is on GPU (or CPU if no GPU)

def predict_disease(text):
    # Tokenize
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors="pt"
    )

    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()

    return label_encoder.classes_[predicted_class_id]

# Example
print(predict_disease("high fever, severe headache, joint pain"))

Dengue


  return forward_call(*args, **kwargs)


In [17]:
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

# Key metric for evluation - f1 score

In [18]:
from sklearn.metrics import f1_score

# Macro F1 = average F1 across all classes (treats all classes equally)
macro_f1 = f1_score(test_labels, pred_labels, average='macro')

# Weighted F1 = average F1 weighted by number of samples in each class
weighted_f1 = f1_score(test_labels, pred_labels, average='weighted')

print(f"Macro F1-score: {macro_f1:.4f}")
print(f"Weighted F1-score: {weighted_f1:.4f}")

Macro F1-score: 0.9948
Weighted F1-score: 0.9944
