In [12]:
pip install transformers torch scikit-learn



In [13]:
texts = [
    "Fever, cough, chest pain, and difficulty breathing.",        # pneumonia
    "Wheezing, tight chest, and shortness of breath.",            # asthma
    "Blurred vision and high blood sugar levels.",                # diabetes
    "Itchy skin and red rash after eating shellfish.",            # allergy
    "Stable vitals, normal examination, no complaints.",          # healthy
]

labels = [0, 1, 2, 3, 4]  # 0 = pneumonia, 1 = asthma, ..., 4 = healthy
label_map = {0: "pneumonia", 1: "asthma", 2: "diabetes", 3: "allergy", 4: "healthy"}

In [14]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

class ClinicalDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

dataset = ClinicalDataset(texts, labels)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
from transformers import AutoModelForSequenceClassification

num_labels = len(set(labels))  # Here: 5 classes
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import os
os.environ["WANDB_DISABLED"] = "true"  # 👈 disables wandb popup

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=9, training_loss=1.493546273973253, metrics={'train_runtime': 40.517, 'train_samples_per_second': 0.37, 'train_steps_per_second': 0.222, 'total_flos': 115628089950.0, 'train_loss': 1.493546273973253, 'epoch': 3.0})

In [24]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm import tqdm

# Texts to classify
test_texts = [
    "The patient complains of shortness of breath and fever.",
    "Normal test results and no symptoms.",
    "Tingling and high glucose levels after meals."
]

# Tokenize all at once
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True)

# Run prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    preds = torch.argmax(probs, dim=1)

# Map predictions to labels
predictions = []
for i, text in enumerate(test_texts):
    pred_label = preds[i].item()
    confidence = round(probs[i][pred_label].item(), 3)
    predictions.append({
        "text": text,
        "predicted_class": pred_label,
        "label_name": label_map[pred_label],
        "confidence": confidence
    })

# Save to CSV
df = pd.DataFrame(predictions)
df.to_csv("clinical_predictions.csv", index=False)
print("✅ Saved predictions to clinical_predictions.csv")


✅ Saved predictions to clinical_predictions.csv
