In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# load csv
csv_path = '/content/mimic_reports_with_results.csv'
df = pd.read_csv(csv_path).head(100)
print('Done')

Done


In [2]:
# mapping texts with labels
label_map = {'Normal': 0, 'Abnormal': 1, 'Uncertain': 2}
texts = df['text'].tolist()
labels = df['Results'].map(label_map).tolist()
print('Done')

Done


In [3]:
# train validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
print('Done')

Done


In [4]:
# tokenize text for model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
val_inputs = tokenizer(val_texts, padding=True, truncation=True, return_tensors="pt")
print('Done')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Done


In [5]:
# preparing datasets and dataloaders
train_labels_tensor = torch.tensor(train_labels)
val_labels_tensor = torch.tensor(val_labels)

train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels_tensor)
val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], val_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
print('Done')


Done


In [6]:
# defining three-class classifier
class ThreeClassDistilBERT(nn.Module):
    def __init__(self, hidden_size=768):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(hidden_size, 3)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [7]:
# train model with validation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ThreeClassDistilBERT().to(device)
optimizer = Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

epochs = 3

for epoch in range(epochs):
    # --- Training ---
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1}/{epochs} - Training Loss: {avg_loss:.4f}")

    # --- Validation ---
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {acc:.4f}")
    print(classification_report(all_labels, all_preds, target_names=label_map.keys()))



Epoch 1/3 - Training Loss: 0.9511
Validation Accuracy: 0.5000
              precision    recall  f1-score   support

      Normal       0.50      1.00      0.67        10
    Abnormal       0.00      0.00      0.00         9
   Uncertain       0.00      0.00      0.00         1

    accuracy                           0.50        20
   macro avg       0.17      0.33      0.22        20
weighted avg       0.25      0.50      0.33        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 2/3 - Training Loss: 0.8022
Validation Accuracy: 0.5000
              precision    recall  f1-score   support

      Normal       0.50      0.80      0.62        10
    Abnormal       0.50      0.22      0.31         9
   Uncertain       0.00      0.00      0.00         1

    accuracy                           0.50        20
   macro avg       0.33      0.34      0.31        20
weighted avg       0.47      0.50      0.45        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 3/3 - Training Loss: 0.7128
Validation Accuracy: 0.5500
              precision    recall  f1-score   support

      Normal       0.62      0.50      0.56        10
    Abnormal       0.50      0.67      0.57         9
   Uncertain       0.00      0.00      0.00         1

    accuracy                           0.55        20
   macro avg       0.38      0.39      0.38        20
weighted avg       0.54      0.55      0.53        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
