<a href="https://colab.research.google.com/github/infanton/PyHealth/blob/master/alcohol_use_bioclincalbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MIMIC-SBDH + NOTEEVENTS: Bio-ClinicalBERT Alcohol Use

This notebook:
- Loads **MIMIC-SBDH** labels
- Streams **NOTEEVENTS.csv** in chunks and then does a simple pandas join
- Fine-tunes **Bio-ClinicalBERT** as a 5-class alcohol-use classifier, evaluated with 5-fold stratified CV
- Runs an input-length ablation (**128 vs 64 tokens**)


## Install Dependencies

In [None]:
!pip install -q transformers accelerate datasets scikit-learn

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

SEED = 42

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)

# Seed NumPy and PyTorch; on a GPU runtime also seed every CUDA device.
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)


Device: cuda


## Mount Google Drive

In [None]:
# Mount Google Drive inside the Colab VM; the input CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load MIMIC-SBDH and Prepare IDs

In [None]:
# Locations of the two input CSVs on the mounted Drive.
SBDH_PATH       = "/content/drive/MyDrive/Colab Notebooks/MIMIC-SBDH.csv"
NOTEEVENTS_PATH = "/content/drive/MyDrive/Colab Notebooks/NOTEEVENTS.csv"

# Load the MIMIC-SBDH annotation table in one go and preview it.
df_sbdh = pd.read_csv(SBDH_PATH)
print("MIMIC-SBDH shape:", df_sbdh.shape)
df_sbdh.head()

MIMIC-SBDH shape: (7025, 9)


Unnamed: 0,row_id,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug
0,5,0,0,0,0,0,0,1,0
1,42,0,0,0,0,0,0,2,0
2,136,1,0,0,2,1,3,4,0
3,442,1,1,0,0,1,3,1,2
4,328,1,0,0,2,1,3,3,3


In [None]:
# Cast row_id to int so it compares cleanly with the integer ROW_ID values read from NOTEEVENTS.
df_sbdh['row_id'] = df_sbdh['row_id'].astype(int)
# Set of annotated note IDs; used later to filter the NOTEEVENTS chunks.
target_ids = set(df_sbdh['row_id'].unique())
print("Number of unique row_id in SBDH:", len(target_ids))

Number of unique row_id in SBDH: 7025


## Stream NOTEEVENTS in Chunks and Then Simple Join

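We first define `extract_social_history`, a helper that pulls the `Social History:` section out of a note.
We then read only the `ROW_ID` and `TEXT` columns of NOTEEVENTS in chunks, keep the rows whose `ROW_ID` is in `target_ids`,
concatenate the surviving rows, and finally join them to the SBDH labels with an ordinary pandas merge.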

In [None]:
import re
import numpy as np

def extract_social_history(text: str):
    """
    Pull the body of the 'Social History:' section out of a note.

    The heading is matched case-insensitively. The section runs until the
    next line that starts with letters, spaces or slashes followed by a
    colon (i.e. something shaped like another heading), or until the end
    of the text when no such line follows.

    Returns the stripped section body (heading excluded), or np.nan when
    ``text`` is not a string or contains no 'Social History:' heading.
    """
    if isinstance(text, str):
        found = re.search(
            r"social history\s*:(.*?)(?:\n[A-Z][A-Za-z /]+:|$)",
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if found is not None:
            return found.group(1).strip()

    # Non-string input and "heading not present" share the same sentinel.
    return np.nan

In [None]:
usecols = ['ROW_ID', 'TEXT']
chunksize = 100000
filtered_chunks = []

# Iterate over NOTEEVENTS `chunksize` rows at a time, loading only the two
# columns we need, so the whole file never has to sit in memory at once.
note_reader = pd.read_csv(
    NOTEEVENTS_PATH,
    usecols=usecols,
    chunksize=chunksize,
    low_memory=False,
)

for i, chunk in enumerate(note_reader):
    print(f"Processing chunk {i+1}...")

    # Cast the IDs to int, then keep only the notes annotated in MIMIC-SBDH.
    chunk = chunk.assign(ROW_ID=chunk['ROW_ID'].astype(int))
    chunk = chunk.loc[chunk['ROW_ID'].isin(target_ids)]

    # Chunks with no annotated notes contribute nothing; for the rest,
    # store the rows that actually carry note text.
    if not chunk.empty:
        filtered_chunks.append(chunk.dropna(subset=['TEXT']))

# Stitch the per-chunk survivors into one frame with a fresh 0..n-1 index.
notes_small = pd.concat(filtered_chunks, ignore_index=True)
print("Filtered notes_small shape:", notes_small.shape)
notes_small.head()


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Filtered notes_small shape: (7025, 2)


Unnamed: 0,ROW_ID,TEXT
0,178,Admission Date: [**2162-3-3**] D...
1,227,Admission Date: [**2152-10-2**] Dischar...
2,235,Admission Date: [**2198-4-23**] ...
3,188,Admission Date: [**2198-7-17**] ...
4,189,Admission Date: [**2147-6-24**] Discharge...


## Simple pandas Join: SBDH + NOTEEVENTS

In [None]:
# Make sure the join key has the same dtype on both sides.
notes_small['ROW_ID'] = notes_small['ROW_ID'].astype(int)

# Inner join: keep only notes that have an SBDH annotation row (and vice versa).
df_merged = df_sbdh.merge(
    notes_small,
    left_on='row_id',
    right_on='ROW_ID',
    how='inner',
)

# Reduce each discharge summary to its 'Social History:' section.
# Without this step the classifier is fed the first `max_len` tokens of the
# full note (admission header) rather than the section the labels describe.
# Notes with no such section become NaN here.
df_merged['TEXT'] = df_merged['TEXT'].apply(extract_social_history)
print("Notes without a Social History section:", int(df_merged['TEXT'].isna().sum()))

print("Merged shape:", df_merged.shape)
df_merged[['row_id', 'ROW_ID', 'TEXT', 'behavior_alcohol']].head()

Merged shape: (7025, 11)


Unnamed: 0,row_id,ROW_ID,TEXT,behavior_alcohol
0,5,5,She smokes a pack per day.,0
1,42,42,Social history is significant for the absence ...,0
2,136,136,,3
3,442,442,- Tobacco: smokes 1-1.5ppd x 30yrs\n- Alcohol:...,3
4,328,328,"Married with three children, born in [**2184**...",3


## Build Alcohol Use Labels and Modeling DataFrame

In [None]:
TEXT_COL_RAW = 'note_text'
ALC_COL_RAW  = 'behavior_alcohol'  # original 0–4 from MIMIC-SBDH

df_merged = df_merged.rename(columns={'TEXT': TEXT_COL_RAW})

# Modeling frame: just the note text and the alcohol label, with incomplete
# rows removed and the original 0–4 label kept as an integer (5-class task).
df_model = (
    df_merged.loc[:, [TEXT_COL_RAW, ALC_COL_RAW]]
    .dropna()
    .astype({ALC_COL_RAW: int})
)

print("5-class label distribution:")
print(df_model[ALC_COL_RAW].value_counts())

5-class label distribution:
behavior_alcohol
3    2444
1    2077
0    1657
2     515
4     332
Name: count, dtype: int64


## Tokenizer & Dataset Class

In [None]:
# Hugging Face Hub ID of the checkpoint; the same name is reused when the classifier is loaded for each fold.
pretrained_model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

class NotesDataset(Dataset):
    """Dataset that tokenizes one note per lookup.

    Each item is a dict holding the tokenizer's output tensors (with the
    leading batch dimension squeezed away) plus a ``labels`` entry carrying
    the class id as a long tensor. Texts are truncated / padded to
    ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise as plain lists so positional indexing works whatever
        # index the incoming sequences (e.g. pandas Series) carry.
        self.texts, self.labels = list(texts), list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer returns tensors with a batch dimension of 1; drop it
        # so the DataLoader can stack samples into a batch itself.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Upsampling to Handle Class Imbalance

In [None]:
def make_upsampled_df(df, label_col):
    """
    Balance ``df`` by random oversampling of the smaller classes.

    Every class with fewer rows than the largest one is topped up by
    sampling its own rows with replacement (fixed seed 42) until all
    classes have the same size. The combined frame is then shuffled
    (seed 42) and returned with a fresh 0..n-1 index.
    """
    class_sizes = df[label_col].value_counts()
    target_n = class_sizes.max()

    def _grow(cls, n):
        # Rows of one class, padded with resampled duplicates up to target_n.
        rows = df[df[label_col] == cls]
        if n >= target_n:
            return rows
        resampled = rows.sample(target_n - n, replace=True, random_state=42)
        return pd.concat([rows, resampled], axis=0)

    balanced = pd.concat(
        [_grow(cls, n) for cls, n in class_sizes.items()],
        axis=0,
    )
    return balanced.sample(frac=1.0, random_state=42).reset_index(drop=True)


## Train One Fold (Bio-ClinicalBERT)

In [None]:
def train_one_fold(
    train_df,
    val_df,
    fold_idx,
    num_labels=5,
    epochs=3,
    batch_size=8,
    lr=5e-5,
    max_len=128,
):
    """
    Fine-tune a freshly loaded classifier on one cross-validation fold.

    The training split is class-balanced by oversampling; the validation
    split is used as-is. After every epoch the validation macro-F1 is
    computed and the best epoch is remembered.

    Args:
        train_df: Training rows; must contain the text column named by
            ``TEXT_COL_RAW`` and a ``behavior_alcohol`` label column.
        val_df: Validation rows with the same columns.
        fold_idx: Fold number, used only in the progress messages.
        num_labels: Number of output classes for the classification head.
        epochs: Number of passes over the (upsampled) training data.
        batch_size: Batch size for both loaders.
        lr: AdamW learning rate (peak of the warmup/linear-decay schedule).
        max_len: Token length every note is truncated / padded to.

    Returns:
        ``(best_val_f1, best_state)``. ``best_state`` is a dict with
        ``model_state_dict`` (CPU copy of the weights at the best epoch),
        ``y_true`` and ``y_pred`` (validation labels / predictions at that
        epoch). It is ``None`` when ``epochs`` is less than 1.

    Side effects:
        Sets the module-level global ``last_trained_model`` to this fold's
        model, loaded with its best-epoch weights.
    """
    global last_trained_model

    label_col = 'behavior_alcohol'

    # Balance classes in the training split only, so validation metrics
    # reflect the natural label distribution.
    train_df = make_upsampled_df(train_df, label_col)

    train_ds = NotesDataset(train_df[TEXT_COL_RAW], train_df[label_col], tokenizer, max_len=max_len)
    val_ds   = NotesDataset(val_df[TEXT_COL_RAW],   val_df[label_col],   tokenizer, max_len=max_len)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

    # Honour the caller's num_labels (it used to be silently overwritten with 5).
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name,
        num_labels=num_labels,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    # Linear warmup over the first 10% of optimizer steps, then linear decay.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps) if total_steps > 0 else 0,
        num_training_steps=total_steps if total_steps > 0 else 1,
    )

    def run_epoch(loader, train=False):
        """Run one pass over ``loader``; returns (mean loss, labels, predictions)."""
        if train:
            model.train()
        else:
            model.eval()

        total_loss = 0.0
        all_y = []
        all_pred = []

        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch['labels']

            if train:
                optimizer.zero_grad()

            # Gradients are only tracked while training.
            with torch.set_grad_enabled(train):
                out = model(**batch)  # `labels` is in the batch, so the model returns a loss
                loss = out.loss
                logits = out.logits

            if train:
                loss.backward()
                optimizer.step()
                if total_steps > 0:
                    scheduler.step()

            # Weight by batch size so the final average is per-sample.
            total_loss += loss.item() * labels.size(0)
            preds = torch.argmax(logits, dim=-1)
            all_y.extend(labels.detach().cpu().numpy().tolist())
            all_pred.extend(preds.detach().cpu().numpy().tolist())

        avg_loss = total_loss / len(loader.dataset)
        return avg_loss, np.array(all_y), np.array(all_pred)

    best_val_f1 = -1.0
    best_state = None

    for epoch in range(1, epochs + 1):
        train_loss, _, _ = run_epoch(train_loader, train=True)
        val_loss, y_true, y_pred = run_epoch(val_loader, train=False)

        macro_f1 = f1_score(y_true, y_pred, average='macro')
        print(
            f"[Fold {fold_idx}] Epoch {epoch}/{epochs} "
            f"train_loss={train_loss:.4f} val_loss={val_loss:.4f} macro_f1={macro_f1:.4f}"
        )

        f1_per_class = f1_score(y_true, y_pred, average=None)
        print("Per-class F1:", f1_per_class)

        if macro_f1 > best_val_f1:
            best_val_f1 = macro_f1
            best_state = {
                # state_dict() hands back references to the live parameters,
                # which later epochs keep updating; clone to CPU so this
                # really is a snapshot of the best epoch.
                'model_state_dict': {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                },
                'y_true': y_true,
                'y_pred': y_pred,
            }

    if best_state is not None:
        # Roll the model back to its best epoch before publishing it, so the
        # global model matches the metrics reported for this fold.
        model.load_state_dict(best_state['model_state_dict'])
        last_trained_model = model

    return best_val_f1, best_state


## Cross-Validation Wrapper

In [None]:
def run_cv_experiment(df_model, max_len=128, num_labels=5, epochs=3, batch_size=8, lr=5e-5):
    """
    Run 5-fold stratified cross-validation and report macro-F1 per fold.

    Args:
        df_model: Frame with the text column named by ``TEXT_COL_RAW`` and a
            ``behavior_alcohol`` label column.
        max_len: Token length passed to the dataset (truncate / pad target).
        num_labels: Number of classes; forwarded to ``train_one_fold``.
            Defaults to 5, which is the value that was always used before
            (the argument used to be ignored in favour of a hard-coded 5).
        epochs: Training epochs per fold.
        batch_size: Batch size per fold.
        lr: Learning rate per fold.

    Returns:
        List with the best validation macro-F1 of each fold, in fold order.
    """
    label_col = 'behavior_alcohol'
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_macro_f1 = []

    # Stratify on the label so every fold keeps the overall class proportions.
    splits = skf.split(df_model[TEXT_COL_RAW], df_model[label_col])
    for fold, (train_idx, val_idx) in enumerate(splits, start=1):
        print('=' * 70)
        print(f'Fold {fold} | max_len={max_len}')
        train_df = df_model.iloc[train_idx].reset_index(drop=True)
        val_df   = df_model.iloc[val_idx].reset_index(drop=True)

        best_f1, best_state = train_one_fold(
            train_df,
            val_df,
            fold_idx=fold,
            num_labels=num_labels,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            max_len=max_len,
        )
        fold_macro_f1.append(best_f1)

        # Report on the predictions recorded at the fold's best epoch.
        y_true = best_state['y_true']
        y_pred = best_state['y_pred']
        print(f"\n[Fold {fold}] Best-epoch classification report:")
        print(classification_report(y_true, y_pred, digits=3))
        print('=' * 70)

    print("\n==== 5-fold CV summary (macro-F1) ====")
    print("Fold macro-F1:", [f"{x:.3f}" for x in fold_macro_f1])
    print("Mean macro-F1: {:.3f} ± {:.3f}".format(np.mean(fold_macro_f1), np.std(fold_macro_f1)))

    return fold_macro_f1


# Run Experiments

## Baseline: max_len = 128

In [None]:
# Baseline: 5-fold CV with notes truncated/padded to 128 tokens; keeps the per-fold best macro-F1.
scores_128 = run_cv_experiment(
    df_model,
    max_len=128,
    num_labels=5,
    epochs=3, # 50 was used in paper
    batch_size=8,
    lr=5e-5,
)


Fold 1 | max_len=128


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 1] Epoch 1/3 train_loss=0.6843 val_loss=0.4311 macro_f1=0.8319
Per-class F1: [0.85598923 0.87989556 0.74774775 0.92389006 0.7518797 ]
[Fold 1] Epoch 2/3 train_loss=0.2243 val_loss=0.4034 macro_f1=0.8537
Per-class F1: [0.86486486 0.90636704 0.74509804 0.92584746 0.82644628]
[Fold 1] Epoch 3/3 train_loss=0.1404 val_loss=0.4076 macro_f1=0.8616
Per-class F1: [0.85322359 0.915      0.7979798  0.92033543 0.82170543]

[Fold 1] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.783     0.937     0.853       332
           1      0.951     0.882     0.915       415
           2      0.832     0.767     0.798       103
           3      0.944     0.898     0.920       489
           4      0.841     0.803     0.822        66

    accuracy                          0.888      1405
   macro avg      0.870     0.857     0.862      1405
weighted avg      0.895     0.888     0.889      1405

Fold 2 | max_len=128


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 2] Epoch 1/3 train_loss=0.6652 val_loss=0.4481 macro_f1=0.7971
Per-class F1: [0.84167794 0.86602871 0.65882353 0.92787944 0.69117647]
[Fold 2] Epoch 2/3 train_loss=0.2335 val_loss=0.4453 macro_f1=0.8221
Per-class F1: [0.85561497 0.88545689 0.73732719 0.93942614 0.69291339]
[Fold 2] Epoch 3/3 train_loss=0.1547 val_loss=0.4363 macro_f1=0.8176
Per-class F1: [0.85326087 0.8852459  0.72195122 0.93920335 0.68852459]

[Fold 2] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.769     0.964     0.856       332
           1      0.953     0.827     0.885       416
           2      0.702     0.777     0.737       103
           3      0.976     0.906     0.939       488
           4      0.721     0.667     0.693        66

    accuracy                          0.875      1405
   macro avg      0.824     0.828     0.822      1405
weighted avg      0.888     0.875     0.877      1405

Fold 3 | max_len=128


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 3] Epoch 1/3 train_loss=0.6516 val_loss=0.4324 macro_f1=0.8221
Per-class F1: [0.85447263 0.88860435 0.70899471 0.91987513 0.73846154]
[Fold 3] Epoch 2/3 train_loss=0.2107 val_loss=0.4343 macro_f1=0.8318
Per-class F1: [0.85254692 0.88341969 0.71028037 0.91954023 0.79338843]
[Fold 3] Epoch 3/3 train_loss=0.1426 val_loss=0.4558 macro_f1=0.8312
Per-class F1: [0.85210312 0.89414695 0.7173913  0.92834891 0.76422764]

[Fold 3] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.766     0.961     0.853       331
           1      0.958     0.820     0.883       416
           2      0.685     0.738     0.710       103
           3      0.940     0.900     0.920       489
           4      0.873     0.727     0.793        66

    accuracy                          0.870      1405
   macro avg      0.844     0.829     0.832      1405
weighted avg      0.883     0.870     0.872      1405

Fold 4 | max_len=128


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 4] Epoch 1/3 train_loss=0.6457 val_loss=0.3956 macro_f1=0.8298
Per-class F1: [0.86595174 0.88010204 0.7638191  0.93062967 0.70833333]
[Fold 4] Epoch 2/3 train_loss=0.2337 val_loss=0.4106 macro_f1=0.8470
Per-class F1: [0.86344828 0.89393939 0.79396985 0.9375     0.74626866]
[Fold 4] Epoch 3/3 train_loss=0.1523 val_loss=0.4414 macro_f1=0.8423
Per-class F1: [0.86657497 0.88360451 0.7902439  0.93501048 0.736     ]

[Fold 4] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.794     0.946     0.863       331
           1      0.939     0.853     0.894       415
           2      0.823     0.767     0.794       103
           3      0.955     0.920     0.938       489
           4      0.746     0.746     0.746        67

    accuracy                          0.887      1405
   macro avg      0.852     0.846     0.847      1405
weighted avg      0.893     0.887     0.888      1405

Fold 5 | max_len=128


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 5] Epoch 1/3 train_loss=0.6523 val_loss=0.4297 macro_f1=0.8203
Per-class F1: [0.86486486 0.88528678 0.68393782 0.92787944 0.73972603]
[Fold 5] Epoch 2/3 train_loss=0.2236 val_loss=0.4617 macro_f1=0.8255
Per-class F1: [0.86111111 0.87135922 0.70707071 0.93390192 0.75384615]
[Fold 5] Epoch 3/3 train_loss=0.1460 val_loss=0.4744 macro_f1=0.8282
Per-class F1: [0.86312849 0.87361963 0.71287129 0.93096234 0.76033058]

[Fold 5] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.803     0.934     0.863       331
           1      0.890     0.858     0.874       415
           2      0.727     0.699     0.713       103
           3      0.953     0.910     0.931       489
           4      0.852     0.687     0.760        67

    accuracy                          0.874      1405
   macro avg      0.845     0.817     0.828      1405
weighted avg      0.878     0.874     0.874      1405


==== 5-fold CV summary (macro-F1) ====
Fold ma

## Ablation: max_len = 64

In [None]:
# Ablation: identical settings to the baseline except notes are truncated/padded to 64 tokens.
scores_64 = run_cv_experiment(
    df_model,
    max_len=64,
    num_labels=5,
    epochs=3, # 50 was used in paper
    batch_size=8,
    lr=5e-5,
)


Fold 1 | max_len=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 1] Epoch 1/3 train_loss=0.7505 val_loss=0.5855 macro_f1=0.7516
Per-class F1: [0.77777778 0.78472959 0.67281106 0.87892377 0.64383562]
[Fold 1] Epoch 2/3 train_loss=0.2779 val_loss=0.5780 macro_f1=0.7441
Per-class F1: [0.75792988 0.78623566 0.62564103 0.87789474 0.6728972 ]
[Fold 1] Epoch 3/3 train_loss=0.1550 val_loss=0.6247 macro_f1=0.7824
Per-class F1: [0.79600571 0.86308068 0.65989848 0.87346939 0.71929825]

[Fold 1] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.756     0.840     0.796       332
           1      0.876     0.851     0.863       415
           2      0.691     0.631     0.660       103
           3      0.872     0.875     0.873       489
           4      0.854     0.621     0.719        66

    accuracy                          0.830      1405
   macro avg      0.810     0.764     0.782      1405
weighted avg      0.832     0.830     0.829      1405

Fold 2 | max_len=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 2] Epoch 1/3 train_loss=0.7610 val_loss=0.5626 macro_f1=0.7572
Per-class F1: [0.79040404 0.84196185 0.6036036  0.90217391 0.64788732]
[Fold 2] Epoch 2/3 train_loss=0.2946 val_loss=0.5751 macro_f1=0.7651
Per-class F1: [0.79681275 0.84129032 0.65306122 0.88333333 0.65079365]
[Fold 2] Epoch 3/3 train_loss=0.1725 val_loss=0.6334 macro_f1=0.7640
Per-class F1: [0.79566982 0.84278351 0.65625    0.89048106 0.63492063]

[Fold 2] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.713     0.904     0.797       332
           1      0.908     0.784     0.841       416
           2      0.688     0.621     0.653       103
           3      0.898     0.869     0.883       488
           4      0.683     0.621     0.651        66

    accuracy                          0.822      1405
   macro avg      0.778     0.760     0.765      1405
weighted avg      0.832     0.822     0.823      1405

Fold 3 | max_len=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 3] Epoch 1/3 train_loss=0.7651 val_loss=0.5745 macro_f1=0.7548
Per-class F1: [0.79946164 0.83089214 0.61924686 0.89751888 0.62666667]
[Fold 3] Epoch 2/3 train_loss=0.2913 val_loss=0.5041 macro_f1=0.7910
Per-class F1: [0.81309686 0.84741488 0.63043478 0.89484536 0.76923077]
[Fold 3] Epoch 3/3 train_loss=0.1673 val_loss=0.5528 macro_f1=0.7891
Per-class F1: [0.81842818 0.84833539 0.61797753 0.89864159 0.76190476]

[Fold 3] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.741     0.900     0.813       331
           1      0.891     0.808     0.847       416
           2      0.716     0.563     0.630       103
           3      0.902     0.888     0.895       489
           4      0.781     0.758     0.769        66

    accuracy                          0.837      1405
   macro avg      0.806     0.783     0.791      1405
weighted avg      0.842     0.837     0.836      1405

Fold 4 | max_len=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 4] Epoch 1/3 train_loss=0.7572 val_loss=0.5655 macro_f1=0.7602
Per-class F1: [0.781893   0.80291971 0.67264574 0.9030837  0.640625  ]
[Fold 4] Epoch 2/3 train_loss=0.2863 val_loss=0.5412 macro_f1=0.7870
Per-class F1: [0.82432432 0.83870968 0.68       0.89948187 0.69230769]
[Fold 4] Epoch 3/3 train_loss=0.1694 val_loss=0.5799 macro_f1=0.7776
Per-class F1: [0.82093664 0.82860666 0.65656566 0.89915966 0.68292683]

[Fold 4] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.746     0.921     0.824       331
           1      0.903     0.783     0.839       415
           2      0.701     0.660     0.680       103
           3      0.912     0.888     0.899       489
           4      0.714     0.672     0.692        67

    accuracy                          0.838      1405
   macro avg      0.795     0.785     0.787      1405
weighted avg      0.845     0.838     0.838      1405

Fold 5 | max_len=64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Fold 5] Epoch 1/3 train_loss=0.7768 val_loss=0.7267 macro_f1=0.7308
Per-class F1: [0.77860697 0.79554937 0.60655738 0.8600224  0.61333333]
[Fold 5] Epoch 2/3 train_loss=0.2991 val_loss=0.6259 macro_f1=0.7425
Per-class F1: [0.8067701  0.80923451 0.59633028 0.87783784 0.62222222]
[Fold 5] Epoch 3/3 train_loss=0.1797 val_loss=0.6791 macro_f1=0.7554
Per-class F1: [0.80821918 0.8172043  0.60540541 0.87445887 0.67164179]

[Fold 5] Best-epoch classification report:
              precision    recall  f1-score   support

           0      0.739     0.891     0.808       331
           1      0.810     0.824     0.817       415
           2      0.683     0.544     0.605       103
           3      0.929     0.826     0.874       489
           4      0.672     0.672     0.672        67

    accuracy                          0.813      1405
   macro avg      0.767     0.751     0.755      1405
weighted avg      0.819     0.813     0.813      1405


==== 5-fold CV summary (macro-F1) ====
Fold ma

## Inference Tests

In [None]:
import torch
import torch.nn.functional as F

# `last_trained_model` is the module-level global published by the most
# recent `train_one_fold` call, so the CV cells above must have been run.
model = last_trained_model
model.eval()
model.to(device)

# MIMIC-SBDH coding of `behavior_alcohol`.
# NOTE(review): mapping taken from the MIMIC-SBDH dataset documentation
# (0 None, 1 Present, 2 Past, 3 Never, 4 Unsure) — confirm against the
# data dictionary shipped with the CSV. The previous names
# (Negative / Positive / Potential) did not match this coding, which made
# correct predictions look wrong (e.g. "denies alcohol" -> class 3).
id2label = {
    0: "0 = None / Not mentioned",
    1: "1 = Present",
    2: "2 = Past",
    3: "3 = Never",
    4: "4 = Unsure",
}

def predict_alcohol_use_5class(
    text,
    threshold_any=0.6,
    max_len=128,
):
    """
    Run the 5-class Bio-ClinicalBERT classifier on a single note string.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        text: Note text to classify.
        threshold_any: Minimum combined probability of Present + Past for
            the "any alcohol use" flag to be raised.
        max_len: Token length the text is truncated / padded to.

    Returns:
      - predicted_label_id (int)
      - predicted_label_name (str)
      - prob_dict (dict: label_name -> probability)
      - any_alcohol_flag (bool, based on Present+Past >= threshold_any)
      - any_alcohol_prob (float)
    """
    # Class ids that indicate documented alcohol use (current or former).
    present_id, past_id = 1, 2

    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt",
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits
        probs = F.softmax(logits, dim=-1)

    # Single input -> drop the batch dimension and move to NumPy.
    probs = probs.squeeze(0).cpu().numpy()

    pred_id = int(probs.argmax())
    pred_name = id2label.get(pred_id, str(pred_id))

    prob_dict = {id2label[i]: float(probs[i]) for i in range(len(probs))}

    p_any = float(probs[present_id]) + float(probs[past_id])
    any_flag = p_any >= threshold_any

    return pred_id, pred_name, prob_dict, any_flag, p_any

# Hand-written example sentences used as a quick smoke test of the classifier.
test_texts = [
    "Patient denies any alcohol use. No drinking reported.",
    "Patient reports drinking beer daily for several years.",
    "Alcohol use unclear. Mentions occasional social drinks.",
    "No history of alcohol consumption noted.",
]

print("Running 5-class Bio-ClinicalBERT alcohol use predictions...\n")
# Classify each example and print the predicted class, the combined
# "any alcohol use" probability/flag, and the full probability vector.
for i, txt in enumerate(test_texts, start=1):
    pred_id, pred_name, prob_dict, any_flag, p_any = predict_alcohol_use_5class(
        txt,
        threshold_any=0.6,
        max_len=128,
    )
    print(f"Example {i}:")
    print("  Text:", txt)
    print("  Predicted 5-class label:", pred_name)
    print("  P(any alcohol use) = {:.3f} | flag = {}".format(p_any, any_flag))
    print("  Full probs:", prob_dict)
    print()

Running 5-class Bio-ClinicalBERT alcohol use predictions...

Example 1:
  Text: Patient denies any alcohol use. No drinking reported.
  Predicted 5-class label: 3 = Potential
  P(any alcohol use) = 0.999 | flag = True
  Full probs: {'0 = None / Not mentioned': 0.00047981529496610165, '1 = Negative': 0.0005474034696817398, '2 = Positive': 0.00021043028391432017, '3 = Potential': 0.9986667633056641, '4 = N/A / Uncertain': 9.56122312345542e-05}

Example 2:
  Text: Patient reports drinking beer daily for several years.
  Predicted 5-class label: 1 = Negative
  P(any alcohol use) = 0.064 | flag = False
  Full probs: {'0 = None / Not mentioned': 0.0022712363861501217, '1 = Negative': 0.9321798086166382, '2 = Positive': 0.06376467645168304, '3 = Potential': 0.0006109825917519629, '4 = N/A / Uncertain': 0.0011733019491657615}

Example 3:
  Text: Alcohol use unclear. Mentions occasional social drinks.
  Predicted 5-class label: 1 = Negative
  P(any alcohol use) = 0.000 | flag = False
  Full pro