In [1]:
import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
import src.config as config
from src.utils import ClinicalDataset 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# File holding the persisted train/test index split; a later cell reads it with
# torch.load and expects the keys "train" and "test".
SPLIT_PATH = "data_splits.pth"

In [3]:
# Load the report table and keep only rows that have both the model input
# ("findings") and the reference summary ("impression").
df = pd.read_csv(config.DATA_PATH)
df = df.dropna(subset=["findings", "impression"])

# Tokenizer for the configured base model, plus the project dataset wrapper.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
dataset = ClinicalDataset(df, tokenizer)

# The split must come from disk. Without this guard a missing file would leave
# `train_indices` / `test_indices` undefined and a later cell would fail with an
# unrelated-looking NameError. A fresh random split is deliberately NOT created
# here: presumably the saved split is the one the checkpoint was trained with,
# and re-splitting could put training rows into the "test" set (confirm against
# the training script).
if not os.path.exists(SPLIT_PATH):
    raise FileNotFoundError(
        f"Split file not found at {SPLIT_PATH}; "
        "it is required to recover the train/test indices."
    )

print(f"[INFO] Loading existing split from {SPLIT_PATH}")
split = torch.load(SPLIT_PATH)
train_indices, test_indices = split["train"], split["test"]
    

[INFO] Loading existing split from data_splits.pth


In [14]:
# Peek at the first three held-out indices.
# NOTE(review): the recorded output of this cell does not match the first three
# values shown by the later `test_indices[:20]` cell, and its execution count is
# out of sequence — it appears to come from a different kernel state; re-run on
# a fresh kernel to confirm.
test_indices[:3]

[2528, 563, 1095]

In [4]:
# Checkpoint file with the fine-tuned weights; used as the default `ckpt_path`
# of load_checkpoint and passed explicitly when the model is set up.
# NOTE(review): the recorded output reports "epoch 1" for this file — presumably
# epochs are stored zero-based while the file name is one-based; confirm.
model_path = "checkpoint_epoch2.pth"

In [5]:
# test.py
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import src.config as config

def load_checkpoint(model, ckpt_path=model_path, device="cpu"):
    """Restore model weights from a checkpoint file and report its epoch.

    Args:
        model: Object exposing ``load_state_dict``; it is updated in place.
        ckpt_path: Path of the checkpoint file. Defaults to the value
            ``model_path`` had when this function was defined.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The value stored under the checkpoint's ``"epoch"`` key.

    Raises:
        FileNotFoundError: If ``ckpt_path`` does not exist.
    """
    if os.path.exists(ckpt_path):
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the torch version's defaults — only load checkpoints you trust.
        state = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(state["model_state_dict"])
        epoch = state["epoch"]
        print(f"Loaded checkpoint from epoch {epoch}")
        return epoch

    raise FileNotFoundError(f"Checkpoint not found at {ckpt_path}")


def predict_single(text, model, tokenizer, device, max_length=64, max_input_length=128):
    """Generate and decode one output sequence for a single input text.

    Args:
        text: Input passed to ``tokenizer``.
        model: Object exposing ``eval()`` and ``generate(...)``.
        tokenizer: Callable that encodes ``text`` and exposes ``decode``.
        device: Device the encoded inputs are moved to via ``.to(device)``.
        max_length: ``max_length`` forwarded to ``model.generate``.
        max_input_length: ``max_length`` forwarded to the tokenizer, which is
            called with ``truncation=True``. Defaults to 128, the value that
            was previously hard-coded.

    Returns:
        The first generated sequence, decoded with
        ``skip_special_tokens=True``.
    """
    # Switch to evaluation mode before generating.
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length,
    ).to(device)

    # No gradients are needed for generation.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
        )

    # Only the first returned sequence is decoded.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
#     model = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_NAME).to(device)

#     # Load trained weights
#     load_checkpoint(model, ckpt_path=model_path, device=device)

#     # Example input (replace with your own)
#     input_text = "Spinal stimulator in XXXX. Lungs are clear without focal airspace disease. No pleural effusions or pneumothoraces. Heart and mediastinum of normal size and contour. Degenerative changes in the thoracic spine."
#     prediction = predict_single(input_text, model, tokenizer, device)

#     print("\nInput :", input_text)
#     print("Output:", prediction)


# if __name__ == "__main__":
#     main()


In [6]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and seq2seq model for the base model named in the project config.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_NAME)
model = model.to(device)

# Replace the pretrained weights with the checkpointed ones. The call is left
# as the cell's last expression, so the returned epoch is displayed.
load_checkpoint(model, ckpt_path=model_path, device=device)


Loaded checkpoint from epoch 1


1

In [7]:
# Preview the first 20 held-out indices; the prediction loop further down
# iterates over this same slice.
test_indices[:20]

[439,
 1912,
 2018,
 3097,
 1281,
 3027,
 188,
 2020,
 1167,
 2595,
 3217,
 2097,
 3074,
 37,
 2808,
 812,
 3137,
 3096,
 1441,
 3049]

In [36]:
# Spot-check the "findings" text of one row, selected by position.
# NOTE(review): 670 is hard-coded and is not among the 20 test indices previewed
# in this notebook — it may not be a held-out case; confirm before relying on it.
df.iloc[670]["findings"]

'Lungs are clear. Heart size normal. The XXXX are unremarkable.'

In [None]:
# Reference "impression" for the row at position 670 (hard-coded spot-check).
df.iloc[670]["impression"]

'Clear lungs.'

In [8]:
# Print the model's summary next to the reference impression for the first
# 20 indices of the test split. Rows are selected by position in `df`.
idx = test_indices[:20]
for i in idx:
    row = df.iloc[i]
    input_text = row["findings"]
    actual = row["impression"]
    prediction = predict_single(input_text, model, tokenizer, device)

    # The case label is the positional index plus one.
    print(f"Case No. {i+1}:")
    print("\nInput :", input_text)
    print("Model Output :", prediction)
    print("Actual Doctor :", actual)
    print("\n\n")

Case No. 440:

Input : There are no focal areas of consolidation. No suspicious bony opacities. Heart size within normal limits. No pleural effusions. No evidence of pneumothorax. Mild degenerative changes of the thoracic spine.
Model Output : No acute cardiopulmonary abnormality.
Actual Doctor : No acute cardiopulmonary abnormality.



Case No. 1913:

Input : Cardiac and mediastinal contours are unremarkable. Pulmonary vascularity is within normal limits. No focal air space opacities, pleural effusion, or pneumothorax. No cavitary lesions. XXXX are grossly unremarkable.
Model Output : 1. No acute cardiopulmonary disease.
Actual Doctor : 1. Clear lungs. No radiographic evidence of active TB.



Case No. 2019:

Input : Mild cardiomegaly, stable mediastinal contours. No focal alveolar consolidation, no definite pleural effusion seen. Mild bronchovascular crowding without typical findings of pulmonary edema.
Model Output : Mild cardiomegaly without acute pulmonary findings
Actual Doctor :