<a href="https://colab.research.google.com/github/svedison/JDRE-Research/blob/main/bertclip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cu121/torch-2.6.0.dev20241112%2Bcu121-cp311-cp311-linux_x86_64.whl (768.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.0/768.0 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/nightly/cu121/torchvision-0.20.0.dev20241112%2Bcu121-cp311-cp311-linux_x86_64.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m120

- Changed the runtime from CPU to GPU.
- The latest transformers release requires PyTorch >= 2.6.0.
- Removed the preinstalled PyTorch packages (torch, torchvision, torchaudio) and installed a build compatible with the GPU environment.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import random
# torch and torch.nn: PyTorch for tensor computations and neural network layers
# transformers: Hugging Face library for loading pre-trained models and tokenizers

# -------------------------------
# 1. Dataset
# -------------------------------

# Dummy data: each tuple is (bio_text, clinical_text).
# Hand-written sample sentences: one biomedical-style sentence and one
# clinical-style sentence about the same concept (here, the same drug).
paired_data = [
    ("Sunitinib is a tyrosine kinase inhibitor.", "The patient was given sunitinib for cancer."),
    ("Insulin regulates glucose in the body.", "The patient was started on insulin for diabetes."),
    ("Warfarin is an anticoagulant.", "He was prescribed warfarin due to high clot risk."),
    ("Metformin lowers blood sugar.", "She takes metformin for type 2 diabetes."),
]

#custom PyTorch dataset so can be batched and loaded using a DataLoader
class BioClinicalDataset(Dataset):
    """Map-style dataset exposing a sequence of (bio_text, clinical_text) pairs.

    Args:
        pairs: Indexable, sized collection of pairs. It is stored by
            reference, not copied.
    """

    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        # Dataset size is simply the number of stored pairs.
        pair_count = len(self.pairs)
        return pair_count

    def __getitem__(self, idx):
        # Hand back the stored pair untouched; batching is left to the DataLoader.
        selected_pair = self.pairs[idx]
        return selected_pair

# -------------------------------
# 2. Model with Projection Heads
# -------------------------------

class CLIPStyleModel(nn.Module):
    """Dual text encoder with one projection head per encoder.

    Biomedical and clinical texts are tokenized and encoded by separate
    pretrained models. The hidden state at position 0 (the CLS token) of each
    encoder is projected to ``proj_dim`` and L2-normalized, so both kinds of
    text end up in one embedding space of the same size.

    Args:
        bio_model_name: Hugging Face model id for the biomedical encoder.
        clinical_model_name: Hugging Face model id for the clinical encoder.
        proj_dim: Output size of both projection heads.
        max_length: Maximum number of tokens kept per text when tokenizing.
    """

    def __init__(self, bio_model_name, clinical_model_name, proj_dim=256, max_length=128):
        super().__init__()
        # Separate tokenizers and encoders for biomedical and clinical language.
        self.bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_name)
        self.clinical_tokenizer = AutoTokenizer.from_pretrained(clinical_model_name)
        self.bio_encoder = AutoModel.from_pretrained(bio_model_name, use_safetensors=True)
        self.clinical_encoder = AutoModel.from_pretrained(clinical_model_name, use_safetensors=True)
        self.max_length = max_length

        # Size each head from its OWN encoder: the two encoders are loaded
        # independently and need not share a hidden size.
        self.bio_proj = self._projection_head(self.bio_encoder.config.hidden_size, proj_dim)
        self.clinical_proj = self._projection_head(self.clinical_encoder.config.hidden_size, proj_dim)

    @staticmethod
    def _projection_head(in_dim, proj_dim):
        """Build the two-layer MLP (Linear -> ReLU -> Linear) used as a projection head."""
        return nn.Sequential(
            nn.Linear(in_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )

    def encode(self, texts, tokenizer, encoder):
        """Return the position-0 (CLS) hidden state for each text.

        Args:
            texts: Batch of strings accepted by ``tokenizer``.
            tokenizer: Tokenizer matching ``encoder``.
            encoder: Model whose output exposes ``last_hidden_state``.
        """
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        # Tokenizer output is created on CPU; move it to wherever the encoder lives.
        inputs = {k: v.to(encoder.device) for k, v in inputs.items()}
        outputs = encoder(**inputs)
        cls = outputs.last_hidden_state[:, 0, :]  # CLS token
        return cls

    def forward(self, bio_texts, clinical_texts):
        """Encode both batches, project them, and L2-normalize along the feature axis.

        Returns:
            Tuple ``(bio_emb, clinical_emb)`` of normalized projected embeddings.
        """
        bio_cls = self.encode(bio_texts, self.bio_tokenizer, self.bio_encoder)
        clinical_cls = self.encode(clinical_texts, self.clinical_tokenizer, self.clinical_encoder)

        bio_emb = F.normalize(self.bio_proj(bio_cls), dim=1)
        clinical_emb = F.normalize(self.clinical_proj(clinical_cls), dim=1)

        return bio_emb, clinical_emb

# -------------------------------
# 3. Contrastive Loss (InfoNCE)
# -------------------------------

def clip_contrastive_loss(bio_emb, clinical_emb, temperature=0.07):
    """Symmetric cross-entropy over the temperature-scaled similarity matrix.

    Row ``i`` of ``bio_emb`` is treated as the positive match of row ``i`` of
    ``clinical_emb``; every other row in the batch acts as a negative. This
    pushes the largest entry of each row and column onto the diagonal.

    Args:
        bio_emb: 2-D tensor of biomedical embeddings, one row per example.
        clinical_emb: 2-D tensor of clinical embeddings, row-aligned with ``bio_emb``.
        temperature: Divisor applied to the similarity scores.

    Returns:
        Scalar tensor: the mean of the bio->clinical and clinical->bio losses.
    """
    batch_size = len(bio_emb)
    # Target class for row i is column i (the diagonal).
    targets = torch.arange(batch_size).to(bio_emb.device)
    similarity = torch.matmul(bio_emb, clinical_emb.T) / temperature
    bio_to_clinical = F.cross_entropy(similarity, targets)
    clinical_to_bio = F.cross_entropy(similarity.T, targets)
    return (bio_to_clinical + clinical_to_bio) / 2

# -------------------------------
# 4. Training Loop
# -------------------------------

def train(model, dataloader, optimizer, device, epochs=5):
    """Train ``model`` with the contrastive loss and print the mean loss per epoch.

    Args:
        model: Callable as ``model(bio_texts, clinical_texts)`` and returning
            ``(bio_emb, clinical_emb)``.
        dataloader: Yields batches of (bio_text, clinical_text) pairs using the
            DataLoader's default collation, i.e. each batch arrives as
            ``[bio_texts, clinical_texts]``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model is moved to before training.
        epochs: Number of passes over ``dataloader``.
    """
    model.to(device)  # move to GPU (or whichever device was requested)
    model.train()  # put in training mode
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            # The default collate function has already transposed the sampled
            # pairs into [all_bio_texts, all_clinical_texts]. Transposing again
            # with zip(*batch) would mix the two sides (and fail to unpack for
            # any batch size other than 2), so unpack the batch directly.
            bio_texts, clinical_texts = batch
            bio_texts, clinical_texts = list(bio_texts), list(clinical_texts)
            optimizer.zero_grad()

            # Compute embeddings and the contrastive loss.
            bio_emb, clinical_emb = model(bio_texts, clinical_texts)
            loss = clip_contrastive_loss(bio_emb, clinical_emb)
            # Backpropagate and update the weights.
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)  # average loss for the epoch
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

# -------------------------------
# 5. Run Everything
# -------------------------------

# Script entry point: seed the RNGs, build data/model/optimizer, then train.
if __name__ == "__main__":
    # Set seeds for reproducibility
    torch.manual_seed(42)
    random.seed(42)

    # Training configuration
    BATCH_SIZE = 2
    EPOCHS = 10
    LR = 2e-5
    PROJ_DIM = 256
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when one is available

    # Init dataset and dataloader (this is where the training data is loaded)
    dataset = BioClinicalDataset(paired_data)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Init model and optimizer
    # BioLinkBERT is the biomedical-side model; Bio_ClinicalBERT is the clinical-side model.
    model = CLIPStyleModel(
        bio_model_name="michiyasunaga/BioLinkBERT-base",
        clinical_model_name="emilyalsentzer/Bio_ClinicalBERT",
        proj_dim=PROJ_DIM
    )
    # Inside the model, each tokenizer converts text into token IDs and each
    # encoder outputs contextual embeddings for the input tokens.
    # AdamW updates every model parameter (both encoders and both projection heads).
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR) # optimizer used for training

    # Train
    train(model, dataloader, optimizer, DEVICE, epochs=EPOCHS) # launches the training loop

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch 1/10 - Loss: 0.7164
Epoch 2/10 - Loss: 0.6531
Epoch 3/10 - Loss: 0.7204
Epoch 4/10 - Loss: 0.7596
Epoch 5/10 - Loss: 0.6531
Epoch 6/10 - Loss: 0.6147
Epoch 7/10 - Loss: 0.6546
Epoch 8/10 - Loss: 0.5040
Epoch 9/10 - Loss: 0.5259
Epoch 10/10 - Loss: 0.4236


In [None]:
#updated with hyperparameterization and more data
# ---------------------------------------------
# 0. Imports and Hyperparameters
# ---------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import random

# -------------------------------
# Hyperparameters
# -------------------------------
# Single place for every tunable value used by the cells below.
HYPERPARAMS = {
    # Hugging Face model ids for the two encoders.
    "bio_model_name": "michiyasunaga/BioLinkBERT-base",
    "clinical_model_name": "emilyalsentzer/Bio_ClinicalBERT",
    # Output size of both projection heads.
    "proj_dim": 256,
    # DataLoader batch size.
    "batch_size": 2,
    # AdamW learning rate.
    "learning_rate": 2e-5,
    # Number of passes over the data.
    "epochs": 10,
    # Tokenizer truncation length.
    "max_length": 128,
    # Divisor applied to similarity scores in the contrastive loss.
    "temperature": 0.07,
    # Train on the GPU when one is available, otherwise on the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Seed shared by the torch and random RNGs.
    "seed": 42
}

# Set seeds for reproducibility
torch.manual_seed(HYPERPARAMS["seed"])
random.seed(HYPERPARAMS["seed"])

# ---------------------------------------------
# 1. Sample Paired Data (Expanded)
# ---------------------------------------------
# Each tuple is (bio_text, clinical_text): a biomedical-style statement about
# a drug paired with a clinical-style sentence mentioning the same drug.
paired_data = [
    ("Sunitinib is a tyrosine kinase inhibitor.", "The patient was given sunitinib for cancer."),
    ("Insulin regulates glucose in the body.", "The patient was started on insulin for diabetes."),
    ("Warfarin is an anticoagulant.", "He was prescribed warfarin due to high clot risk."),
    ("Metformin lowers blood sugar.", "She takes metformin for type 2 diabetes."),
    ("Atorvastatin reduces cholesterol levels.", "The patient is on atorvastatin to manage cholesterol."),
    ("Amoxicillin treats bacterial infections.", "Amoxicillin was prescribed for an ear infection."),
    ("Lisinopril is used for hypertension.", "He takes lisinopril to control his high blood pressure."),
    ("Levothyroxine replaces thyroid hormone.", "She is on levothyroxine due to hypothyroidism."),
    ("Albuterol is a bronchodilator.", "The patient uses albuterol to relieve asthma symptoms."),
    ("Omeprazole reduces stomach acid.", "Omeprazole was given for acid reflux management."),
]

# ---------------------------------------------
# 2. Dataset Class
# ---------------------------------------------
class BioClinicalDataset(Dataset):
    """Thin Dataset wrapper around an in-memory list of text pairs.

    Each item is returned exactly as stored, i.e. as one
    (bio_text, clinical_text) pair.
    """

    def __init__(self, pairs):
        # The sequence is stored by reference, not copied.
        self.pairs = pairs

    def __len__(self):
        n_pairs = len(self.pairs)
        return n_pairs

    def __getitem__(self, idx):
        item = self.pairs[idx]
        return item

# ---------------------------------------------
# 3. CLIP-style Dual Encoder Model
# ---------------------------------------------
class CLIPStyleModel(nn.Module):
    """CLIP-style dual encoder for biomedical and clinical text.

    Each kind of text has its own pretrained tokenizer/encoder pair and its
    own projection head. ``forward`` returns L2-normalized ``proj_dim``-sized
    embeddings for both inputs.

    Args:
        bio_model_name: Hugging Face model id for the biomedical encoder.
        clinical_model_name: Hugging Face model id for the clinical encoder.
        proj_dim: Output size of both projection heads.
        max_length: Maximum number of tokens kept per text when tokenizing.
    """

    def __init__(self, bio_model_name, clinical_model_name, proj_dim=256, max_length=128):
        super().__init__()
        # Tokenizers and encoders
        self.bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_name)
        self.clinical_tokenizer = AutoTokenizer.from_pretrained(clinical_model_name)
        self.bio_encoder = AutoModel.from_pretrained(bio_model_name, use_safetensors=True)
        self.clinical_encoder = AutoModel.from_pretrained(clinical_model_name, use_safetensors=True)

        self.max_length = max_length

        # Projection heads. Each one is sized from its OWN encoder so that two
        # encoders with different hidden sizes can still be paired.
        self.bio_proj = self._projection_head(self.bio_encoder.config.hidden_size, proj_dim)
        self.clinical_proj = self._projection_head(self.clinical_encoder.config.hidden_size, proj_dim)

    @staticmethod
    def _projection_head(in_dim, proj_dim):
        """Build the two-layer MLP (Linear -> ReLU -> Linear) used as a projection head."""
        return nn.Sequential(
            nn.Linear(in_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )

    def encode(self, texts, tokenizer, encoder):
        """Tokenize ``texts`` and return the encoder's position-0 (CLS) hidden state."""
        inputs = tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        # Move the tokenized tensors onto the encoder's device before the forward pass.
        inputs = {k: v.to(encoder.device) for k, v in inputs.items()}
        outputs = encoder(**inputs)
        cls_token = outputs.last_hidden_state[:, 0, :]  # Take CLS token
        return cls_token

    def forward(self, bio_texts, clinical_texts):
        """Return ``(bio_emb, clinical_emb)``: projected, L2-normalized embeddings."""
        bio_cls = self.encode(bio_texts, self.bio_tokenizer, self.bio_encoder)
        clinical_cls = self.encode(clinical_texts, self.clinical_tokenizer, self.clinical_encoder)

        bio_emb = F.normalize(self.bio_proj(bio_cls), dim=1)
        clinical_emb = F.normalize(self.clinical_proj(clinical_cls), dim=1)

        return bio_emb, clinical_emb

# ---------------------------------------------
# 4. Contrastive Loss (InfoNCE)
# ---------------------------------------------
def clip_contrastive_loss(bio_emb, clinical_emb, temperature=0.07):
    """Average of the bio->clinical and clinical->bio cross-entropy losses.

    The similarity matrix is ``bio_emb @ clinical_emb.T`` divided by
    ``temperature``; the correct class for row ``i`` is column ``i``.

    Returns:
        Scalar loss tensor.
    """
    n_pairs = len(bio_emb)
    diagonal = torch.arange(n_pairs).to(bio_emb.device)
    scores = torch.matmul(bio_emb, clinical_emb.T) / temperature
    # Rows index bio examples; the transposed matrix indexes clinical examples.
    forward_loss = F.cross_entropy(scores, diagonal)       # Bio → Clinical
    backward_loss = F.cross_entropy(scores.T, diagonal)    # Clinical → Bio
    return (forward_loss + backward_loss) / 2

# ---------------------------------------------
# 5. Training Function
# ---------------------------------------------
def train(model, dataloader, optimizer, device, epochs=5):
    """Train ``model`` with the contrastive loss and print the mean loss per epoch.

    The loss temperature is read from the module-level ``HYPERPARAMS``.

    Args:
        model: Callable as ``model(bio_texts, clinical_texts)`` and returning
            ``(bio_emb, clinical_emb)``.
        dataloader: Yields batches of (bio_text, clinical_text) pairs using the
            DataLoader's default collation, i.e. each batch arrives as
            ``[bio_texts, clinical_texts]``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model is moved to before training.
        epochs: Number of passes over ``dataloader``.
    """
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            # The default collate function has already transposed the sampled
            # pairs into [all_bio_texts, all_clinical_texts]. Transposing again
            # with zip(*batch) would mix the two sides (and fail to unpack for
            # any batch size other than 2), so unpack the batch directly.
            bio_texts, clinical_texts = batch
            bio_texts, clinical_texts = list(bio_texts), list(clinical_texts)
            optimizer.zero_grad()

            bio_emb, clinical_emb = model(bio_texts, clinical_texts)
            loss = clip_contrastive_loss(bio_emb, clinical_emb, temperature=HYPERPARAMS["temperature"])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

# ---------------------------------------------
# 6. Run Training
# ---------------------------------------------
# Script entry point: build everything from HYPERPARAMS and run training.
if __name__ == "__main__":
    # Wrap the in-memory pairs and batch them in shuffled order.
    dataset = BioClinicalDataset(paired_data)
    dataloader = DataLoader(dataset, batch_size=HYPERPARAMS["batch_size"], shuffle=True)

    # Dual encoder configured from HYPERPARAMS.
    model = CLIPStyleModel(
        bio_model_name=HYPERPARAMS["bio_model_name"],
        clinical_model_name=HYPERPARAMS["clinical_model_name"],
        proj_dim=HYPERPARAMS["proj_dim"],
        max_length=HYPERPARAMS["max_length"]
    )

    # AdamW over every model parameter (both encoders and both projection heads).
    optimizer = torch.optim.AdamW(model.parameters(), lr=HYPERPARAMS["learning_rate"])

    train(model, dataloader, optimizer, HYPERPARAMS["device"], epochs=HYPERPARAMS["epochs"])

Epoch 1/10 - Loss: 0.7389
Epoch 2/10 - Loss: 0.7625
Epoch 3/10 - Loss: 0.7010
Epoch 4/10 - Loss: 0.6757
Epoch 5/10 - Loss: 0.6738
Epoch 6/10 - Loss: 0.6237
Epoch 7/10 - Loss: 0.5635
Epoch 8/10 - Loss: 0.4465
Epoch 9/10 - Loss: 0.3527
Epoch 10/10 - Loss: 0.0729


In [None]:
# run our model on PMC data

In [None]:
from google.colab import files
import io
import pandas as pd

# Prompt for a file from the local machine. The result is indexed by filename
# below and its value is wrapped in BytesIO, i.e. it maps filename -> raw bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; select a CSV file and re-run this cell.")

# Use the expected notes file when it is among the uploads; otherwise fall back
# to whichever file was uploaded first, so a differently named CSV does not
# fail with a KeyError.
DEFAULT_CSV_FILENAME = "final_combined_notes.csv"
csv_filename = DEFAULT_CSV_FILENAME if DEFAULT_CSV_FILENAME in uploaded else next(iter(uploaded))

# Read the uploaded bytes into a pandas DataFrame
df = pd.read_csv(io.BytesIO(uploaded[csv_filename]))

print(f"Successfully loaded {csv_filename} with shape {df.shape}")
print(df.head())  # Show first few rows to confirm

MessageError: RangeError: Maximum call stack size exceeded.

In [None]:
# ALIGN CLINICAL TEXT WITH PUBMED SENTENCES
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import os

# -----------------------------
# Step 1: Config
# -----------------------------
# Input/output locations and size limit for the alignment run.
MIMIC_CSV_PATH = "mimic_notes.csv"  # input CSV; must contain a TEXT column
PUBMED_TXT_PATH = "pubmed_sentences.txt"  # biomedical sentences, one per line
OUTPUT_CSV_PATH = "aligned_pubmed_mimic.csv"  # where the aligned pairs are written
MAX_SENTENCES = 1000  # cap on clinical sentences, for faster testing

# -----------------------------
# Step 2: Load MIMIC Sentences
# -----------------------------
def extract_sentences_from_mimic(path, max_sentences=None):
    """Read the ``TEXT`` column of a CSV and return its sentences as a list.

    Each non-null text is split on ``'.'``; fragments are stripped and kept
    only when longer than 20 characters.

    Args:
        path: CSV file containing a ``TEXT`` column.
        max_sentences: If truthy, the result is truncated to
            ``all_sentences[:max_sentences]``. ``None`` or ``0`` means no limit.

    Returns:
        List of sentence strings in file order.
    """
    df = pd.read_csv(path)
    all_sentences = []
    # Stop reading notes as soon as enough sentences are collected, instead of
    # splitting the entire corpus and discarding most of it afterwards.
    stop_early = bool(max_sentences) and max_sentences > 0

    for text in df["TEXT"].dropna():
        # str() guards against non-string cells (e.g. a numeric-looking note).
        sents = [s.strip() for s in str(text).split('.') if len(s.strip()) > 20]
        all_sentences.extend(sents)
        if stop_early and len(all_sentences) >= max_sentences:
            break

    if max_sentences:
        all_sentences = all_sentences[:max_sentences]

    print(f"Loaded {len(all_sentences)} clinical sentences from MIMIC.")
    return all_sentences

# Collect at most MAX_SENTENCES clinical sentences from the notes CSV.
clinical_sentences = extract_sentences_from_mimic(MIMIC_CSV_PATH, MAX_SENTENCES)

# -----------------------------
# Step 3: Load PubMed Sentences
# -----------------------------
def load_pubmed_sentences(path):
    """Return the stripped lines of ``path`` that are longer than 20 characters.

    Args:
        path: Text file with one sentence per line.

    Returns:
        List of sentence strings in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [sentence for sentence in stripped_lines if len(sentence) > 20]

# Load the candidate biomedical sentences that clinical sentences are matched against.
pubmed_sentences = load_pubmed_sentences(PUBMED_TXT_PATH)
print(f"Loaded {len(pubmed_sentences)} PubMed sentences.")

# -----------------------------
# Step 4: Load Biomedical SentenceTransformer
# -----------------------------
# Pick the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the sentence-embedding model and place it on the selected device.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb").to(device)

# -----------------------------
# Step 5: Encode Sentences
# -----------------------------
# Both corpora are encoded with the same settings, so share the keyword arguments.
encode_options = {"convert_to_tensor": True, "device": device}

print("Encoding PubMed sentences...")
pubmed_embeddings = model.encode(pubmed_sentences, **encode_options)

print("Encoding MIMIC sentences...")
clinical_embeddings = model.encode(clinical_sentences, **encode_options)

# -----------------------------
# Step 6: Find Top-1 Matches
# -----------------------------
print("Performing semantic search...")
# Clinical sentences are the queries; PubMed sentences are the corpus.
matches = util.semantic_search(clinical_embeddings, pubmed_embeddings, top_k=1)

# One (pubmed, clinical) pair per clinical sentence, using its single best hit.
aligned_pairs = [
    (pubmed_sentences[hits[0]['corpus_id']], clinical_sentence)
    for clinical_sentence, hits in zip(clinical_sentences, matches)
]

# -----------------------------
# Step 7: Save to CSV
# -----------------------------
output_columns = ["pubmed_sentence", "mimic_sentence"]
df_out = pd.DataFrame(aligned_pairs, columns=output_columns)
df_out.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"✅ Saved {len(df_out)} aligned sentence pairs to: {OUTPUT_CSV_PATH}")

In [None]:
# CLIP-STYLE MODEL TRAINING USING ALIGNED PUBMED-MIMIC SENTENCES

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import random

# -----------------------------
# Hyperparameters
# -----------------------------
# Single place for every tunable value used by this training cell.
HYPERPARAMS = {
    # Hugging Face model ids for the two encoders.
    "bio_model_name": "michiyasunaga/BioLinkBERT-base",
    "clinical_model_name": "emilyalsentzer/Bio_ClinicalBERT",
    # Output size of both projection heads.
    "proj_dim": 256,
    # DataLoader batch size.
    "batch_size": 8,
    # AdamW learning rate.
    "learning_rate": 2e-5,
    # Number of passes over the data.
    "epochs": 5,
    # Tokenizer truncation length.
    "max_length": 128,
    # Divisor applied to similarity scores in the contrastive loss.
    "temperature": 0.07,
    # Train on the GPU when one is available, otherwise on the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Seed shared by the torch and random RNGs.
    "seed": 42,
    "csv_path": "aligned_pubmed_mimic.csv"  # generated from the alignment script
}

# Set seeds for reproducibility
torch.manual_seed(HYPERPARAMS["seed"])
random.seed(HYPERPARAMS["seed"])

# -----------------------------
# Dataset: Load aligned sentence pairs
# -----------------------------
class BioClinicalDataset(Dataset):
    """Dataset of (pubmed_sentence, mimic_sentence) string pairs read from a CSV.

    Args:
        csv_path: CSV file with ``pubmed_sentence`` and ``mimic_sentence``
            columns. Rows missing either sentence are skipped.

    Raises:
        KeyError: If either sentence column is absent from the CSV.
    """

    # Columns that make up one training pair, in (bio, clinical) order.
    PAIR_COLUMNS = ("pubmed_sentence", "mimic_sentence")

    def __init__(self, csv_path):
        df = pd.read_csv(csv_path)
        # Only the two sentence columns decide whether a row is usable; a
        # missing value in some unrelated column must not discard a valid pair.
        df = df.dropna(subset=list(self.PAIR_COLUMNS))
        self.pairs = list(zip(df["pubmed_sentence"].astype(str), df["mimic_sentence"].astype(str)))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]

# -----------------------------
# Model: Dual Encoder with Projection Heads
# -----------------------------
class CLIPStyleModel(nn.Module):
    """CLIP-style dual encoder for biomedical and clinical sentences.

    Each kind of text has its own pretrained tokenizer/encoder pair and its
    own projection head. ``forward`` returns L2-normalized ``proj_dim``-sized
    embeddings for both inputs.

    Args:
        bio_model_name: Hugging Face model id for the biomedical encoder.
        clinical_model_name: Hugging Face model id for the clinical encoder.
        proj_dim: Output size of both projection heads.
        max_length: Maximum number of tokens kept per text when tokenizing.
    """

    def __init__(self, bio_model_name, clinical_model_name, proj_dim=256, max_length=128):
        super().__init__()
        self.bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_name)
        self.clinical_tokenizer = AutoTokenizer.from_pretrained(clinical_model_name)
        self.bio_encoder = AutoModel.from_pretrained(bio_model_name, use_safetensors=True)
        self.clinical_encoder = AutoModel.from_pretrained(clinical_model_name, use_safetensors=True)

        self.max_length = max_length

        # Each head is sized from its OWN encoder so that two encoders with
        # different hidden sizes can still be paired.
        self.bio_proj = self._projection_head(self.bio_encoder.config.hidden_size, proj_dim)
        self.clinical_proj = self._projection_head(self.clinical_encoder.config.hidden_size, proj_dim)

    @staticmethod
    def _projection_head(in_dim, proj_dim):
        """Build the two-layer MLP (Linear -> ReLU -> Linear) used as a projection head."""
        return nn.Sequential(
            nn.Linear(in_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )

    def encode(self, texts, tokenizer, encoder):
        """Tokenize ``texts`` and return the encoder's position-0 (CLS) hidden state."""
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        # Move the tokenized tensors onto the encoder's device before the forward pass.
        inputs = {k: v.to(encoder.device) for k, v in inputs.items()}
        outputs = encoder(**inputs)
        return outputs.last_hidden_state[:, 0, :]  # CLS token

    def forward(self, bio_texts, clinical_texts):
        """Return ``(bio_emb, clinical_emb)``: projected, L2-normalized embeddings."""
        bio_cls = self.encode(bio_texts, self.bio_tokenizer, self.bio_encoder)
        clinical_cls = self.encode(clinical_texts, self.clinical_tokenizer, self.clinical_encoder)

        bio_emb = F.normalize(self.bio_proj(bio_cls), dim=1)
        clinical_emb = F.normalize(self.clinical_proj(clinical_cls), dim=1)

        return bio_emb, clinical_emb

# -----------------------------
# Contrastive Loss (InfoNCE)
# -----------------------------
def clip_contrastive_loss(bio_emb, clinical_emb, temperature=0.07):
    """Symmetric contrastive loss over row-aligned embedding batches.

    Row ``i`` of ``bio_emb`` and row ``i`` of ``clinical_emb`` form the
    positive pair; cross-entropy is taken over the rows and over the columns
    of the temperature-scaled similarity matrix, and the two are averaged.

    Returns:
        Scalar loss tensor.
    """
    pair_count = len(bio_emb)
    positives = torch.arange(pair_count).to(bio_emb.device)
    scaled_sim = torch.matmul(bio_emb, clinical_emb.T) / temperature
    row_loss = F.cross_entropy(scaled_sim, positives)
    col_loss = F.cross_entropy(scaled_sim.T, positives)
    return (row_loss + col_loss) / 2

# -----------------------------
# Training Loop
# -----------------------------
def train(model, dataloader, optimizer, device, epochs=5):
    """Run contrastive training and print the mean batch loss after every epoch.

    The loss temperature is read from the module-level ``HYPERPARAMS``.

    Args:
        model: Callable as ``model(bio_texts, clinical_texts)`` and returning
            ``(bio_emb, clinical_emb)``.
        dataloader: Iterable whose batches unpack into ``(bio_texts, clinical_texts)``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model is moved to before training.
        epochs: Number of passes over ``dataloader``.
    """
    model.to(device)
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for bio_texts, clinical_texts in dataloader:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            embeddings = model(bio_texts, clinical_texts)
            loss = clip_contrastive_loss(*embeddings, temperature=HYPERPARAMS["temperature"])
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean over the number of batches in the loader.
        avg_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs} — Loss: {avg_loss:.4f}")

# -----------------------------
# Run Training
# -----------------------------
# Script entry point: train on the aligned pairs CSV using HYPERPARAMS.
if __name__ == "__main__":
    # Load the aligned sentence pairs and batch them in shuffled order.
    dataset = BioClinicalDataset(HYPERPARAMS["csv_path"])
    dataloader = DataLoader(dataset, batch_size=HYPERPARAMS["batch_size"], shuffle=True)

    # Dual encoder configured from HYPERPARAMS.
    model = CLIPStyleModel(
        bio_model_name=HYPERPARAMS["bio_model_name"],
        clinical_model_name=HYPERPARAMS["clinical_model_name"],
        proj_dim=HYPERPARAMS["proj_dim"],
        max_length=HYPERPARAMS["max_length"]
    )

    # AdamW over every model parameter (both encoders and both projection heads).
    optimizer = torch.optim.AdamW(model.parameters(), lr=HYPERPARAMS["learning_rate"])

    train(model, dataloader, optimizer, HYPERPARAMS["device"], epochs=HYPERPARAMS["epochs"])

In [None]:
#now compute similarity
import torch
import torch.nn.functional as F

# -----------------------------
# Example Inputs
# -----------------------------
# Biomedical-style example sentences; index i pairs with clinical_sentences[i]
# (same drug in both lists).
bio_sentences = [
    "Metformin reduces glucose production in the liver.",
    "Aspirin is used for preventing blood clots."
]

# Clinical-style example sentences, in the same drug order as bio_sentences.
clinical_sentences = [
    "The patient was started on metformin for diabetes management.",
    "He is taking aspirin after his heart attack."
]

# -----------------------------
# Encode Sentences
# -----------------------------
# Switch to evaluation mode and embed both lists without tracking gradients.
model.eval()
with torch.no_grad():
    embeddings = model(bio_sentences, clinical_sentences)
bio_emb, clinical_emb = embeddings

# -----------------------------
# Compute Cosine Similarity Matrix
# -----------------------------
# Broadcasting a column of bio embeddings against a row of clinical embeddings
# yields every bio/clinical combination in one call.
bio_column = bio_emb.unsqueeze(1)
clinical_row = clinical_emb.unsqueeze(0)
similarity_matrix = F.cosine_similarity(bio_column, clinical_row, dim=2)

# Row i of similarity_matrix holds the scores of bio_sentences[i] against each clinical sentence.
print("Cosine Similarity Matrix:")
for bio, similarity_row in zip(bio_sentences, similarity_matrix):
    for clin, entry in zip(clinical_sentences, similarity_row):
        score = entry.item()
        print(f"🧬 Bio: {bio}\n🏥 Clin: {clin}\n🔗 Similarity: {score:.4f}\n")