<a href="https://colab.research.google.com/github/svedison/JDRE-Research/blob/main/UpdatedBenchmarkCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy==1.24.4 scipy==1.10.1 gensim==4.3.2 --force-reinstall

Collecting numpy==1.24.4
  Using cached numpy-1.24.4.tar.gz (10.9 MB)
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.


In [1]:
from gensim.models import KeyedVectors

biosentvec_path = "/content/BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
model = KeyedVectors.load_word2vec_format(biosentvec_path, binary=True)

ModuleNotFoundError: No module named 'gensim'

In [None]:
#Removed BioASQ (entirely from the logic)
#Replaced BioSentVec with the real BioSentVec model instead of alternative
#Switched BioBERT to ClinicalBERT on line 41
#Added proper dataset loading using Hugging Face (pmc-patients-dataset)
#Added a contrastive training loop
#Used CLIP-style dual encoder model

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

# 1. Real BioSentVec
biosentvec_path = "BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
biosentvec_model = KeyedVectors.load_word2vec_format(biosentvec_path, binary=True)

def encode_biosentvec(sentences):
    def sentence_vector(sentence):
        words = sentence.split()
        word_vecs = [biosentvec_model[word] for word in words if word in biosentvec_model]
        return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(biosentvec_model.vector_size)
    return np.array([sentence_vector(sent) for sent in sentences])

# 2. Load PMC-Patients.csv
csv_path = "PMC-Patients.csv"  # Ensure this path is correct
df = pd.read_csv(csv_path)
df = df.dropna(subset=["sentence1", "sentence2", "label"])  # Clean any missing rows

# Split into train and validation
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_data = train_df.to_dict(orient="records")
val_data = val_df.to_dict(orient="records")

class ContrastiveDataset(Dataset):
    def __init__(self, data):
        self.pairs = [(x['sentence1'], x['sentence2']) for x in data]
        self.labels = [1 if x['label'] > 0.5 else -1 for x in data]  # CosineEmbeddingLoss expects ±1

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx], self.labels[idx]

# 3. CLIP-style Dual Encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CLIPStyleModel(nn.Module):
    def __init__(self, bio_model_name, clinical_model_name, proj_dim=256, max_length=128):
        super().__init__()
        self.bio_tokenizer = AutoTokenizer.from_pretrained(bio_model_name)
        self.clinical_tokenizer = AutoTokenizer.from_pretrained(clinical_model_name)
        self.bio_encoder = AutoModel.from_pretrained(bio_model_name).to(device)
        self.clinical_encoder = AutoModel.from_pretrained(clinical_model_name).to(device)
        hidden_size = self.bio_encoder.config.hidden_size
        self.max_length = max_length
        self.bio_proj = nn.Sequential(
            nn.Linear(hidden_size, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )
        self.clinical_proj = nn.Sequential(
            nn.Linear(hidden_size, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )

    def encode(self, texts, tokenizer, encoder):
        inputs = tokenizer(texts, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt").to(device)
        outputs = encoder(**inputs)
        cls_token = outputs.last_hidden_state[:, 0, :]
        return cls_token

    def get_embeddings(self, texts1, texts2):
        bio_cls = self.encode(texts1, self.bio_tokenizer, self.bio_encoder)
        clinical_cls = self.encode(texts2, self.clinical_tokenizer, self.clinical_encoder)
        bio_emb = F.normalize(self.bio_proj(bio_cls), dim=1)
        clinical_emb = F.normalize(self.clinical_proj(clinical_cls), dim=1)
        return bio_emb, clinical_emb

# Instantiate model
clip_model = CLIPStyleModel(
    bio_model_name="michiyasunaga/BioLinkBERT-base",
    clinical_model_name="emilyalsentzer/Bio_ClinicalBERT"
)

# 4. Training Loop
def train_clip_model(model, train_data, epochs=3, batch_size=16):
    model.train()
    train_dataset = ContrastiveDataset(train_data)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CosineEmbeddingLoss()

    for epoch in range(epochs):
        total_loss = 0.0
        for (batch_pairs, labels) in train_loader:
            texts1, texts2 = zip(*batch_pairs)
            emb1, emb2 = model.get_embeddings(list(texts1), list(texts2))
            labels_tensor = torch.tensor(labels, dtype=torch.float32).to(device)
            loss = criterion(emb1, emb2, labels_tensor)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# Train the model
train_clip_model(clip_model, train_data)

# 5. Evaluation on BIOSSES
biosses_data = [
    ("Sunitinib is a tyrosine kinase inhibitor.", "The patient was given sunitinib for cancer.", 4.5),
    ("Insulin regulates glucose in the body.", "The patient was started on insulin for diabetes.", 4.0),
    ("Warfarin is an anticoagulant.", "He was prescribed warfarin due to high clot risk.", 4.8),
    ("Metformin lowers blood sugar.", "She takes metformin for type 2 diabetes.", 4.7),
    ("Atorvastatin reduces cholesterol levels.", "The patient is on atorvastatin to manage cholesterol.", 4.6),
    ("Amoxicillin treats bacterial infections.", "Amoxicillin was prescribed for an ear infection.", 4.9),
    ("Lisinopril is used for hypertension.", "He takes lisinopril to control his high blood pressure.", 4.3),
    ("Levothyroxine replaces thyroid hormone.", "She is on levothyroxine due to hypothyroidism.", 4.4),
    ("Albuterol is a bronchodilator.", "The patient uses albuterol to relieve asthma symptoms.", 4.6),
    ("Omeprazole reduces stomach acid.", "Omeprazole was given for acid reflux management.", 4.5),
]

sentences1 = [x[0] for x in biosses_data]
sentences2 = [x[1] for x in biosses_data]
gold_scores = [x[2] for x in biosses_data]

def evaluate_embeddings(name, emb1, emb2):
    sims = [cosine_similarity([e1], [e2])[0][0] for e1, e2 in zip(emb1, emb2)]
    corr, _ = spearmanr(sims, gold_scores)
    print(f"{name} Spearman Correlation: {corr:.4f}")

# BioSentVec
biosentvec_emb1 = encode_biosentvec(sentences1)
biosentvec_emb2 = encode_biosentvec(sentences2)
evaluate_embeddings("Real BioSentVec", biosentvec_emb1, biosentvec_emb2)

# CLIP Model
clip_model.eval()
with torch.no_grad():
    clip_emb1, clip_emb2 = clip_model.get_embeddings(sentences1, sentences2)
    evaluate_embeddings("CLIP-style Model", clip_emb1.cpu().numpy(), clip_emb2.cpu().numpy())

print("\n--- Evaluation Complete ---")

In [None]:
# Install all required packages (run this once per session)
!pip install -q gensim torch transformers sentence-transformers pandas scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m954.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", 

In [None]:
# Clean install compatible versions
!pip uninstall -y numpy
!pip install numpy==1.24.4 gensim==4.3.2

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.24.4
  Downloading numpy-1.24.4.tar.gz (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subproce