In [1]:
# One run of test to deduplicate the bio_med_research dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Initialize classifier
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm

In [2]:
# if use colab, run this part
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/bionlp')

Mounted at /content/drive


In [3]:
# go to model dir
os.chdir('MedImageInsights')

In [4]:
# install necessary package
!pip install mup
!pip install fvcore

Collecting mup
  Downloading mup-1.0.0.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mup
  Building wheel for mup (setup.py) ... [?25l[?25hdone
  Created wheel for mup: filename=mup-1.0.0-py3-none-any.whl size=23629 sha256=7d8d2601a791e1855d77bbc0237750f1f07fb72b15a96b25233409a8d954bdf4
  Stored in directory: /root/.cache/pip/wheels/f4/c8/88/3c23a3d10c50053b6552d2d30aee5b53ba89a47f742420036c
Successfully built mup
Installing collected packages: mup
Successfully installed mup-1.0.0
Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━

In [5]:
# load model
from medimageinsightmodel import MedImageInsight

classifier = MedImageInsight(
    model_dir="2024.09.27",
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)

classifier.load_model()



Model loaded successfully on device: cuda


## Caluclate Existing Embeddings

In [6]:
import os
import numpy as np
import pickle  # To save/load embeddings efficiently

def calculate_and_save_embeddings(dataset, dataset_name, save_dir="embeddings_cache", batch_size=128):
    """
    Compute and save embeddings for a QA dataset.

    Args:
        dataset (pd.DataFrame): Dataset containing "question" and "answer" columns.
        dataset_name (str): Name of the dataset for unique file identification.
        save_dir (str): Directory where embeddings will be saved.
        batch_size (int): Batch size for generating embeddings.

    Returns:
        dict: A dictionary containing question and answer embeddings.
    """
    # Ensure save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # File paths for embeddings
    question_embedding_file = os.path.join(save_dir, f"{dataset_name}_question_embeddings.pkl")
    answer_embedding_file = os.path.join(save_dir, f"{dataset_name}_answer_embeddings.pkl")

    # Check if embeddings already exist
    if os.path.exists(question_embedding_file) and os.path.exists(answer_embedding_file):
        print(f"Loading cached embeddings for {dataset_name}...")
        with open(question_embedding_file, "rb") as qf:
            question_embeddings = pickle.load(qf)
        with open(answer_embedding_file, "rb") as af:
            answer_embeddings = pickle.load(af)
    else:
        # Compute embeddings for questions
        print(f"Generating question embeddings for {dataset_name}...")
        questions = dataset["question"].tolist()
        question_embeddings = []
        for i in tqdm(range(0, len(questions), batch_size), desc="Question Embeddings"):
            batch_questions = questions[i:i + batch_size]
            question_embeddings.extend(classifier.encode(texts=batch_questions)["text_embeddings"])
        question_embeddings = np.array(question_embeddings)

        # Save question embeddings
        with open(question_embedding_file, "wb") as qf:
            pickle.dump(question_embeddings, qf)
        print(f"Saved question embeddings for {dataset_name}.")

        # Compute embeddings for answers
        print(f"Generating answer embeddings for {dataset_name}...")
        answers = dataset["answer"].tolist()
        answer_embeddings = []
        for i in tqdm(range(0, len(answers), batch_size), desc="Answer Embeddings"):
            batch_answers = answers[i:i + batch_size]
            answer_embeddings.extend(classifier.encode(texts=batch_answers)["text_embeddings"])
        answer_embeddings = np.array(answer_embeddings)

        # Save answer embeddings
        with open(answer_embedding_file, "wb") as af:
            pickle.dump(answer_embeddings, af)
        print(f"Saved answer embeddings for {dataset_name}.")

    return {"questions": question_embeddings, "answers": answer_embeddings}


In [None]:
deduplicated_medmcqa_test = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_test_fulltext_deduplicated.csv")

In [None]:
calculate_and_save_embeddings(deduplicated_medmcqa_test, "medmcqa_test", save_dir="../deduplicated_embeddings/QAs", batch_size=128)

Generating question embeddings for medmcqa_test...


Question Embeddings: 100%|██████████| 43/43 [00:30<00:00,  1.43it/s]


Saved question embeddings for medmcqa_test.
Generating answer embeddings for medmcqa_test...


Answer Embeddings: 100%|██████████| 43/43 [00:29<00:00,  1.44it/s]

Saved answer embeddings for medmcqa_test.





{'questions': array([[ 0.00076395, -0.02654904, -0.03134664, ...,  0.02570958,
         -0.00195837, -0.00630343],
        [ 0.00524337,  0.033416  ,  0.00307591, ...,  0.04220593,
         -0.01449368,  0.0308999 ],
        [-0.00434002, -0.02685545,  0.01362809, ...,  0.05724397,
         -0.00692272, -0.0030829 ],
        ...,
        [-0.02461233,  0.01374935, -0.00677833, ..., -0.00967417,
         -0.01530647, -0.00684253],
        [-0.00531393,  0.01521857, -0.0279128 , ...,  0.02822973,
          0.00548543, -0.00014811],
        [ 0.01891935, -0.02109549, -0.02800494, ...,  0.01564752,
         -0.00385373, -0.00310911]], dtype=float32),
 'answers': array([[-0.02031539, -0.00073444, -0.02261897, ..., -0.00154823,
         -0.02488664,  0.0226727 ],
        [-0.0212907 ,  0.00392494, -0.03102093, ...,  0.00871337,
         -0.03402397,  0.03308565],
        [ 0.0137716 , -0.03382061, -0.02171203, ..., -0.01962037,
         -0.01894022,  0.00477085],
        ...,
        [-0.021

In [None]:
deduplicated_medmcqa_dev = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_dev_fulltext_deduplicated.csv")

In [None]:
calculate_and_save_embeddings(deduplicated_medmcqa_dev, "medmcqa_dev", save_dir="../deduplicated_embeddings/QAs", batch_size=128)

Generating question embeddings for medmcqa_dev...


Question Embeddings: 100%|██████████| 31/31 [00:21<00:00,  1.43it/s]


Saved question embeddings for medmcqa_dev.
Generating answer embeddings for medmcqa_dev...


Answer Embeddings: 100%|██████████| 31/31 [00:25<00:00,  1.24it/s]

Saved answer embeddings for medmcqa_dev.





{'questions': array([[-2.56071370e-02,  1.91394035e-02,  1.27475783e-02, ...,
          6.34452626e-02, -1.70708690e-02,  2.03857757e-02],
        [ 5.93451085e-03,  5.27069345e-02, -1.12807238e-02, ...,
         -1.53421902e-03, -4.03792597e-03,  5.80840185e-03],
        [-2.25096289e-02, -2.07897816e-02,  9.05894209e-03, ...,
          1.06330588e-02, -2.68628467e-02,  1.02449032e-02],
        ...,
        [ 1.20566385e-02,  4.27294001e-02, -3.73669937e-02, ...,
          5.58599308e-02, -4.34714146e-02, -1.61970966e-02],
        [ 3.84220891e-02,  1.79261365e-03, -3.20613049e-02, ...,
         -9.69613343e-03, -1.58840474e-02,  1.38213523e-02],
        [-3.43796909e-02, -3.55549928e-05, -1.28169907e-02, ...,
          2.41738465e-02,  6.28327113e-03,  3.56595479e-02]], dtype=float32),
 'answers': array([[-0.00665213,  0.00178398,  0.01581716, ...,  0.02604667,
         -0.04338642,  0.01888935],
        [-0.02049993,  0.01377321,  0.01967613, ...,  0.04060692,
          0.00947692, 

In [None]:
deduplicated_medmcqa_train = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_train_fulltext_deduplicated.csv")

In [None]:
calculate_and_save_embeddings(deduplicated_medmcqa_train, "medmcqa_train", save_dir="../deduplicated_embeddings/QAs", batch_size=128)