In [1]:
# Test run: deduplicate the bio_med_research dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# XML/JSON parsing and progress-bar utilities
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm

In [2]:
# Colab only: mount Google Drive and switch to the project folder.
# The import is guarded so that running the whole notebook outside Colab
# (where the google.colab package does not exist) skips this step instead of
# stopping with an ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/bionlp')

Mounted at /content/drive


In [3]:
# go to model dir
# The path is relative, so a second execution of this cell would look for
# MedImageInsights/MedImageInsights and raise FileNotFoundError. Only change
# directory when we are not already inside the model folder.
if os.path.basename(os.getcwd()) != 'MedImageInsights':
    os.chdir('MedImageInsights')

In [4]:
# install necessary package
!pip install mup
!pip install fvcore

Collecting mup
  Downloading mup-1.0.0.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mup
  Building wheel for mup (setup.py) ... [?25l[?25hdone
  Created wheel for mup: filename=mup-1.0.0-py3-none-any.whl size=23629 sha256=e9ffdccbd647c5fe3ee5c20ff8681d8a06fa6b9608ebaae5027348995a55de17
  Stored in directory: /root/.cache/pip/wheels/f4/c8/88/3c23a3d10c50053b6552d2d30aee5b53ba89a47f742420036c
Successfully built mup
Installing collected packages: mup
Successfully installed mup-1.0.0
Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━

In [5]:
# load model
# NOTE: this import relies on the working directory having been changed to
# MedImageInsights by an earlier cell — presumably that folder contains
# medimageinsightmodel.py; confirm if the import fails.
from medimageinsightmodel import MedImageInsight

# NOTE(review): the spelling "medimageinsigt" in the vision checkpoint name
# looks like a typo, but the recorded output of this cell reports a successful
# load, so it presumably matches the file on disk — do not "correct" it
# without renaming the file as well.
classifier = MedImageInsight(
    model_dir="2024.09.27",
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)

# `classifier` is used as a notebook-level global by
# calculate_and_save_embeddings (via classifier.encode), so this cell must
# run before any embeddings are generated.
classifier.load_model()



Model loaded successfully on device: cuda


## Calculate Existing Embeddings

In [6]:
import os
import numpy as np
import pickle  # To save/load embeddings efficiently

def calculate_and_save_embeddings(dataset, dataset_name, column_names, save_dir="embeddings_cache", batch_size=128):
    """
    Compute text embeddings for one column of a dataset and cache them on disk.

    On the first call the values of ``dataset[column_names]`` are encoded in
    batches with the notebook-level ``classifier`` and the result is pickled to
    ``<save_dir>/<dataset_name>_embeddings.pkl``. Later calls with the same
    ``dataset_name`` and ``save_dir`` load that file instead of re-encoding.

    The cache file is keyed by ``dataset_name`` only: embedding two different
    columns of the same dataset requires two different ``dataset_name`` values,
    otherwise the second call returns the first column's embeddings.

    Args:
        dataset (pd.DataFrame): Dataset holding the text column to embed.
        dataset_name (str): Name of the dataset for unique file identification.
        column_names (str): Label of the column to embed. ``.tolist()`` is
            called on ``dataset[column_names]``, so the selection must yield a
            single column, not a multi-column frame.
        save_dir (str): Directory where embeddings will be saved.
        batch_size (int): Batch size for generating embeddings.

    Returns:
        np.ndarray: The embeddings, one entry per text, either freshly
        computed or as loaded from the cache file.
    """
    # Ensure save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # File path for the cached embeddings
    embedding_file = os.path.join(save_dir, f"{dataset_name}_embeddings.pkl")

    # Check if embeddings already exist
    if os.path.exists(embedding_file):
        print(f"Loading cached embeddings for {dataset_name}...")
        # NOTE: pickle.load can execute arbitrary code; only point save_dir at
        # cache files written by this function.
        with open(embedding_file, "rb") as qf:
            embeddings = pickle.load(qf)

        # A cache produced from a different version of the dataset would
        # silently misalign embeddings and rows, so make that visible.
        try:
            rows_match = len(embeddings) == len(dataset)
        except TypeError:
            # One side has no length (e.g. dataset is None): nothing to compare.
            rows_match = True
        if not rows_match:
            print(
                f"WARNING: cached embeddings for {dataset_name} have {len(embeddings)} rows "
                f"but the dataset has {len(dataset)}; delete {embedding_file} to regenerate."
            )
    else:
        # Compute embeddings batch by batch
        print(f"Generating embeddings for {dataset_name}...")
        texts = dataset[column_names].tolist()
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Question Embeddings"):
            batch_texts = texts[i:i + batch_size]
            embeddings.extend(classifier.encode(texts=batch_texts)["text_embeddings"])
        embeddings = np.array(embeddings)

        # Write to a temporary file first and rename it into place, so an
        # interrupted run never leaves a truncated pickle that later calls
        # would try (and fail) to load as a valid cache.
        tmp_file = embedding_file + ".tmp"
        try:
            with open(tmp_file, "wb") as qf:
                pickle.dump(embeddings, qf)
            os.replace(tmp_file, embedding_file)
        finally:
            # Only present here if the dump or the rename failed.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        print(f"Saved embeddings for {dataset_name}.")

    return embeddings

In [None]:
# load column names
# column information
# The path is relative to the current working directory, which an earlier cell
# set to MedImageInsights, so col.csv is expected one level above that folder.
# NOTE(review): the contents/schema of col.csv are not visible here — confirm
# which columns downstream cells rely on.
col_info = pd.read_csv("../col.csv", quotechar='"')