In [4]:
# One run of test to deduplicate the bio_med_research dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import pickle

In [None]:
# Colab only: mount Google Drive and switch to the project folder.
# On any other kernel `google.colab` cannot be imported, so this cell becomes
# a no-op instead of raising and breaking "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/bionlp')

In [5]:
# go to model dir
# The path is relative, so only change directory when we are not already
# inside it; re-running the cell would otherwise raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'MedImageInsights':
    os.chdir('MedImageInsights')

In [6]:
# set directory to deduplicate
# The path is relative to the current working directory (changed by the
# os.chdir cell earlier). Later cells index the loaded dicts with literal
# paths that begin with this exact string, so keep the two in sync.
directory = "../dataset/bio_med_research"

In [4]:
# install necessary package
!pip install mup
!pip install fvcore



In [7]:
# load model
# NOTE(review): `medimageinsightmodel` is imported by bare name, so this
# presumably relies on the working directory set by the earlier os.chdir
# cell — confirm.
from medimageinsightmodel import MedImageInsight

# The three values are passed to MedImageInsight exactly as written; keep the
# file names spelled as they are here unless the files on disk are renamed.
classifier = MedImageInsight(
    model_dir="2024.09.27",
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)

# `classifier` is the global that get_embeddings() calls `.encode` on.
classifier.load_model()



Model loaded successfully on device: cpu


In [22]:
# loading dataset
def parse_xml(file):
    """Collect the `sentence` elements directly under the root of an XML file.

    Args:
        file: Path or file object accepted by `xml.etree.ElementTree.parse`.

    Returns:
        A DataFrame with one row per sentence and the columns `sentence_id`
        and `sentence_text`, read from the `id` and `text` attributes.
        A document without sentences yields an empty frame.
    """
    document_root = ET.parse(file).getroot()
    # findall('sentence') only matches direct children of the root element.
    records = [
        {"sentence_id": node.get('id'), "sentence_text": node.get('text')}
        for node in document_root.findall('sentence')
    ]
    return pd.DataFrame(records)


def _read_jsonl(path):
    """Parse one JSON document per non-blank line of `path` into a DataFrame."""
    with open(path, "r") as handle:
        # Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
        return pd.DataFrame([json.loads(line) for line in handle if line.strip()])


def _read_json(path):
    """Load a whole JSON file and wrap its content in a DataFrame."""
    with open(path, "r") as handle:
        return pd.DataFrame(json.load(handle))


def _read_txt(path):
    """Return the lines of a text file, line endings included."""
    with open(path, "r") as handle:
        return handle.readlines()


def load_dataset(path, filetype = "csv"):
    """Load every file with the given extension found under `path`.

    Args:
        path: Directory that is walked recursively with `os.walk`.
        filetype: One of "csv", "xml", "jsonl", "json" or "txt". Files are
            selected by the matching ".<filetype>" suffix.

    Returns:
        dict mapping each matching file path (as produced by
        `os.path.join(root, file)`) to its content: a DataFrame for csv, xml,
        jsonl and json files, and a list of lines for txt files.

    Raises:
        ValueError: If `filetype` is not one of the supported values
            (previously the function silently returned None).
    """
    readers = {
        "csv": pd.read_csv,
        "xml": parse_xml,
        "jsonl": _read_jsonl,
        "json": _read_json,
        "txt": _read_txt,
    }
    if filetype not in readers:
        raise ValueError(
            f"Unsupported filetype {filetype!r}; expected one of {sorted(readers)}"
        )

    extension = "." + filetype
    all_files = []
    for root, dirs, files in tqdm(os.walk(path), desc = f"Loading {filetype.upper()} files"):
        for file in tqdm(files, desc = "Processing file"):
            if file.endswith(extension):
                all_files.append(os.path.join(root, file))

    # Diagnostic output kept from the original per-type branches.
    if filetype == "txt":
        print(all_files)

    read_one = readers[filetype]
    ds = {}
    for f in all_files:
        if filetype == "jsonl":
            print("current file: ", f)
        ds[f] = read_one(f)
    return ds



In [24]:
# Load every .txt file under the bc5cdr folder; keys are the file paths.
bc5cdr = load_dataset(f"{directory}/bc5cdr", filetype="txt")


Processing file: 100%|██████████| 6/6 [00:00<00:00, 35494.82it/s]
Loading TXT files: 1it [00:00, 482.38it/s]

['../dataset/bio_med_research/bc5cdr/val_bc5cdr.txt', '../dataset/bio_med_research/bc5cdr/test_bc5cdr.txt', '../dataset/bio_med_research/bc5cdr/train_bc5cdr.txt']





In [41]:
# Raw line lists of each bc5cdr split, keyed by the path load_dataset() found them at.
bc5cdr_train = bc5cdr['../dataset/bio_med_research/bc5cdr/train_bc5cdr.txt']
bc5cdr_val = bc5cdr['../dataset/bio_med_research/bc5cdr/val_bc5cdr.txt']
bc5cdr_test = bc5cdr['../dataset/bio_med_research/bc5cdr/test_bc5cdr.txt']

# One row per line, in a single "text" column. Each frame is built from its
# own list: the previous shared loop was sized by the train split, so a
# shorter val/test split raised IndexError and a longer one was truncated.
bc5cdr_train_df = pd.DataFrame({"text": bc5cdr_train}, dtype=object)
bc5cdr_val_df = pd.DataFrame({"text": bc5cdr_val}, dtype=object)
bc5cdr_test_df = pd.DataFrame({"text": bc5cdr_test}, dtype=object)



In [47]:
# Table mapping each dataset name to the columns used for deduplication.
# '"' is already read_csv's default quote character, so it is not passed.
col_info = pd.read_csv("../col.csv")

In [48]:
# Show the dataset_name / column_name table loaded from ../col.csv.
col_info

Unnamed: 0,dataset_name,column_name
0,bc5cdr,text
1,BioNLI,"supp_set, conclusion, label_cat, ori_conclusion"
2,CORD19,"title, abstract, full_text"
3,DDCICorpus,sentence
4,hoc,"text, label"
5,pubmed,"MedlineCitation, PubmedData"
6,SourceData,"words,labels,tag_mask,text"
7,trec_covid,"title, text, metadata"


In [None]:
# deduplicate the dataset
def get_embeddings(texts, batch_size = 64):
    """Embed `texts` with the global `classifier`, `batch_size` items at a time.

    Each batch is sent to `classifier.encode(texts=...)` and the entries of its
    'text_embeddings' result are collected in input order.

    Returns:
        A numpy array built from all collected embeddings (empty for no texts).
    """
    batch_starts = range(0, len(texts), batch_size)
    collected = [
        vector
        for start in tqdm(batch_starts, desc = "Generating embeddings")
        for vector in classifier.encode(texts = texts[start:start+batch_size])['text_embeddings']
    ]
    return np.array(collected)

def compute_similarity_chunked(embeddings, threshold=0.9, chunk_size=8000):
    """
    Find near-duplicate rows inside one embedding matrix, chunk by chunk.

    Rows are visited in order. A row already marked for removal is skipped;
    otherwise every later row whose cosine similarity with it is above
    `threshold` is marked. Returns the set of marked row positions.
    """
    flagged = set()
    total = len(embeddings)
    for chunk_start in tqdm(range(0, total, chunk_size), desc= "Calcuating Similarity"):
        # Similarity of the rows in this chunk against every row.
        scores = cosine_similarity(embeddings[chunk_start:chunk_start + chunk_size], embeddings)

        for offset in range(len(scores)):
            position = chunk_start + offset  # position in the full matrix
            if position in flagged:
                continue

            above = np.where(scores[offset] > threshold)[0]
            # Only later rows are marked, so the first occurrence is kept.
            flagged.update(above[above > position])

    return flagged

def compute_similarity_between_datasets_chunked(embeddings1, embeddings2, threshold=0.9, chunk_size1=8000, chunk_size2=8000):
    """
    Find rows of `embeddings1` that are too similar to any row of `embeddings2`.

    Both inputs are processed in tiles of `chunk_size1` x `chunk_size2` rows.
    Returns the set of row positions in `embeddings1` whose cosine similarity
    with at least one row of `embeddings2` is above `threshold`.
    """
    size1 = len(embeddings1)
    size2 = len(embeddings2)
    flagged = set()

    for start1 in tqdm(range(0, size1, chunk_size1), desc="Processing dataset1 in chunks"):
        rows = embeddings1[start1:start1 + chunk_size1]

        for start2 in range(0, size2, chunk_size2):
            scores = cosine_similarity(rows, embeddings2[start2:start2 + chunk_size2])

            # A row counts as a duplicate as soon as any score in this tile
            # exceeds the threshold; offsets are mapped back to full positions.
            hit_offsets = np.where((scores > threshold).any(axis=1))[0]
            flagged.update((start1 + hit_offsets).tolist())

    return flagged

def _resolve_text_columns(dataset, columns):
    """Turn `columns` into a list of column names that exist in `dataset`.

    Accepts a single name, a list of names, or entries that hold several
    comma-separated names in one string (the layout used in col.csv).
    """
    if isinstance(columns, str):
        columns = [columns]
    resolved = []
    for entry in columns:
        if entry in dataset.columns:
            resolved.append(entry)
        else:
            resolved.extend(part.strip() for part in str(entry).split(",") if part.strip())
    if not resolved:
        raise ValueError("No text columns were given for deduplication.")
    missing = [name for name in resolved if name not in dataset.columns]
    if missing:
        raise KeyError(f"Columns not found in dataset: {missing}")
    return resolved


def _row_texts(dataset, columns):
    """Join the selected columns of every row into one space-separated string."""
    selected = dataset[_resolve_text_columns(dataset, columns)]
    return list(selected.apply(lambda x: " ".join(x.values.astype(str)), axis=1))


def _drop_positions(dataset, positions):
    """Return a copy of `dataset` without the rows at the given 0-based positions."""
    keep = np.ones(len(dataset), dtype=bool)
    keep[np.array(sorted(positions), dtype=np.intp)] = False
    return dataset.iloc[keep]


def deduplicate_within_dataset(dataset, columns,threshold=0.9):
    """Remove rows whose text is a near-duplicate of an earlier row.

    Args:
        dataset: DataFrame to deduplicate.
        columns: Column name(s) whose values are joined with spaces to form
            the text of each row; comma-separated entries are split.
        threshold: Cosine-similarity value above which two rows are duplicates.

    Returns:
        (deduplicated DataFrame, number of rows removed).

    Raises:
        ValueError: If no column name is given.
        KeyError: If a requested column is not in `dataset`.
    """
    if len(dataset) == 0:
        # apply(axis=1) on an empty frame does not yield per-row texts.
        return dataset.copy(), 0
    embeddings = get_embeddings(_row_texts(dataset, columns))
    to_remove = compute_similarity_chunked(embeddings, threshold=threshold)
    # The similarity helpers return row *positions*; DataFrame.drop works on
    # index *labels*, which differ once the frame has lost rows, so rows are
    # removed by position instead.
    return _drop_positions(dataset, to_remove), len(to_remove)

def deduplicate_between_datasets(new_dataset, columns, old_embeddings, threshold=0.9):
    """Remove rows of `new_dataset` that are near-duplicates of older data.

    Args:
        new_dataset: DataFrame to filter.
        columns: Column name(s) joined into the text of each row (see
            `deduplicate_within_dataset`).
        old_embeddings: Iterable of embedding collections; they are
            concatenated into one list before comparison.
        threshold: Cosine-similarity value above which a row is a duplicate.

    Returns:
        (filtered DataFrame, number of rows removed).
    """
    if len(new_dataset) == 0:
        return new_dataset.copy(), 0
    embeddings1 = get_embeddings(_row_texts(new_dataset, columns))
    old_embeddings_list = []
    for embed in old_embeddings:
        old_embeddings_list.extend(embed)
    to_remove = compute_similarity_between_datasets_chunked(embeddings1, old_embeddings_list, threshold=threshold)
    # Positions, not labels: `new_dataset` usually has gaps in its index
    # because it already went through within-dataset deduplication.
    return _drop_positions(new_dataset, to_remove), len(to_remove)

In [None]:
# deduplicate within dataset
# The column list is looked up once and reused for all three splits.
bc5cdr_columns = col_info.loc[col_info["dataset_name"] == "bc5cdr", "column_name"].tolist()
deduplicated_bc5cdr_train, number_removed_train = deduplicate_within_dataset(bc5cdr_train_df, bc5cdr_columns)
deduplicated_bc5cdr_val, number_removed_val = deduplicate_within_dataset(bc5cdr_val_df, bc5cdr_columns)
deduplicated_bc5cdr_test, number_removed_test = deduplicate_within_dataset(bc5cdr_test_df, bc5cdr_columns)

In [None]:

# Report how many rows within-dataset deduplication removed from each split.
removal_report = (
    ("Number of removed samples in train: ", number_removed_train),
    ("Number of removed samples in val: ", number_removed_val),
    ("Number of removed samples in test: ", number_removed_test),
)
for label, removed in removal_report:
    print(label, removed)


In [None]:
# deduplicated_train will serve as the base ds and we add the other datasets to it
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bc5cdr", exist_ok=True)
deduplicated_bc5cdr_train.to_csv("../deduplicated_data/bio_med_research/bc5cdr/train_bc5cdr_deduplicated.csv", index=False)


In [None]:
# Embeddings of already-processed data; this list is what later cells pass
# as `old_embeddings` to deduplicate_between_datasets().
old_datas = []

# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open("../deduplicated_embeddings/bio_med_research/bc5cdr_train_embeddings.pkl", "rb") as embeddings_file:
    bc5cdr_train_embeddings = pickle.load(embeddings_file)
old_datas.append(bc5cdr_train_embeddings)

In [None]:
# deduplicate between existing dataset
# The second return value is the number of removed rows, not their indices.
full_deduplicate_bc5cdr_val, removed_idx_val = deduplicate_between_datasets(
    new_dataset=deduplicated_bc5cdr_val,
    columns=["text"],
    old_embeddings=old_datas,
)
print(removed_idx_val)

In [None]:
# save the deduplicated dataset
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bc5cdr", exist_ok=True)
full_deduplicate_bc5cdr_val.to_csv("../deduplicated_data/bio_med_research/bc5cdr/val_bc5cdr_deduplicated.csv", index=False)

In [None]:
# Add the val-split embeddings to the pool that later splits are compared against.
# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open("../deduplicated_embeddings/bio_med_research/bc5cdr_val_embeddings.pkl", "rb") as embeddings_file:
    bc5cdr_val_embeddings = pickle.load(embeddings_file)
old_datas.append(bc5cdr_val_embeddings)

In [None]:
# deduplicate between existing dataset
# The second return value is the number of removed rows, not their indices.
full_deduplicate_bc5cdr_test, removed_idx_test = deduplicate_between_datasets(
    new_dataset=deduplicated_bc5cdr_test,
    columns=["text"],
    old_embeddings=old_datas,
)
print(removed_idx_test)

In [None]:
# save the deduplicated dataset
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bc5cdr", exist_ok=True)
full_deduplicate_bc5cdr_test.to_csv("../deduplicated_data/bio_med_research/bc5cdr/test_bc5cdr_deduplicated.csv", index=False)

In [None]:
# deduplicate bionli
# Load every CSV under the BioNLI folder, then pick the three splits by path.
bionli = load_dataset(f"{directory}/BioNLI", filetype="csv")
bionli_train, bionli_dev, bionli_test = (
    bionli['../dataset/bio_med_research/BioNLI/train_balanced.csv'],
    bionli['../dataset/bio_med_research/BioNLI/dev_balanced.csv'],
    bionli['../dataset/bio_med_research/BioNLI/test.csv'],
)

In [None]:
# deduplicate within dataset
# col.csv keeps all BioNLI columns in one comma-separated cell
# ("supp_set, conclusion, ..."), so it is split into real column names here;
# passing the cell as-is would look up a single non-existent column.
bionli_columns = [
    name.strip()
    for cell in col_info.loc[col_info["dataset_name"] == "BioNLI", "column_name"]
    for name in cell.split(",")
    if name.strip()
]
deduplicated_bionli_train, number_removed_train = deduplicate_within_dataset(bionli_train, bionli_columns)
deduplicated_bionli_dev, number_removed_dev = deduplicate_within_dataset(bionli_dev, bionli_columns)
deduplicated_bionli_test, number_removed_test = deduplicate_within_dataset(bionli_test, bionli_columns)

print("Number of removed samples in train: ", number_removed_train)
print("Number of removed samples in dev: ", number_removed_dev)
print("Number of removed samples in test: ", number_removed_test)


In [None]:
# Rebuild the pool of earlier embeddings from the three bc5cdr splits.
# NOTE: pickle.load executes code embedded in the file; only load trusted files.
old_datas = []
with open("../deduplicated_embeddings/bio_med_research/bc5cdr_train_embeddings.pkl", "rb") as embeddings_file:
    bc5cdr_train_embeddings = pickle.load(embeddings_file)
old_datas.append(bc5cdr_train_embeddings)

with open("../deduplicated_embeddings/bio_med_research/bc5cdr_val_embeddings.pkl", "rb") as embeddings_file:
    bc5cdr_val_embeddings = pickle.load(embeddings_file)
old_datas.append(bc5cdr_val_embeddings)

with open("../deduplicated_embeddings/bio_med_research/bc5cdr_test_embeddings.pkl", "rb") as embeddings_file:
    bc5cdr_test_embeddings = pickle.load(embeddings_file)
old_datas.append(bc5cdr_test_embeddings)

In [None]:
# deduplicate between existing dataset
# col.csv spells the dataset "BioNLI" (the lower-case lookup matched nothing
# and produced an empty column list) and stores the columns in one
# comma-separated cell, which is split into real column names here.
bionli_columns = [
    name.strip()
    for cell in col_info.loc[col_info["dataset_name"] == "BioNLI", "column_name"]
    for name in cell.split(",")
    if name.strip()
]
full_deduplicate_bionli_train, removed_idx_train = deduplicate_between_datasets(deduplicated_bionli_train, bionli_columns, old_datas)
# The second return value is the number of removed rows, not their indices.
print(removed_idx_train)

In [None]:
# save the deduplicated dataset
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bionli", exist_ok=True)
full_deduplicate_bionli_train.to_csv("../deduplicated_data/bio_med_research/bionli/train_bionli_deduplicated.csv", index=False)


In [None]:
# Add the BioNLI train embeddings to the pool used for later comparisons.
# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open("../deduplicated_embeddings/bio_med_research/bionli_train_embeddings.pkl", "rb") as embeddings_file:
    bionli_train_embeddings = pickle.load(embeddings_file)
old_datas.append(bionli_train_embeddings)


In [None]:
# The within-dataset step stored this split as `deduplicated_bionli_dev`;
# `deduplicated_bionli_val` was never defined. The dataset is also spelled
# "BioNLI" in col.csv and its columns sit in one comma-separated cell.
bionli_columns = [
    name.strip()
    for cell in col_info.loc[col_info["dataset_name"] == "BioNLI", "column_name"]
    for name in cell.split(",")
    if name.strip()
]
full_deduplicate_bionli_val, removed_idx_val = deduplicate_between_datasets(deduplicated_bionli_dev, bionli_columns, old_datas)
# The second return value is the number of removed rows, not their indices.
print(removed_idx_val)

In [None]:
# save the deduplicated dataset
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bionli", exist_ok=True)
full_deduplicate_bionli_val.to_csv("../deduplicated_data/bio_med_research/bionli/val_bionli_deduplicated.csv", index=False)

In [None]:
# Add the BioNLI val embeddings to the pool used for later comparisons.
# NOTE: pickle.load executes code embedded in the file; only load trusted files.
with open("../deduplicated_embeddings/bio_med_research/bionli_val_embeddings.pkl", "rb") as embeddings_file:
    bionli_val_embeddings = pickle.load(embeddings_file)
old_datas.append(bionli_val_embeddings)


In [None]:
# col.csv spells the dataset "BioNLI" (the lower-case lookup matched nothing
# and produced an empty column list) and stores the columns in one
# comma-separated cell, which is split into real column names here.
bionli_columns = [
    name.strip()
    for cell in col_info.loc[col_info["dataset_name"] == "BioNLI", "column_name"]
    for name in cell.split(",")
    if name.strip()
]
full_deduplicate_bionli_test, removed_idx_test = deduplicate_between_datasets(deduplicated_bionli_test, bionli_columns, old_datas)
# The second return value is the number of removed rows, not their indices.
print(removed_idx_test)

In [None]:
# save the deduplicated dataset
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../deduplicated_data/bio_med_research/bionli", exist_ok=True)
full_deduplicate_bionli_test.to_csv("../deduplicated_data/bio_med_research/bionli/test_bionli_deduplicated.csv", index=False)

## How to download the bc5cdr dataset from Hugging Face (run this section first: it writes the .txt files loaded above)

In [8]:
from datasets import load_dataset

In [9]:
# Call the Hugging Face loader through its module: the bare name
# `load_dataset` is also defined in this notebook as a local file loader, so
# which one is bound depends on the order the cells were executed in.
import datasets

dataset6 = datasets.load_dataset("bigbio/bc5cdr")

In [10]:
# Show the available splits, their features and row counts.
dataset6

DatasetDict({
    train: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    test: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['passages'],
        num_rows: 500
    })
})

In [11]:
# NOTE: despite its name, `ds_test` holds the *train* split; the cells below
# use it to write train_bc5cdr.txt.
ds_test = dataset6['train']

In [16]:
# Preview the first record's 'passages' value as a string.
str(ds_test['passages'][0])


"[{'document_id': '227508', 'type': 'title', 'text': 'Naloxone reverses the antihypertensive effect of clonidine.', 'entities': [{'id': '0', 'offsets': [[0, 8]], 'text': ['Naloxone'], 'type': 'Chemical', 'normalized': [{'db_name': 'MESH', 'db_id': 'D009270'}]}, {'id': '1', 'offsets': [[49, 58]], 'text': ['clonidine'], 'type': 'Chemical', 'normalized': [{'db_name': 'MESH', 'db_id': 'D003000'}]}], 'relations': [{'id': 'R0', 'type': 'CID', 'arg1_id': 'D008750', 'arg2_id': 'D007022'}]}, {'document_id': '227508', 'type': 'abstract', 'text': 'In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M,

In [6]:
# Write the split held in `ds_test` to example.csv in the current working directory.
ds_test.to_csv("example.csv", index=False)

In [18]:
# write the file
# One 'passages' value per line. The file is opened once in "w" mode, so
# re-running the cell rewrites it instead of appending a second copy of every
# record, and the 'passages' column is read once rather than per iteration.
os.makedirs("../dataset/bio_med_research/bc5cdr", exist_ok=True)
with open("../dataset/bio_med_research/bc5cdr/train_bc5cdr.txt", "w") as f:
    for passages in ds_test['passages']:
        f.write(str(passages) + "\n")

In [19]:
# Write the test and validation splits, one 'passages' value per line.
# Each file is opened once in "w" mode, so re-running the cell rewrites it
# instead of appending a second copy of every record, and the 'passages'
# column is read once per split rather than per iteration.
os.makedirs("../dataset/bio_med_research/bc5cdr", exist_ok=True)
for split_name, output_path in (
    ("test", "../dataset/bio_med_research/bc5cdr/test_bc5cdr.txt"),
    ("validation", "../dataset/bio_med_research/bc5cdr/val_bc5cdr.txt"),
):
    with open(output_path, "w") as f:
        for passages in dataset6[split_name]['passages']:
            f.write(str(passages) + "\n")