In [1]:
# One run of test to deduplicate the bio_med_research dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import pickle

In [2]:
# if use colab, run this part
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/bionlp')

Mounted at /content/drive


In [3]:
# go to model dir
os.chdir('MedImageInsights')

In [4]:
# set directory to deduplicate
directory = "../dataset/QAs"

In [5]:
# install necessary package
!pip install mup
!pip install fvcore

Collecting mup
  Downloading mup-1.0.0.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mup
  Building wheel for mup (setup.py) ... [?25l[?25hdone
  Created wheel for mup: filename=mup-1.0.0-py3-none-any.whl size=23629 sha256=3a30d2b1d1e0f3019ee4432a098cca4b96f552e8b0708ce21a331c47a5835466
  Stored in directory: /root/.cache/pip/wheels/f4/c8/88/3c23a3d10c50053b6552d2d30aee5b53ba89a47f742420036c
Successfully built mup
Installing collected packages: mup
Successfully installed mup-1.0.0
Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━

In [6]:
# load model
from medimageinsightmodel import MedImageInsight

classifier = MedImageInsight(
    model_dir="2024.09.27",
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)

classifier.load_model()



Model loaded successfully on device: cuda


In [7]:
def parse_qa_xml(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    # Iterate through each question
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)

        # Extract question details
        original_question = question.find("Original-Question")
        subject = original_question.find("SUBJECT").text if original_question.find("SUBJECT") is not None else None
        message = original_question.find("MESSAGE").text if original_question.find("MESSAGE") is not None else None
        paraphrase = question.find("NIST-PARAPHRASE").text if question.find("NIST-PARAPHRASE") is not None else None
        summary = question.find("NLM-Summary").text if question.find("NLM-Summary") is not None else None

        # Extract annotations
        annotations = question.find("ANNOTATIONS")
        focuses = []
        types = []
        keywords = []

        if annotations is not None:
            for focus in annotations.findall("FOCUS"):
                focuses.append({
                    "fid": focus.attrib.get("fid"),
                    "fcategory": focus.attrib.get("fcategory"),
                    "text": focus.text,
                })

            for type_elem in annotations.findall("TYPE"):
                types.append({
                    "tid": type_elem.attrib.get("tid"),
                    "hasFocus": type_elem.attrib.get("hasFocus"),
                    "hasKeyword": type_elem.attrib.get("hasKeyword"),
                    "text": type_elem.text,
                })

            for keyword in annotations.findall("KEYWORD"):
                keywords.append({
                    "kid": keyword.attrib.get("kid"),
                    "kcategory": keyword.attrib.get("kcategory"),
                    "text": keyword.text,
                })

        # Extract reference answers
        reference_answers = []
        ref_answers_elem = question.find("ReferenceAnswers")
        if ref_answers_elem is not None:
            for ref_answer in ref_answers_elem.findall("RefAnswer"):
                reference_answers.append({
                    "aid": ref_answer.attrib.get("aid"),
                    "text": ref_answer.find("ANSWER").text if ref_answer.find("ANSWER") is not None else None,
                    "url": ref_answer.find("AnswerURL").text if ref_answer.find("AnswerURL") is not None else None,
                    "comment": ref_answer.find("COMMENT").text if ref_answer.find("COMMENT") is not None else None,
                })

        # Append structured data
        data.append({
            "qid": qid,
            "subject": subject,
            "message": message,
            "paraphrase": paraphrase,
            "summary": summary,
            "focuses": focuses,
            "types": types,
            "keywords": keywords,
            "reference_answers": reference_answers,
        })

    return pd.DataFrame(data)

In [8]:
# loading dataset
def parse_xml(file):
    tree = ET.parse(file)
    root = tree.getroot()

    sentence_data = []
    for sentence in root.findall('sentence'):
        sentence_id = sentence.get('id')
        sentence_text = sentence.get('text')

        sentence_data.append({
            "sentence_id": sentence_id,
            "sentence_text": sentence_text
        })

    return pd.DataFrame(sentence_data)


def load_dataset(path, filetype = "csv"):
    if filetype == "csv":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading CSV files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".csv"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            df = pd.read_csv(f)
            ds[f] = df
        return ds
    elif filetype == "xml":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading XML files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".xml"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            ds[f] = parse_xml(f)
        return ds
    elif filetype == "jsonl":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSONL files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".jsonl"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            print("current file: ", f)
            with open(f, "r") as file:
                data = [json.loads(line) for line in file]
            ds[f] = pd.DataFrame(data)
        return ds
    elif filetype == "json":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSON files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".json"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            with open(f, "r") as file:
                data = json.load(file)
            ds[f] = pd.DataFrame(data)
        return ds



In [9]:
# functions for deduplication
def get_embeddings(texts, batch_size = 64):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc = "Generating embeddings"):
        batch_texts = texts[i:i+batch_size]
        embeddings.extend(classifier.encode(texts = batch_texts)['text_embeddings'])
    return np.array(embeddings)

def compute_similarity(embeddings, threshold = 0.9):
    # n = len(embeddings)
    # to_remove = set()
    # for i in tqdm(range(n), desc = "Computing similarity"):
    #     for j in range(i+1, n):
    #         sim = cosine_similarity(embeddings[i].reshape(1, -1), embeddings[j].reshape(1, -1))[0][0]
    #         if sim > threshold:
    #             to_remove.add(j)
    # return to_remove
    similarity_matrix = cosine_similarity(embeddings)
    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-similarity

    # Find indices of pairs with similarity above the threshold
    to_remove = set()
    for i in range(similarity_matrix.shape[0]):
        if i in to_remove:
            continue
        similar_indices = np.where(similarity_matrix[i] > threshold)[0]
        to_remove.update(similar_indices)

    return to_remove

def compute_similarity_chunked(embeddings, threshold=0.9, chunk_size=8000):
    """
    Compute cosine similarity in chunks to reduce memory usage.
    """
    n = len(embeddings)
    to_remove = set()
    for i in tqdm(range(0, n, chunk_size), desc= "Calcuating Similarity"):
        # Get the current chunk
        chunk_embeddings = embeddings[i:i + chunk_size]

        # Compute cosine similarity for the current chunk against all embeddings
        similarity_matrix = cosine_similarity(chunk_embeddings, embeddings)

        # Iterate through the chunk rows to find high-similarity indices
        for row_idx, similarities in enumerate(similarity_matrix):
            actual_idx = i + row_idx  # Map back to the original index
            if actual_idx in to_remove:
                continue

            similar_indices = np.where(similarities > threshold)[0]
            similar_indices = [idx for idx in similar_indices if idx > actual_idx]  # Avoid duplicates
            to_remove.update(similar_indices)

    return to_remove

def compute_similarity_between_datasets(embeddings1, embeddings2, threshold = 0.9):
    to_remove = set()
    for i in tqdm(range(len(embeddings1)), desc = "Computing similarity"):
        for j in range(len(embeddings2)):
            sim = cosine_similarity(embeddings1[i].reshape(1, -1), embeddings2[j].reshape(1, -1))[0][0]
            if sim > threshold:
                to_remove.add(j)
    return to_remove

def compute_similarity_between_datasets_chunked(embeddings1, embeddings2, threshold=0.9, chunk_size1=8000, chunk_size2=8000):
    """
    Compute cosine similarity between two datasets in chunks to reduce memory usage.
    Removes entries from embeddings1 based on high similarity with embeddings2.
    """
    to_remove = set()
    n1, n2 = len(embeddings1), len(embeddings2)

    for i in tqdm(range(0, n1, chunk_size1), desc="Processing dataset1 in chunks"):
        # Get a chunk from embeddings1
        chunk_embeddings1 = embeddings1[i:i + chunk_size1]

        for j in range(0, n2, chunk_size2):
            # Get a chunk from embeddings2
            chunk_embeddings2 = embeddings2[j:j + chunk_size2]

            # Compute cosine similarity for the two chunks
            similarity_matrix = cosine_similarity(chunk_embeddings1, chunk_embeddings2)

            # Check rows in chunk_embeddings1 with high similarity to chunk_embeddings2
            for row_idx, similarities in enumerate(similarity_matrix):
                actual_idx = i + row_idx  # Map back to the original index in embeddings1
                if actual_idx in to_remove:
                    continue
                if np.any(similarities > threshold):
                    to_remove.add(actual_idx)

    return to_remove

def deduplication_within_dataset_qa(dataset, threshold = 0.9):
    questions = dataset["question"].tolist()
    #answers = dataset["answer"].tolist()

    question_embeddings = get_embeddings(questions)
    to_remove_questions = compute_similarity_chunked(question_embeddings, threshold)

    new_dataset = dataset.drop(index = list(to_remove_questions)).reset_index(drop=True)

    answers = new_dataset["answer"].tolist()
    answer_embeddings = get_embeddings(answers)
    to_remove_answers = compute_similarity_chunked(answer_embeddings, threshold)

    new_dataset = new_dataset.drop(index = list(to_remove_answers)).reset_index(drop=True)
    return new_dataset, list(to_remove_questions), list(to_remove_answers)


def deduplicate_across_datasets_qa(new_dataset, old_question_embeddings_saved, old_answer_embeddings_saved, threshold = 0.9):
    # Combine all old dataset questions and answers
    # all_old_questions = []
    # all_old_answers = []

    # for dataset in old_datasets:
    #     all_old_questions.extend(dataset["question"].tolist())
    #     all_old_answers.extend(dataset["answer"].tolist())

    # Generate embeddings for old dataset questions and answers
    # old_question_embeddings = get_embeddings(all_old_questions)
    # old_answer_embeddings = get_embeddings(all_old_answers)
    old_question_embeddings = []
    old_answer_embeddings = []
    for old_embed in old_question_embeddings_saved:
        old_question_embeddings.extend(old_embed)
    for old_embed in old_answer_embeddings_saved:
        old_answer_embeddings.extend(old_embed)

    # Generate embeddings for new dataset questions and answers
    new_question_embeddings = get_embeddings(new_dataset["question"].tolist())
    new_answer_embeddings = get_embeddings(new_dataset["answer"].tolist())

    # Deduplicate new questions
    to_remove_questions = compute_similarity_between_datasets_chunked(new_question_embeddings, old_question_embeddings)

    # Deduplicate new answers
    to_remove_answers = compute_similarity_between_datasets_chunked(new_answer_embeddings, old_answer_embeddings)

    # Combine removal indices
    to_remove = to_remove_questions.union(to_remove_answers)

    # Drop duplicates from new dataset
    deduplicated_new_dataset = new_dataset.drop(index=list(to_remove)).reset_index(drop=True)

    return deduplicated_new_dataset, list(to_remove_questions), list(to_remove_answers)



In [None]:
#deduplicated data loading
deduplicated_medicationqa = pd.read_csv("../deduplicated_data/QAs/MedicationQA/medicationqa_train_fulltext_deduplicated.csv")
deduplicated_pubmed1 = pd.read_csv("../deduplicated_data/QAs/PubMedQA/ori_pqaa_deduplicated.csv")
deduplicated_pubmed2 = pd.read_csv("../deduplicated_data/QAs/PubMedQA/ori_pqau_deduplicated.csv")
deduplicated_pubmed3 = pd.read_csv("../deduplicated_data/QAs/PubMedQA/ori_pqal_deduplicated.csv")
deduplicated_medmcqa_train = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_train_fulltext_deduplicated.csv")
deduplicated_medmcqa_dev = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_dev_fulltext_deduplicated.csv")
deduplicated_medmcqa_test = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_test_fulltext_deduplicated.csv")
deduplicated_medqa_train = pd.read_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_train_deduplicated.csv")
deduplicated_medqa_dev = pd.read_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_dev_deduplicated.csv")
deduplicated_medqa_test = pd.read_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_test_deduplicated.csv")

## Deduplicate LiveQA

In [None]:
def parse_nlm_questions(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for the parsed data
    data = []

    # Iterate through each NLM-QUESTION
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)
        subject = question.find("SUBJECT").text if question.find("SUBJECT") is not None else None
        message = question.find("MESSAGE").text if question.find("MESSAGE") is not None else None

        # Extract sub-questions
        sub_questions = question.find("SUB-QUESTIONS")
        if sub_questions is not None:
            for sub_question in sub_questions.findall("SUB-QUESTION"):
                # Extract annotations
                annotations = sub_question.find("ANNOTATIONS")
                focus = annotations.find("FOCUS").text if annotations is not None and annotations.find("FOCUS") is not None else None
                qtype = annotations.find("TYPE").text if annotations is not None and annotations.find("TYPE") is not None else None

                # Extract answers
                answers_elem = sub_question.find("ANSWERS")
                answers = []
                if answers_elem is not None:
                    for answer in answers_elem.findall("ANSWER"):
                        answers.append(answer.text.strip())

                # Store the parsed data
                data.append({
                    "qid": qid,
                    "subject": subject,
                    "question": message,
                    "focus": focus,
                    "type": qtype,
                    "answer": answers
                })

    # Convert data to a pandas DataFrame
    return pd.DataFrame(data)

def parse_nlm_questions_test(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for the parsed data
    data = []

    # Iterate through each NLM-QUESTION
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)

        # Extract subject and message
        subject_elem = question.find("./Original-Question/SUBJECT")
        subject = subject_elem.text.strip() if subject_elem.text is not None else None

        message_elem = question.find("./Original-Question/MESSAGE")
        message = message_elem.text.strip() if message_elem.text is not None else None

        # Extract answers
        answers = []
        reference_answers = question.find("ReferenceAnswers")
        if reference_answers is not None:
            for ref_answer in reference_answers.findall("RefAnswer"):
                answer_elem = ref_answer.find("ANSWER")
                if answer_elem is not None:
                    # Join all parts of the answer into a single string, stripping whitespace
                    answer_text = "".join(answer_elem.itertext()).strip()
                    answers.append(answer_text)
            if reference_answers.find("RefAnswer") is None:
                for ref_answer in reference_answers.findall("ReferenceAnswer"):
                    answer_elem = ref_answer.find("ANSWER")
                    if answer_elem is not None:
                        # Join all parts of the answer into a single string, stripping whitespace
                        answer_text = "".join(answer_elem.itertext()).strip()
                        answers.append(answer_text)

        # Append to the dataset
        data.append({
            "qid": qid,
            "subject": subject,
            "question": message,
            "answer": answers  # Store all answers as a list
        })

    # Convert data to a pandas DataFrame
    return pd.DataFrame(data)


In [None]:
trec_qa_train_1 = parse_nlm_questions(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Train-1.xml")
trec_qa_train_2 = parse_nlm_questions(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Train-2.xml")
trec_qa_test = parse_nlm_questions_test(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Test.xml")

# Remove NaN values from the "question" and "answer" columns
def clean_dataframe(df):
    # Ensure "question" and "answer" columns exist and are non-empty
    df["question"] = df["question"].fillna("").astype(str)
    df["answer"] = df["answer"].fillna("").astype(str)

    # Remove rows where "question" or "answer" is an empty string
    df = df[(df["question"].str.strip() != "") & (df["answer"].str.strip() != "")]
    return df.reset_index(drop=True)

trec_qa_train_1 = clean_dataframe(trec_qa_train_1)
trec_qa_train_2 = clean_dataframe(trec_qa_train_2)
trec_qa_test = clean_dataframe(trec_qa_test)

In [None]:
# length of the dataset
print("Length of the train1 dataset: " + str(len(trec_qa_train_1)))
print("Length of the train2 dataset: " + str(len(trec_qa_train_2)))
print("Length of the test dataset: " + str(len(trec_qa_test)))

Length of the train1 dataset: 254
Length of the train2 dataset: 244
Length of the test dataset: 104


In [None]:
# self deduplicate first
trec_qa_train_1_self_dedup, removed_questions_self_train_1, removed_answers_self_train_1 = deduplication_within_dataset_qa(trec_qa_train_1)
print(len(removed_questions_self_train_1), len(removed_answers_self_train_1))
trec_qa_train_2_self_dedup, removed_questions_self_train_2, removed_answers_self_train_2 = deduplication_within_dataset_qa(trec_qa_train_2)
print(len(removed_questions_self_train_2), len(removed_answers_self_train_2))
trec_qa_test_self_dedup, removed_questions_self_test, removed_answers_self_test = deduplication_within_dataset_qa(trec_qa_test)
print(len(removed_questions_self_test), len(removed_answers_self_test))

Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  2.68it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 155.21it/s]
Generating embeddings: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 195.85it/s]


55 10


Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  2.70it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 178.92it/s]
Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  2.21it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 155.92it/s]


0 4


Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  3.22it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 315.72it/s]
Generating embeddings: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 312.87it/s]

0 0





In [None]:
old_questions = []
old_answers = []

with open("../deduplicated_embeddings/QAs/medicationqa_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medicationqa_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

#pubmed1,2,3
with open("../deduplicated_embeddings/QAs/pubmed1_question_embeddings.pkl", "rb") as f:
    pubmed1_q_embed = pickle.load(f)
    old_questions.append(pubmed1_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed1_answer_embeddings.pkl", "rb") as f:
    pubmed1_a_embed = pickle.load(f)
    old_answers.append(pubmed1_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_question_embeddings.pkl", "rb") as f:
    pubmed2_q_embed = pickle.load(f)
    old_questions.append(pubmed2_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_answer_embeddings.pkl", "rb") as f:
    pubmed2_a_embed = pickle.load(f)
    old_answers.append(pubmed2_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_question_embeddings.pkl", "rb") as f:
    pubmed3_q_embed = pickle.load(f)
    old_questions.append(pubmed3_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_answer_embeddings.pkl", "rb") as f:
    pubmed3_a_embed = pickle.load(f)
    old_answers.append(pubmed3_a_embed)

# medmcqa
with open("../deduplicated_embeddings/QAs/medmcqa_train_question_embeddings.pkl", "rb") as f:
    medmcqa_train_q_embed = pickle.load(f)
    old_questions.append(medmcqa_train_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_train_answer_embeddings.pkl", "rb") as f:
    medmcqa_train_a_embed = pickle.load(f)
    old_answers.append(medmcqa_train_a_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_dev_question_embeddings.pkl", "rb") as f:
    medmcqa_dev_q_embed = pickle.load(f)
    old_questions.append(medmcqa_dev_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_dev_answer_embeddings.pkl", "rb") as f:
    medmcqa_dev_a_embed = pickle.load(f)
    old_answers.append(medmcqa_dev_a_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_test_question_embeddings.pkl", "rb") as f:
    medmcqa_test_q_embed = pickle.load(f)
    old_questions.append(medmcqa_test_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_test_answer_embeddings.pkl", "rb") as f:
    medmcqa_test_a_embed = pickle.load(f)
    old_answers.append(medmcqa_test_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_train_question_embeddings.pkl", "rb") as f:
    medqa_train_a_embed = pickle.load(f)
    old_questions.append(medqa_train_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_train_answer_embeddings.pkl", "rb") as f:
    medqa_train_a_embed = pickle.load(f)
    old_answers.append(medqa_train_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_dev_question_embeddings.pkl", "rb") as f:
    medqa_dev_a_embed = pickle.load(f)
    old_questions.append(medqa_dev_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_dev_answer_embeddings.pkl", "rb") as f:
    medqa_dev_a_embed = pickle.load(f)
    old_answers.append(medqa_dev_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_test_question_embeddings.pkl", "rb") as f:
    medqa_test_a_embed = pickle.load(f)
    old_questions.append(medqa_test_a_embed)

with open("../deduplicated_embeddings/QAs/medqa_test_answer_embeddings.pkl", "rb") as f:
    medqa_test_a_embed = pickle.load(f)
    old_answers.append(medqa_test_a_embed)

In [None]:
# deduplicate between existing dataset
trec_qa_train1_full, removed_questions_train, removed_answers_train = deduplicate_across_datasets_qa(trec_qa_train_1_self_dedup, old_questions, old_answers)
print(len(removed_questions_train), len(removed_answers_train))

Generating embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.69it/s]
Generating embeddings: 100%|██████████| 3/3 [00:01<00:00,  1.58it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]

1 0





In [None]:
trec_qa_train1_full.to_csv("../deduplicated_data/QAs/LiveQA/trec_qa_train1_fulltext_deduplicated.csv", index = False)

In [None]:
with open("../deduplicated_embeddings/QAs/trec_train1_question_embeddings.pkl", "rb") as f:
    trec_train1_q_embed = pickle.load(f)
    old_questions.append(trec_train1_q_embed)

with open("../deduplicated_embeddings/QAs/trec_train1_answer_embeddings.pkl", "rb") as f:
    trec_train1_a_embed = pickle.load(f)
    old_answers.append(trec_train1_a_embed)

In [None]:
trec_qa_train2_full, removed_questions_train, removed_answers_train = deduplicate_across_datasets_qa(trec_qa_train_2_self_dedup, old_questions, old_answers)
print(len(removed_questions_train), len(removed_answers_train))

Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  2.92it/s]
Generating embeddings: 100%|██████████| 4/4 [00:01<00:00,  2.36it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  3.00s/it]

27 3





In [None]:
trec_qa_train2_full.to_csv("../deduplicated_data/QAs/LiveQA/trec_qa_train2_fulltext_deduplicated.csv", index = False)

In [None]:
with open("../deduplicated_embeddings/QAs/trec_train2_question_embeddings.pkl", "rb") as f:
    trec_train2_q_embed = pickle.load(f)
    old_questions.append(trec_train2_q_embed)

with open("../deduplicated_embeddings/QAs/trec_train2_answer_embeddings.pkl", "rb") as f:
    trec_train2_a_embed = pickle.load(f)
    old_answers.append(trec_train2_a_embed)

In [None]:
trec_qa_test_full, removed_questions_test, removed_answers_test = deduplicate_across_datasets_qa(trec_qa_test_self_dedup, old_questions, old_answers)
print(len(removed_questions_test), len(removed_answers_test))

Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  3.41it/s]
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  2.43s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it]

15 1





In [None]:
trec_qa_test_full.to_csv("../deduplicated_data/QAs/LiveQA/trec_qa_test_fulltext_deduplicated.csv", index = False)

## Deduploicate MedQA

In [None]:
medqa = load_dataset(path = directory + "/MedQA-USMLE/questions/US", filetype = "jsonl")
print("Available Keys: " + str(medqa.keys()))
#

Loading JSONL files: 0it [00:00, ?it/s]
Processing file: 100%|██████████| 4/4 [00:00<00:00, 39016.78it/s]

Processing file: 100%|██████████| 3/3 [00:00<00:00, 32768.00it/s]

Processing file: 0it [00:00, ?it/s]

Processing file: 100%|██████████| 1/1 [00:00<00:00, 10591.68it/s]

Processing file: 100%|██████████| 1/1 [00:00<00:00, 11915.64it/s]

Processing file: 100%|██████████| 1/1 [00:00<00:00, 9510.89it/s]
Loading JSONL files: 6it [00:00, 136.03it/s]


current file:  ../dataset/QAs/MedQA-USMLE/questions/US/US_qbank.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/dev.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/test.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/train.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_test.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_train.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_dev.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/train/phrases_train.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/test/phrases_test.jsonl
current file:  ../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/dev/phrases_dev.jsonl
Available Keys: dict_keys(['../dataset/QAs/MedQA-USMLE/questions/US/US_qbank.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/dev.js

In [None]:
# first deduplicate samller files
medqa_train = medqa["../dataset/QAs/MedQA-USMLE/questions/US/train.jsonl"]
medqa_dev = medqa["../dataset/QAs/MedQA-USMLE/questions/US/dev.jsonl"]
medqa_test = medqa["../dataset/QAs/MedQA-USMLE/questions/US/test.jsonl"]

In [None]:
def process_medqa(df,):
    df['old_answer'] = None
    for i, row in enumerate(df.itertuples()):
        df.at[i, 'old_answer'] = row.answer
        new_answer = f"The options you have are {row.options}. The correct answer is {row.answer}."
        df.at[i, 'answer'] = new_answer
    return df


In [None]:
medqa_train = process_medqa(medqa_train)
medqa_dev = process_medqa(medqa_dev)
medqa_test = process_medqa(medqa_test)

In [None]:
# self deduplicate first
medqa_train_self_dedup, removed_questions_self_train, removed_answers_self_train = deduplication_within_dataset_qa(medqa_train)
print(len(removed_questions_self_train), len(removed_answers_self_train))
medqa_dev_self_dedup, removed_questions_self_dev, removed_answers_self_dev = deduplication_within_dataset_qa(medqa_dev)
print(len(removed_questions_self_dev), len(removed_answers_self_dev))
medqa_test_self_dedup, removed_questions_self_test, removed_answers_self_test = deduplication_within_dataset_qa(medqa_test)
print(len(removed_questions_self_test), len(removed_answers_self_test))

Generating embeddings: 100%|██████████| 160/160 [01:10<00:00,  2.27it/s]
Calcuating Similarity: 100%|██████████| 2/2 [00:00<00:00,  3.25it/s]
Generating embeddings: 100%|██████████| 158/158 [00:57<00:00,  2.76it/s]
Calcuating Similarity: 100%|██████████| 2/2 [00:00<00:00,  3.41it/s]


69 658


Generating embeddings: 100%|██████████| 20/20 [00:08<00:00,  2.34it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 34.62it/s]
Generating embeddings: 100%|██████████| 20/20 [00:07<00:00,  2.77it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 36.67it/s]


1 25


Generating embeddings: 100%|██████████| 20/20 [00:08<00:00,  2.31it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 36.97it/s]
Generating embeddings: 100%|██████████| 20/20 [00:07<00:00,  2.77it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00, 33.49it/s]

2 17





In [None]:
#length after deduplication
print("Numer of data after deduplicaton for train set: " + str(len(medqa_train_self_dedup)))
print("Numer of data after deduplicaton for dev set: " + str(len(medqa_dev_self_dedup)))
print("Numer of data after deduplicaton for test set: " + str(len(medqa_test_self_dedup)))

Numer of data after deduplicaton for train set: 9451
Numer of data after deduplicaton for dev set: 1246
Numer of data after deduplicaton for test set: 1254


In [None]:
old_questions = []
old_answers = []

with open("../deduplicated_embeddings/QAs/medicationqa_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medicationqa_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

#pubmed1,2,3
with open("../deduplicated_embeddings/QAs/pubmed1_question_embeddings.pkl", "rb") as f:
    pubmed1_q_embed = pickle.load(f)
    old_questions.append(pubmed1_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed1_answer_embeddings.pkl", "rb") as f:
    pubmed1_a_embed = pickle.load(f)
    old_answers.append(pubmed1_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_question_embeddings.pkl", "rb") as f:
    pubmed2_q_embed = pickle.load(f)
    old_questions.append(pubmed2_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_answer_embeddings.pkl", "rb") as f:
    pubmed2_a_embed = pickle.load(f)
    old_answers.append(pubmed2_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_question_embeddings.pkl", "rb") as f:
    pubmed3_q_embed = pickle.load(f)
    old_questions.append(pubmed3_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_answer_embeddings.pkl", "rb") as f:
    pubmed3_a_embed = pickle.load(f)
    old_answers.append(pubmed3_a_embed)

# medmcqa
with open("../deduplicated_embeddings/QAs/medmcqa_train_question_embeddings.pkl", "rb") as f:
    medmcqa_train_q_embed = pickle.load(f)
    old_questions.append(medmcqa_train_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_train_answer_embeddings.pkl", "rb") as f:
    medmcqa_train_a_embed = pickle.load(f)
    old_answers.append(medmcqa_train_a_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_dev_question_embeddings.pkl", "rb") as f:
    medmcqa_dev_q_embed = pickle.load(f)
    old_questions.append(medmcqa_dev_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_dev_answer_embeddings.pkl", "rb") as f:
    medmcqa_dev_a_embed = pickle.load(f)
    old_answers.append(medmcqa_dev_a_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_test_question_embeddings.pkl", "rb") as f:
    medmcqa_test_q_embed = pickle.load(f)
    old_questions.append(medmcqa_test_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_test_answer_embeddings.pkl", "rb") as f:
    medmcqa_test_a_embed = pickle.load(f)
    old_answers.append(medmcqa_test_a_embed)

In [None]:
# deduplicate between existing dataset
full_medqa_train, removed_questions_train, removed_answers_train = deduplicate_across_datasets_qa(medqa_train_self_dedup, old_questions, old_answers)
print(len(removed_questions_train), len(removed_answers_train))

Generating embeddings: 100%|██████████| 148/148 [01:03<00:00,  2.32it/s]
Generating embeddings: 100%|██████████| 148/148 [00:53<00:00,  2.77it/s]
Processing dataset1 in chunks: 100%|██████████| 2/2 [00:29<00:00, 14.52s/it]
Processing dataset1 in chunks: 100%|██████████| 2/2 [00:28<00:00, 14.42s/it]

3 0





In [None]:
full_medqa_train.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_train_deduplicated.csv", index = False)

In [None]:
with open("../deduplicated_embeddings/QAs/medqa_train_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medqa_train_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

In [None]:
full_medqa_dev, removed_questions_dev, removed_answers_dev = deduplicate_across_datasets_qa(medqa_dev_self_dedup, old_questions, old_answers)
print(len(removed_questions_dev), len(removed_answers_dev))

Generating embeddings: 100%|██████████| 20/20 [00:08<00:00,  2.36it/s]
Generating embeddings: 100%|██████████| 20/20 [00:07<00:00,  2.83it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:07<00:00,  7.36s/it]

12 110





In [None]:
full_medqa_dev.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_dev_deduplicated.csv", index = False)

In [None]:
with open("../deduplicated_embeddings/QAs/medqa_dev_question_embeddings.pkl", "rb") as f:
    medqa_dev_q = pickle.load(f)
    old_questions.append(medqa_dev_q)

with open("../deduplicated_embeddings/QAs/medqa_dev_answer_embeddings.pkl", "rb") as f:
    medqa_dev_a = pickle.load(f)
    old_answers.append(medqa_dev_a)

In [None]:
full_medqa_test, removed_questions_test, removed_answers_test = deduplicate_across_datasets_qa(medqa_test_self_dedup, old_questions, old_answers)
print(len(removed_questions_dev), len(removed_answers_dev))

Generating embeddings: 100%|██████████| 20/20 [00:08<00:00,  2.34it/s]
Generating embeddings: 100%|██████████| 20/20 [00:07<00:00,  2.83it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]

12 110





In [None]:
full_medqa_test.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_test_deduplicated.csv", index = False)

In [None]:
# save all the indices
with open("medqa_train_removed_questions_self.txt", "w") as f:
    for item in removed_questions_self_train:
        f.write(f"{item}\n")

with open("medqa_train_removed_answers_self.txt", "w") as f:
    for item in removed_answers_self_train:
        f.write(f"{item}\n")

with open("medqa_dev_removed_questions_self.txt", "w") as f:
    for item in removed_questions_self_dev:
        f.write(f"{item}\n")

with open("medqa_dev_removed_answers_self.txt", "w") as f:
    for item in removed_answers_self_dev:
        f.write(f"{item}\n")

with open("medqa_test_removed_questions_self.txt", "w") as f:
    for item in removed_questions_self_test:
        f.write(f"{item}\n")

with open("medqa_test_removed_answers_self.txt", "w") as f:
    for item in removed_answers_self_test:
        f.write(f"{item}\n")

In [None]:
with open("medqa_train_removed_questions_full.txt", "w") as f:
    for item in removed_questions_train:
        f.write(f"{item}\n")

with open("medqa_train_removed_answers_full.txt", "w") as f:
    for item in removed_answers_train:
        f.write(f"{item}\n")

with open("medqa_dev_removed_questions_full.txt", "w") as f:
    for item in removed_questions_dev:
        f.write(f"{item}\n")

with open("medqa_dev_removed_answers_full.txt", "w") as f:
    for item in removed_answers_dev:
        f.write(f"{item}\n")

with open("medqa_test_removed_questions_full.txt", "w") as f:
    for item in removed_questions_test:
        f.write(f"{item}\n")

with open("medqa_test_removed_answers_full.txt", "w") as f:
    for item in removed_answers_test:
        f.write(f"{item}\n")

In [None]:
medqa.keys()

dict_keys(['../dataset/QAs/MedQA-USMLE/questions/US/US_qbank.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/dev.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/test.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/train.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_test.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_train.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_dev.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/train/phrases_train.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/test/phrases_test.jsonl', '../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/dev/phrases_dev.jsonl'])

In [None]:
medqa_phrases_train = medqa["../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/train/phrases_train.jsonl"]
medqa_phrases_dev = medqa["../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/dev/phrases_dev.jsonl"]
medqa_phrases_test = medqa["../dataset/QAs/MedQA-USMLE/questions/US/metamap_extracted_phrases/test/phrases_test.jsonl"]

In [None]:
# drop the indices
medqa_phrases_train = medqa_phrases_train.drop(index = removed_questions_self_train).reset_index(drop=True)
medqa_phrases_train = medqa_phrases_train.drop(index = removed_answers_self_train).reset_index(drop=True)
remove_full_index_phrase_train = list(set(removed_questions_train).union(set(removed_answers_train)))
medqa_phrases_train = medqa_phrases_train.drop(index = remove_full_index_phrase_train).reset_index(drop=True)
medqa_phrases_train.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_phrases_train_deduplicated.csv", index = False)

In [None]:
medqa_phrases_dev = medqa_phrases_dev.drop(index = removed_questions_self_dev).reset_index(drop=True)
medqa_phrases_dev = medqa_phrases_dev.drop(index = removed_answers_self_dev).reset_index(drop=True)
remove_full_index_phrase_dev = list(set(removed_questions_dev).union(set(removed_answers_dev)))
medqa_phrases_dev = medqa_phrases_dev.drop(index = remove_full_index_phrase_dev).reset_index(drop=True)
medqa_phrases_dev.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_phrases_dev_deduplicated.csv", index = False)

In [None]:
medqa_phrases_test = medqa_phrases_test.drop(index = removed_questions_self_test).reset_index(drop=True)
medqa_phrases_test = medqa_phrases_test.drop(index = removed_answers_self_test).reset_index(drop=True)
remove_full_index_phrase_test = list(set(removed_questions_test).union(set(removed_answers_test)))
medqa_phrases_test = medqa_phrases_test.drop(index = remove_full_index_phrase_test).reset_index(drop=True)
medqa_phrases_test.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_phrases_test_deduplicated.csv", index = False)

In [None]:
medqa_noexclude_train = medqa["../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_train.jsonl"]
medqa_noexclude_dev = medqa["../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_dev.jsonl"]
medqa_noexclude_test = medqa["../dataset/QAs/MedQA-USMLE/questions/US/4_options/phrases_no_exclude_test.jsonl"]

In [None]:
# drop the indices
medqa_noexclude_train = medqa_noexclude_train.drop(index = removed_questions_self_train).reset_index(drop=True)
medqa_noexclude_train = medqa_noexclude_train.drop(index = removed_answers_self_train).reset_index(drop=True)
remove_full_index_noexclude_train = list(set(removed_questions_train).union(set(removed_answers_train)))
medqa_noexclude_train = medqa_noexclude_train.drop(index = remove_full_index_noexclude_train).reset_index(drop=True)
medqa_noexclude_train.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_noexclude_train_deduplicated.csv", index = False)

In [None]:
medqa_noexclude_dev = medqa_noexclude_dev.drop(index = removed_questions_self_dev).reset_index(drop=True)
medqa_noexclude_dev = medqa_noexclude_dev.drop(index = removed_answers_self_dev).reset_index(drop=True)
remove_full_index_noexclude_dev = list(set(removed_questions_dev).union(set(removed_answers_dev)))
medqa_noexclude_dev = medqa_noexclude_dev.drop(index = remove_full_index_noexclude_dev).reset_index(drop=True)
medqa_noexclude_dev.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_noexclude_dev_deduplicated.csv", index = False)

In [None]:
medqa_noexclude_test = medqa_noexclude_test.drop(index = removed_questions_self_test).reset_index(drop=True)
medqa_noexclude_test = medqa_noexclude_test.drop(index = removed_answers_self_test).reset_index(drop=True)
remove_full_index_noexclude_test = list(set(removed_questions_test).union(set(removed_answers_test)))
medqa_noexclude_test = medqa_noexclude_test.drop(index = remove_full_index_noexclude_test).reset_index(drop=True)
medqa_noexclude_test.to_csv("../deduplicated_data/QAs/MedQA-USMLE/medqa_noexclude_test_deduplicated.csv", index = False)

## Deduplicate MedMCQA

In [None]:
# load medmcqa
medmcqa = load_dataset(path = directory + "/MedMCQA", filetype = "jsonl")

Loading JSONL files: 0it [00:00, ?it/s]
Processing file: 100%|██████████| 2/2 [00:00<00:00, 22982.49it/s]
Loading JSONL files: 1it [00:01,  1.38s/it]
Processing file: 100%|██████████| 3/3 [00:00<00:00, 33200.30it/s]
Loading JSONL files: 2it [00:01,  1.26it/s]


current file:  ../dataset/QAs/MedMCQA/data/test.jsonl
current file:  ../dataset/QAs/MedMCQA/data/train.jsonl
current file:  ../dataset/QAs/MedMCQA/data/dev.jsonl


In [None]:
print("Available files" + str(medmcqa.keys()))
medmcqa_train = medmcqa["../dataset/QAs/MedMCQA/data/train.jsonl"]
medmcqa_dev = medmcqa["../dataset/QAs/MedMCQA/data/dev.jsonl"]
medmcqa_test = medmcqa["../dataset/QAs/MedMCQA/data/test.jsonl"]

Available filesdict_keys(['../dataset/QAs/MedMCQA/data/test.jsonl', '../dataset/QAs/MedMCQA/data/train.jsonl', '../dataset/QAs/MedMCQA/data/dev.jsonl'])


In [None]:
def process_medmcqa(df, mode = 'train'):
    df['answer'] = None
    for i, row in enumerate(df.itertuples()):
        if mode != "test":
            answer_row = f"The choices are: A) {row.opa}, B) {row.opb}, C) {row.opc}, D) {row.opd}. The correct answer is {row.cop}, because {row.exp}"
        else:
            answer_row = f"The choices are: A) {row.opa}, B) {row.opb}, C) {row.opc}, D) {row.opd}."
        df.at[i, 'answer'] = answer_row

    return df


In [None]:
medmcqa_train = process_medmcqa(medmcqa_train, mode = 'train')
medmcqa_dev = process_medmcqa(medmcqa_dev, mode = 'dev')
medmcqa_test = process_medmcqa(medmcqa_test, mode = 'test')

In [None]:
# self deduplication first
medmcqa_train_self_dedup, removed_questions_self_train, removed_answers_self_train = deduplication_within_dataset_qa(medmcqa_train)
print(len(removed_questions_self_train), len(removed_answers_self_train))
medmcqa_dev_self_dedup, removed_questions_self_dev, removed_answers_self_dev = deduplication_within_dataset_qa(medmcqa_dev)
print(len(removed_questions_self_dev), len(removed_answers_self_dev))
medmcqa_test_self_dedup, removed_questions_self_test, removed_answers_self_test = deduplication_within_dataset_qa(medmcqa_test)
print(len(removed_questions_self_test), len(removed_answers_self_test))

Generating embeddings: 100%|██████████| 2857/2857 [17:00<00:00,  2.80it/s]
Calcuating Similarity: 100%|██████████| 23/23 [02:18<00:00,  6.00s/it]
Generating embeddings: 100%|██████████| 2488/2488 [17:34<00:00,  2.36it/s]
Calcuating Similarity: 100%|██████████| 20/20 [01:44<00:00,  5.22s/it]


23598 15601


Generating embeddings: 100%|██████████| 66/66 [00:23<00:00,  2.78it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00,  7.89it/s]
Generating embeddings: 100%|██████████| 65/65 [00:25<00:00,  2.51it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00,  7.64it/s]


30 163


Generating embeddings: 100%|██████████| 97/97 [00:34<00:00,  2.85it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00,  4.21it/s]
Generating embeddings: 100%|██████████| 96/96 [00:34<00:00,  2.79it/s]
Calcuating Similarity: 100%|██████████| 1/1 [00:00<00:00,  4.36it/s]

13 674





In [None]:
len(medmcqa_train_self_dedup), len(medmcqa_dev_self_dedup), len(medmcqa_test_self_dedup)

(143623, 3990, 5463)

## Now, we deduplicate between existing datas

In [None]:
medmcqa_test_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_test_fulltext_deduplicated_self.csv", index = False)
medmcqa_dev_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_dev_fulltext_deduplicated_self.csv", index = False)
medmcqa_train_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_train_fulltext_deduplicated_self.csv", index = False)

In [None]:
# load back data
medmcqa_test_self_dedup = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_test_fulltext_deduplicated_self.csv")
medmcqa_dev_self_dedup = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_dev_fulltext_deduplicated_self.csv")
medmcqa_train_self_dedup = pd.read_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_train_fulltext_deduplicated_self.csv")

In [None]:
old_questions = []
old_answers = []

with open("../deduplicated_embeddings/QAs/medicationqa_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medicationqa_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

#pubmed1,2,3
with open("../deduplicated_embeddings/QAs/pubmed1_question_embeddings.pkl", "rb") as f:
    pubmed1_q_embed = pickle.load(f)
    old_questions.append(pubmed1_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed1_answer_embeddings.pkl", "rb") as f:
    pubmed1_a_embed = pickle.load(f)
    old_answers.append(pubmed1_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_question_embeddings.pkl", "rb") as f:
    pubmed2_q_embed = pickle.load(f)
    old_questions.append(pubmed2_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed2_answer_embeddings.pkl", "rb") as f:
    pubmed2_a_embed = pickle.load(f)
    old_answers.append(pubmed2_a_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_question_embeddings.pkl", "rb") as f:
    pubmed3_q_embed = pickle.load(f)
    old_questions.append(pubmed3_q_embed)

with open("../deduplicated_embeddings/QAs/pubmed3_answer_embeddings.pkl", "rb") as f:
    pubmed3_a_embed = pickle.load(f)
    old_answers.append(pubmed3_a_embed)

In [None]:
# load already there data
full_medmcqa_test_self_dedup, removed_questions_full_test, removed_answers_full_test = deduplicate_across_datasets_qa([deduplicated_medicationqa, deduplicated_pubmed1, deduplicated_pubmed2, deduplicated_pubmed3], medmcqa_test_self_dedup, old_questions, old_answers)


Generating embeddings: 100%|██████████| 86/86 [00:31<00:00,  2.73it/s]
Generating embeddings: 100%|██████████| 86/86 [00:30<00:00,  2.86it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:10<00:00, 10.92s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:10<00:00, 10.39s/it]


In [None]:
print(len(removed_questions_full_test), len(removed_answers_full_test))

3 0


In [None]:
full_medmcqa_test_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_test_fulltext_deduplicated.csv", index = False)

In [None]:
with open("../deduplicated_embeddings/QAs/medmcqa_test_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_test_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

In [None]:
# load already there data
full_medmcqa_dev_self_dedup, removed_questions_full_dev, removed_answers_full_dev = deduplicate_across_datasets_qa(medmcqa_dev_self_dedup, old_questions, old_answers)


Generating embeddings: 100%|██████████| 63/63 [00:22<00:00,  2.81it/s]
Generating embeddings: 100%|██████████| 63/63 [00:24<00:00,  2.55it/s]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:09<00:00,  9.08s/it]
Processing dataset1 in chunks: 100%|██████████| 1/1 [00:08<00:00,  8.40s/it]


In [None]:
print(len(removed_questions_full_dev), len(removed_answers_full_dev))
full_medmcqa_dev_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_dev_fulltext_deduplicated.csv", index = False)

5 20


In [None]:
with open("../deduplicated_embeddings/QAs/medmcqa_dev_question_embeddings.pkl", "rb") as f:
    medication_qa_q_embed = pickle.load(f)
    old_questions.append(medication_qa_q_embed)

with open("../deduplicated_embeddings/QAs/medmcqa_dev_answer_embeddings.pkl", "rb") as f:
    medication_qa_a_embed = pickle.load(f)
    old_answers.append(medication_qa_a_embed)

In [None]:
# load already there data
full_medmcqa_train_self_dedup, removed_questions_full_train, removed_answers_full_train = deduplicate_across_datasets_qa(medmcqa_train_self_dedup, old_questions, old_answers)


Generating embeddings: 100%|██████████| 2245/2245 [13:04<00:00,  2.86it/s]
Generating embeddings: 100%|██████████| 2245/2245 [15:48<00:00,  2.37it/s]
Processing dataset1 in chunks: 100%|██████████| 18/18 [04:23<00:00, 14.66s/it]
Processing dataset1 in chunks: 100%|██████████| 18/18 [04:25<00:00, 14.73s/it]


In [None]:
print(len(removed_questions_full_train), len(removed_answers_full_train))
full_medmcqa_train_self_dedup.to_csv("../deduplicated_data/QAs/MedMCQA/medmcqa_train_fulltext_deduplicated.csv", index = False)

222 614
