In [1]:
# One run of test to deduplicate the bio_med_research dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import pickle

In [None]:
# if use colab, run this part
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/bionlp')

In [2]:
# go to model dir
os.chdir('MedImageInsights')

In [3]:
# set directory to deduplicate
directory = "../dataset/bio_med_research"

In [4]:
# install necessary package
!pip install mup
!pip install fvcore



In [4]:
# load model
from medimageinsightmodel import MedImageInsight

classifier = MedImageInsight(
    model_dir="2024.09.27",
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)

classifier.load_model()

python(38542) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(38543) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Model loaded successfully on device: cpu


In [5]:
# loading dataset
def parse_xml(file):
    tree = ET.parse(file)
    root = tree.getroot()

    sentence_data = []
    for sentence in root.findall('sentence'):
        sentence_id = sentence.get('id')
        sentence_text = sentence.get('text')

        sentence_data.append({
            "sentence_id": sentence_id,
            "sentence_text": sentence_text
        })

    return pd.DataFrame(sentence_data)


def load_dataset(path, filetype = "csv"):
    if filetype == "csv":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading CSV files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".csv"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            df = pd.read_csv(f)
            ds[f] = df
        return ds
    elif filetype == "xml":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading XML files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".xml"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            ds[f] = parse_xml(f)
        return ds
    elif filetype == "jsonl":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSONL files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".jsonl"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            print("current file: ", f)
            with open(f, "r") as file:
                data = [json.loads(line) for line in file]
            ds[f] = pd.DataFrame(data)
        return ds
    elif filetype == "json":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSON files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".json"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            with open(f, "r") as file:
                data = json.load(file)
            ds[f] = pd.DataFrame(data)
        return ds



In [6]:
# functions for deduplication
def get_embeddings(texts, batch_size = 64):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc = "Generating embeddings"):
        batch_texts = texts[i:i+batch_size]
        embeddings.extend(classifier.encode(texts = batch_texts)['text_embeddings'])
    return np.array(embeddings)

In [7]:
bc5cdr = load_dataset(directory + "/bc5cdr", "csv")


python(38991) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Processing file: 100%|██████████| 3/3 [00:00<00:00, 5073.75it/s]
Loading CSV files: 1it [00:00, 77.92it/s]


In [34]:
import pandas as pd
import json
import re

def preprocess_and_parse(df):
    """
    Preprocess the 'passages' column and parse JSON-like strings into structured data.

    Args:
        df (pd.DataFrame): DataFrame with a column containing JSON-like strings.

    Returns:
        pd.DataFrame: Flattened DataFrame with parsed data.
    """
    parsed_data = []

    for index, row in df.iterrows():
        raw_passage = row['passages']

        # Preprocess the JSON-like string
        try:
            # Replace Python constructs with valid JSON
            cleaned_passage = re.sub(r"array\((\[.*?\])\s*,\s*dtype=.*?\)", r"\1", raw_passage)
            cleaned_passage = cleaned_passage.replace("'", "\"")  # Replace single quotes with double quotes
            cleaned_passage = re.sub(r"dtype=object", "", cleaned_passage)  # Remove dtype=object

            # Parse the JSON string
            passages = json.loads(cleaned_passage)

            for passage in passages:
                document_id = passage.get('document_id')
                doc_type = passage.get('type')
                text = passage.get('text')

                # Extract entities
                if 'entities' in passage:
                    for entity in passage['entities']:
                        parsed_data.append({
                            'document_id': document_id,
                            'type': doc_type,
                            'text': text,
                            'entity_id': entity.get('id'),
                            'entity_text': entity.get('text', [None])[0] if isinstance(entity.get('text'), list) else entity.get('text'),
                            'entity_type': entity.get('type'),
                            'normalized_db': entity.get('normalized', [{}])[0].get('db_name') if entity.get('normalized') else None,
                            'normalized_id': entity.get('normalized', [{}])[0].get('db_id') if entity.get('normalized') else None
                        })

                # Extract relations
                if 'relations' in passage:
                    for relation in passage['relations']:
                        parsed_data.append({
                            'document_id': document_id,
                            'type': doc_type,
                            'text': text,
                            'relation_id': relation.get('id'),
                            'relation_type': relation.get('type'),
                            'arg1_id': relation.get('arg1_id'),
                            'arg2_id': relation.get('arg2_id')
                        })

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in row {index}: {e}")

    # Convert parsed data to DataFrame
    return pd.DataFrame(parsed_data)


In [35]:
bc5cdr_train = preprocess_and_parse(bc5cdr["../dataset/bio_med_research/bc5cdr/bc5cdr_train_fulltext.csv"])
bc5cdr_val = preprocess_and_parse(bc5cdr["../dataset/bio_med_research/bc5cdr/bc5cdr_val_fulltext.csv"])
bc5cdr_test = preprocess_and_parse(bc5cdr["../dataset/bio_med_research/bc5cdr/bc5cdr_test_fulltext.csv"])


bc5cdr_train.head()

Error decoding JSON in row 0: Expecting value: line 1 column 152 (char 151)
Error decoding JSON in row 1: Expecting value: line 1 column 128 (char 127)
Error decoding JSON in row 2: Expecting value: line 1 column 172 (char 171)
Error decoding JSON in row 3: Expecting value: line 1 column 227 (char 226)
Error decoding JSON in row 4: Expecting value: line 1 column 194 (char 193)
Error decoding JSON in row 5: Expecting ',' delimiter: line 1 column 74 (char 73)
Error decoding JSON in row 6: Expecting value: line 1 column 193 (char 192)
Error decoding JSON in row 7: Expecting value: line 1 column 129 (char 128)
Error decoding JSON in row 8: Expecting value: line 1 column 174 (char 173)
Error decoding JSON in row 9: Expecting value: line 1 column 166 (char 165)
Error decoding JSON in row 10: Expecting value: line 1 column 245 (char 244)
Error decoding JSON in row 11: Expecting value: line 1 column 166 (char 165)
Error decoding JSON in row 12: Expecting value: line 1 column 130 (char 129)
Err

In [29]:
bc5cdr_train = bc5cdr["../dataset/bio_med_research/bc5cdr/bc5cdr_train_fulltext.csv"]
bc5cdr_train.at[0, "passages"]

"[{'document_id': '227508', 'type': 'title', 'text': 'Naloxone reverses the antihypertensive effect of clonidine.', 'entities': array([{'id': '0', 'offsets': array([array([0, 8], dtype=int32)], dtype=object), 'text': array(['Naloxone'], dtype=object), 'type': 'Chemical', 'normalized': array([{'db_name': 'MESH', 'db_id': 'D009270'}], dtype=object)},\n        {'id': '1', 'offsets': array([array([49, 58], dtype=int32)], dtype=object), 'text': array(['clonidine'], dtype=object), 'type': 'Chemical', 'normalized': array([{'db_name': 'MESH', 'db_id': 'D003000'}], dtype=object)}],\n       dtype=object), 'relations': array([{'id': 'R0', 'type': 'CID', 'arg1_id': 'D008750', 'arg2_id': 'D007022'}],\n       dtype=object)}\n {'document_id': '227508', 'type': 'abstract', 'text': 'In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensi

In [30]:
example = bc5cdr_train.at[0, "passages"]
example


"[{'document_id': '227508', 'type': 'title', 'text': 'Naloxone reverses the antihypertensive effect of clonidine.', 'entities': array([{'id': '0', 'offsets': array([array([0, 8], dtype=int32)], dtype=object), 'text': array(['Naloxone'], dtype=object), 'type': 'Chemical', 'normalized': array([{'db_name': 'MESH', 'db_id': 'D009270'}], dtype=object)},\n        {'id': '1', 'offsets': array([array([49, 58], dtype=int32)], dtype=object), 'text': array(['clonidine'], dtype=object), 'type': 'Chemical', 'normalized': array([{'db_name': 'MESH', 'db_id': 'D003000'}], dtype=object)}],\n       dtype=object), 'relations': array([{'id': 'R0', 'type': 'CID', 'arg1_id': 'D008750', 'arg2_id': 'D007022'}],\n       dtype=object)}\n {'document_id': '227508', 'type': 'abstract', 'text': 'In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensi

In [8]:
from datasets import load_dataset

In [9]:
dataset6 = load_dataset("bigbio/bc5cdr")

In [10]:
dataset6

DatasetDict({
    train: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    test: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['passages'],
        num_rows: 500
    })
})

In [11]:
ds_test = dataset6['train']

In [16]:
str(ds_test['passages'][0])


"[{'document_id': '227508', 'type': 'title', 'text': 'Naloxone reverses the antihypertensive effect of clonidine.', 'entities': [{'id': '0', 'offsets': [[0, 8]], 'text': ['Naloxone'], 'type': 'Chemical', 'normalized': [{'db_name': 'MESH', 'db_id': 'D009270'}]}, {'id': '1', 'offsets': [[49, 58]], 'text': ['clonidine'], 'type': 'Chemical', 'normalized': [{'db_name': 'MESH', 'db_id': 'D003000'}]}], 'relations': [{'id': 'R0', 'type': 'CID', 'arg1_id': 'D008750', 'arg2_id': 'D007022'}]}, {'document_id': '227508', 'type': 'abstract', 'text': 'In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M,

In [6]:
ds_test.to_csv("example.csv", index=False)

In [18]:
# write the file 
for i in range(len(ds_test['passages'])):
    with open("../dataset/bio_med_research/bc5cdr/train_bc5cdr.txt", "a") as f:
        f.write(str(ds_test['passages'][i]) + "\n")

In [19]:
for i in range(len(dataset6['test']['passages'])):
    with open("../dataset/bio_med_research/bc5cdr/test_bc5cdr.txt", "a") as f:
        f.write(str(dataset6['test']['passages'][i]) + "\n")
for i in range(len(dataset6['validation']['passages'])):
    with open("../dataset/bio_med_research/bc5cdr/val_bc5cdr.txt", "a") as f:
        f.write(str(dataset6['validation']['passages'][i]) + "\n")