In [8]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
import json
from tqdm import tqdm

In [2]:
def parse_qa_xml(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    # Iterate through each question
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)

        # Extract question details
        original_question = question.find("Original-Question")
        subject = original_question.find("SUBJECT").text if original_question.find("SUBJECT") is not None else None
        message = original_question.find("MESSAGE").text if original_question.find("MESSAGE") is not None else None
        paraphrase = question.find("NIST-PARAPHRASE").text if question.find("NIST-PARAPHRASE") is not None else None
        summary = question.find("NLM-Summary").text if question.find("NLM-Summary") is not None else None

        # Extract annotations
        annotations = question.find("ANNOTATIONS")
        focuses = []
        types = []
        keywords = []

        if annotations is not None:
            for focus in annotations.findall("FOCUS"):
                focuses.append({
                    "fid": focus.attrib.get("fid"),
                    "fcategory": focus.attrib.get("fcategory"),
                    "text": focus.text,
                })

            for type_elem in annotations.findall("TYPE"):
                types.append({
                    "tid": type_elem.attrib.get("tid"),
                    "hasFocus": type_elem.attrib.get("hasFocus"),
                    "hasKeyword": type_elem.attrib.get("hasKeyword"),
                    "text": type_elem.text,
                })

            for keyword in annotations.findall("KEYWORD"):
                keywords.append({
                    "kid": keyword.attrib.get("kid"),
                    "kcategory": keyword.attrib.get("kcategory"),
                    "text": keyword.text,
                })

        # Extract reference answers
        reference_answers = []
        ref_answers_elem = question.find("ReferenceAnswers")
        if ref_answers_elem is not None:
            for ref_answer in ref_answers_elem.findall("RefAnswer"):
                reference_answers.append({
                    "aid": ref_answer.attrib.get("aid"),
                    "text": ref_answer.find("ANSWER").text if ref_answer.find("ANSWER") is not None else None,
                    "url": ref_answer.find("AnswerURL").text if ref_answer.find("AnswerURL") is not None else None,
                    "comment": ref_answer.find("COMMENT").text if ref_answer.find("COMMENT") is not None else None,
                })

        # Append structured data
        data.append({
            "qid": qid,
            "subject": subject,
            "message": message,
            "paraphrase": paraphrase,
            "summary": summary,
            "focuses": focuses,
            "types": types,
            "keywords": keywords,
            "reference_answers": reference_answers,
        })

    return pd.DataFrame(data)

In [3]:
def parse_document_xml(file_path):
    """
    Parse the XML file into a structured pandas DataFrame.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        pd.DataFrame: A DataFrame containing extracted data.
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for parsed data
    data = []

    # Extract document-level information
    doc_id = root.attrib.get("id", None)
    source = root.attrib.get("source", None)
    url = root.attrib.get("url", None)

    # Extract focus information
    focus_elem = root.find("Focus")
    focus = focus_elem.text.strip() if focus_elem is not None else None

    # Extract UMLS annotations
    umls_elem = root.find("FocusAnnotations/UMLS")
    umls_cuis = []
    semantic_types = []
    semantic_group = None

    if umls_elem is not None:
        umls_cuis = [cui.text.strip() for cui in umls_elem.findall("CUIs/CUI")]
        semantic_types = [stype.text.strip() for stype in umls_elem.findall("SemanticTypes/SemanticType")]
        semantic_group_elem = umls_elem.find("SemanticGroup")
        semantic_group = semantic_group_elem.text.strip() if semantic_group_elem is not None else None

    # Extract QA pairs
    qa_pairs_elem = root.find("QAPairs")
    if qa_pairs_elem is not None:
        for qa_pair in qa_pairs_elem.findall("QAPair"):
            pid = qa_pair.attrib.get("pid", None)

            # Extract question details
            question_elem = qa_pair.find("Question")
            question_id = question_elem.attrib.get("qid", None) if question_elem is not None else None
            question_type = question_elem.attrib.get("qtype", None) if question_elem is not None else None
            question_text = question_elem.text.strip() if question_elem is not None else None

            # Extract answer details
            answer_elem = qa_pair.find("Answer")
            answer_text = "".join(answer_elem.itertext()).strip() if answer_elem is not None else None

            # Store the extracted data
            data.append({
                "doc_id": doc_id,
                "source": source,
                "url": url,
                "focus": focus,
                "umls_cuis": umls_cuis,
                "semantic_types": semantic_types,
                "semantic_group": semantic_group,
                "pid": pid,
                "question_id": question_id,
                "question_type": question_type,
                "question_text": question_text,
                "answer_text": answer_text
            })

    # Convert data to pandas DataFrame
    df = pd.DataFrame(data)
    return df

In [4]:
file_path = "/Users/tianyixu/Documents/research/bionlp/dataset/QAs/MedQuAD/1_CancerGov_QA/0000001_1.xml"  # Replace with your file path
parse_document_xml(file_path)

Unnamed: 0,doc_id,source,url,focus,umls_cuis,semantic_types,semantic_group,pid,question_id,question_type,question_text,answer_text
0,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,1,0000001_1-1,information,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...
1,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,2,0000001_1-2,symptoms,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,..."
2,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,3,0000001_1-3,exams and tests,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...
3,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,4,0000001_1-4,outlook,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...
4,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,5,0000001_1-5,susceptibility,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...
5,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,6,0000001_1-6,stages,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...
6,0000001_1,CancerGov,https://www.cancer.gov/types/leukemia/patient/...,Adult Acute Lymphoblastic Leukemia,[C0751606],[T191],Disorders,7,0000001_1-7,treatment,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...


In [3]:
# Example usage
file_path = "dataset/QAs/LiveQA/TREC-2017-LiveQA-Medical-Test-Questions-w-summaries.xml"  # Replace with your file path
qa_df = parse_qa_xml(file_path)

In [4]:
qa_df

Unnamed: 0,qid,subject,message,paraphrase,summary,focuses,types,keywords,reference_answers
0,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,What is the relationship between Noonan syndro...,"[{'fid': 'F1', 'fcategory': 'Problem', 'text':...","[{'tid': 'T1', 'hasFocus': 'F1,F2', 'hasKeywor...",[],"[{'aid': 'TQ1A1', 'text': ' Noonan's syndrome ..."
1,TQ2,Gluten information,Re:NDC# 0115-0672-50 Zolmitriptan tabkets 5mg....,Do 5 mg. Zolmitriptan tabkets contain gluten?,Do Zolmitriptan 5mg tablets manufactured by G...,"[{'fid': 'F1', 'fcategory': 'DrugSupplement', ...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...","[{'kid': 'K1', 'kcategory': 'Substance', 'text...",[]
2,TQ3,amphetamine salts 20 mg,are they gluten free\t,Are amphetamine salts of 20 mg dosage gluten f...,Do amphetamine salts 20mg tablets contain gluten?,"[{'fid': 'F1', 'fcategory': 'DrugSupplement', ...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...","[{'kid': 'K1', 'kcategory': 'Substance', 'text...",[]
3,TQ4,vdrl positive,vdrl positive patients please tell me what ar...,What are the treatments and precautions for VD...,What are the treatments and precautions for VD...,"[{'fid': 'F1', 'fcategory': 'Problem', 'text':...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...",[],[]
4,TQ5,how much glucagon,How much glucose is in my GlucaGen HypoKit ? ...,How much glucagon is in my GlucaGen kit?,How much glucagon is in the GlucaGen HypoKit a...,"[{'fid': 'F1', 'fcategory': 'DrugSupplement', ...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...","[{'kid': 'K1', 'kcategory': 'Substance', 'text...",[]
...,...,...,...,...,...,...,...,...,...
99,TQ100,General health,how does effextor cause ED and what is the mi...,To what extent does Effexor cause ED?,Could effexor cause ED and in what doses?,"[{'fid': 'F1', 'fcategory': 'DrugSupplement', ...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...","[{'kid': 'K1', 'kcategory': 'Problem', 'text':...",[]
100,TQ101,NSAIDS as a potential cause of ED,How long has this non prescription drug been i...,How long has Non-aspirin NSAID been implicated...,Could NSAIDS cause erectile dysfunction?,"[{'fid': 'F1', 'fcategory': 'DrugSupplement', ...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...","[{'kid': 'K1', 'kcategory': 'Problem', 'text':...",[]
101,TQ102,General health,i want to know more about aeortic stenosis,"What is aortic stenosis, and is there anything...",What is aortic stenosis?,"[{'fid': 'F1', 'fcategory': 'Problem', 'text':...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...",[],[]
102,TQ103,,What can cause white cells ti uprate,,What causes increase in white blood cell count?,"[{'fid': 'F1', 'fcategory': 'Problem', 'text':...","[{'tid': 'T1', 'hasFocus': 'F1', 'hasKeyword':...",[],[]


In [5]:
# loading dataset
def parse_xml(file):
    tree = ET.parse(file)
    root = tree.getroot()

    sentence_data = []
    for sentence in root.findall('sentence'):
        sentence_id = sentence.get('id')
        sentence_text = sentence.get('text')

        sentence_data.append({
            "sentence_id": sentence_id,
            "sentence_text": sentence_text
        })

    return pd.DataFrame(sentence_data)


def load_dataset(path, filetype = "csv"):
    if filetype == "csv":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading CSV files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".csv"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            df = pd.read_csv(f)
            ds[f] = df
        return ds
    elif filetype == "xml":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading XML files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".xml"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            ds[f] = parse_document_xml(f)
        return ds
    elif filetype == "jsonl":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSONL files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".jsonl"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            print("current file: ", f)
            with open(f, "r") as file:
                data = [json.loads(line) for line in file]
            ds[f] = pd.DataFrame(data)
        return ds
    elif filetype == "json":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSON files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".json"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            with open(f, "r") as file:
                data = json.load(file)
            ds[f] = pd.DataFrame(data)
        return ds



In [6]:
def parse_document_xml(file_path):
    """
    Parse the XML file into a structured pandas DataFrame.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        pd.DataFrame: A DataFrame containing extracted data.
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for parsed data
    data = []

    # Extract document-level information
    doc_id = root.attrib.get("id", None)
    source = root.attrib.get("source", None)
    url = root.attrib.get("url", None)

    # Extract focus information
    focus_elem = root.find("Focus")
    if focus_elem is not None:
        focus = focus_elem.text.strip() if focus_elem.text is not None else None
    else:
        focus = None

    # Extract UMLS annotations
    umls_elem = root.find("FocusAnnotations/UMLS")
    umls_cuis = []
    semantic_types = []
    semantic_group = None

    if umls_elem is not None:
        umls_cuis = [cui.text.strip() for cui in umls_elem.findall("CUIs/CUI")]
        semantic_types = [stype.text.strip() for stype in umls_elem.findall("SemanticTypes/SemanticType")]
        semantic_group_elem = umls_elem.find("SemanticGroup")
        semantic_group = semantic_group_elem.text.strip() if semantic_group_elem.text is not None else None

    # Extract QA pairs
    qa_pairs_elem = root.find("QAPairs")
    if qa_pairs_elem is not None:
        for qa_pair in qa_pairs_elem.findall("QAPair"):
            pid = qa_pair.attrib.get("pid", None)

            # Extract question details
            question_elem = qa_pair.find("Question")
            question_id = question_elem.attrib.get("qid", None) if question_elem.attrib is not None else None
            question_type = question_elem.attrib.get("qtype", None) if question_elem.attrib is not None else None
            question_text = question_elem.text.strip() if question_elem.text is not None else None

            # Extract answer details
            answer_elem = qa_pair.find("Answer")
            answer_text = "".join(answer_elem.itertext()).strip() if "".join(answer_elem.itertext()) is not None else None

            # Store the extracted data
            data.append({
                "doc_id": doc_id,
                "source": source,
                "url": url,
                "focus": focus,
                "umls_cuis": umls_cuis,
                "semantic_types": semantic_types,
                "semantic_group": semantic_group,
                "pid": pid,
                "question_id": question_id,
                "question_type": question_type,
                "question_text": question_text,
                "answer_text": answer_text
            })

    # Convert data to pandas DataFrame
    df = pd.DataFrame(data)
    return df

In [9]:
medquad_ds = load_dataset("dataset/QAs/MedQuAD", filetype = "xml")

Processing file: 100%|██████████| 3/3 [00:00<00:00, 85598.04it/s]
Processing file: 100%|██████████| 1086/1086 [00:00<00:00, 1853138.38it/s]
Processing file: 100%|██████████| 2685/2685 [00:00<00:00, 1991812.21it/s]
Processing file: 100%|██████████| 277/277 [00:00<00:00, 1629484.16it/s]
Processing file: 100%|██████████| 99/99 [00:00<00:00, 1431848.61it/s]
Processing file: 100%|██████████| 116/116 [00:00<00:00, 1683526.87it/s]
Processing file: 100%|██████████| 88/88 [00:00<00:00, 1419610.58it/s]
Processing file: 100%|██████████| 981/981 [00:00<00:00, 2014005.00it/s]
Processing file: 100%|██████████| 48/48 [00:00<00:00, 1088251.85it/s]
Processing file: 100%|██████████| 4366/4366 [00:00<00:00, 562850.20it/s]
Processing file: 100%|██████████| 59/59 [00:00<00:00, 106298.94it/s]
Processing file: 100%|██████████| 5/5 [00:00<00:00, 9416.94it/s]
Processing file: 0it [00:00, ?it/s]
Processing file: 100%|██████████| 3/3 [00:00<00:00, 149796.57it/s]
Processing file: 0it [00:00, ?it/s]
Processing fil

In [11]:
len(medquad_ds)

11274

In [12]:
# check if all keys end with .xml
for k in medquad_ds.keys():
    if not k.endswith(".xml"):
        print(k)


In [1]:
# calculate how many samples were from a certain dataset.
import os
import pandas as pd
import json
from tqdm import tqdm
import sys
import xml.etree.ElementTree as ET

def parse_nlm_questions(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for the parsed data
    data = []

    # Iterate through each NLM-QUESTION
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)
        subject = question.find("SUBJECT").text if question.find("SUBJECT") is not None else None
        message = question.find("MESSAGE").text if question.find("MESSAGE") is not None else None

        # Extract sub-questions
        sub_questions = question.find("SUB-QUESTIONS")
        if sub_questions is not None:
            for sub_question in sub_questions.findall("SUB-QUESTION"):
                # Extract annotations
                annotations = sub_question.find("ANNOTATIONS")
                focus = annotations.find("FOCUS").text if annotations is not None and annotations.find("FOCUS") is not None else None
                qtype = annotations.find("TYPE").text if annotations is not None and annotations.find("TYPE") is not None else None

                # Extract answers
                answers_elem = sub_question.find("ANSWERS")
                answers = []
                if answers_elem is not None:
                    for answer in answers_elem.findall("ANSWER"):
                        answers.append(answer.text.strip())

                # Store the parsed data
                data.append({
                    "qid": qid,
                    "subject": subject,
                    "question": message,
                    "focus": focus,
                    "type": qtype,
                    "answer": answers
                })

    # Convert data to a pandas DataFrame
    return pd.DataFrame(data)

def parse_nlm_questions_test(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for the parsed data
    data = []

    # Iterate through each NLM-QUESTION
    for question in root.findall("NLM-QUESTION"):
        qid = question.attrib.get("qid", None)

        # Extract subject and message
        subject_elem = question.find("./Original-Question/SUBJECT")
        subject = subject_elem.text.strip() if subject_elem.text is not None else None

        message_elem = question.find("./Original-Question/MESSAGE")
        message = message_elem.text.strip() if message_elem.text is not None else None

        # Extract answers
        answers = []
        reference_answers = question.find("ReferenceAnswers")
        if reference_answers is not None:
            for ref_answer in reference_answers.findall("RefAnswer"):
                answer_elem = ref_answer.find("ANSWER")
                if answer_elem is not None:
                    # Join all parts of the answer into a single string, stripping whitespace
                    answer_text = "".join(answer_elem.itertext()).strip()
                    answers.append(answer_text)
            if reference_answers.find("RefAnswer") is None:
                for ref_answer in reference_answers.findall("ReferenceAnswer"):
                    answer_elem = ref_answer.find("ANSWER")
                    if answer_elem is not None:
                        # Join all parts of the answer into a single string, stripping whitespace
                        answer_text = "".join(answer_elem.itertext()).strip()
                        answers.append(answer_text)

        # Append to the dataset
        data.append({
            "qid": qid,
            "subject": subject,
            "question": message,
            "answer": answers  # Store all answers as a list
        })

    # Convert data to a pandas DataFrame
    return pd.DataFrame(data)

# Remove NaN values from the "question" and "answer" columns
def clean_dataframe(df):
    # Ensure "question" and "answer" columns exist and are non-empty
    df["question"] = df["question"].fillna("").astype(str)
    df["answer"] = df["answer"].fillna("").astype(str)

    # Remove rows where "question" or "answer" is an empty string
    df = df[(df["question"].str.strip() != "") & (df["answer"].str.strip() != "")]
    return df.reset_index(drop=True)

def load_dataset(path, filetype = "csv"):
    if filetype == "csv":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading CSV files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".csv"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            df = pd.read_csv(f)
            ds[f] = df
        return ds
    elif filetype == "jsonl":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSONL files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".jsonl"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            print("current file: ", f)
            with open(f, "r") as file:
                data = [json.loads(line) for line in file]
            ds[f] = pd.DataFrame(data)
        return ds
    elif filetype == "json":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading JSON files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".json"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            with open(f, "r") as file:
                data = json.load(file)
            ds[f] = pd.DataFrame(data)
        return ds
    elif filetype == "xml":
        all_files = []
        for root, dirs, files in tqdm(os.walk(path), desc = "Loading XML files"):
            for file in tqdm(files, desc = "Processing file"):
                if file.endswith(".xml"):
                    all_files.append(os.path.join(root, file))
        ds = {}
        for f in all_files:
            print("Current file: ", f)
            if "LiveQA" in f:
                if "summaries" in f:
                    continue
                if "Test" in f:
                    ds[f] = clean_dataframe(parse_nlm_questions_test(f))
                else:
                    ds[f] = clean_dataframe(parse_nlm_questions(f))
            else:
                pass
        return ds
    
def parse_document_xml(file_path):
    """
    Parse the XML file into a structured pandas DataFrame.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        pd.DataFrame: A DataFrame containing extracted data.
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize storage for parsed data
    data = []

    # Extract document-level information
    doc_id = root.attrib.get("id", None)
    source = root.attrib.get("source", None)
    url = root.attrib.get("url", None)

    # Extract focus information
    focus_elem = root.find("Focus")
    if focus_elem is not None:
        focus = focus_elem.text.strip() if focus_elem.text is not None else None
    else:
        focus = None

    # Extract UMLS annotations
    umls_elem = root.find("FocusAnnotations/UMLS")
    umls_cuis = []
    semantic_types = []
    semantic_group = None

    if umls_elem is not None:
        umls_cuis = [cui.text.strip() for cui in umls_elem.findall("CUIs/CUI")]
        semantic_types = [stype.text.strip() for stype in umls_elem.findall("SemanticTypes/SemanticType")]
        semantic_group_elem = umls_elem.find("SemanticGroup")
        semantic_group = semantic_group_elem.text.strip() if semantic_group_elem.text is not None else None

    # Extract QA pairs
    qa_pairs_elem = root.find("QAPairs")
    if qa_pairs_elem is not None:
        for qa_pair in qa_pairs_elem.findall("QAPair"):
            pid = qa_pair.attrib.get("pid", None)

            # Extract question details
            question_elem = qa_pair.find("Question")
            question_id = question_elem.attrib.get("qid", None) if question_elem.attrib is not None else None
            question_type = question_elem.attrib.get("qtype", None) if question_elem.attrib is not None else None
            question_text = question_elem.text.strip() if question_elem.text is not None else None

            # Extract answer details
            answer_elem = qa_pair.find("Answer")
            answer_text = "".join(answer_elem.itertext()).strip() if "".join(answer_elem.itertext()) is not None else None

            # Store the extracted data
            data.append({
                "doc_id": doc_id,
                "source": source,
                "url": url,
                "focus": focus,
                "umls_cuis": umls_cuis,
                "semantic_types": semantic_types,
                "semantic_group": semantic_group,
                "pid": pid,
                "question_id": question_id,
                "question_type": question_type,
                "question_text": question_text,
                "answer_text": answer_text
            })

    # Convert data to pandas DataFrame
    df = pd.DataFrame(data)
    return df

def count_data_num(df):
    return len(df)

In [2]:
dataset_name = "PubMedQA"
file_type = "json"
data = load_dataset("dataset/QAs/" + dataset_name, file_type)
total_num = 0
for key, value in data.items():
    if "summaries" in key:
        continue
    # print(f"For the {dataset_name} file {key}")
    # print(f"There are {count_data_num(value)} samples in the dataset.")
    total_num += count_data_num(value)
print(f"Total number of samples in the dataset: {total_num}")
deduplicated_data = load_dataset("deduplicated_data/QAs/" + dataset_name, "csv")
total_num_dedup = 0
for key, value in deduplicated_data.items():
    # print(f"For the deduplicated {dataset_name} file {key}")
    # print(f"There are {count_data_num(value)} samples in the dataset.")
    total_num_dedup += count_data_num(value)
print(f"Total number of samples in the deduplicated dataset: {total_num_dedup}")

Processing file: 100%|██████████| 3/3 [00:00<00:00, 20971.52it/s]
Loading JSON files: 1it [00:00, 327.55it/s]




Total number of samples in the dataset: 21


Processing file: 100%|██████████| 5/5 [00:00<00:00, 131072.00it/s]
Loading CSV files: 1it [00:00, 547.34it/s]


Total number of samples in the deduplicated dataset: 543897


In [4]:
data.keys()

dict_keys(['dataset/QAs/PubMedQA/ori_pqaa.json', 'dataset/QAs/PubMedQA/ori_pqau.json', 'dataset/QAs/PubMedQA/ori_pqal.json'])

In [6]:
data['dataset/QAs/PubMedQA/ori_pqau.json'].T

Unnamed: 0,QUESTION,CONTEXTS,LABELS,MESHES,YEAR,LONG_ANSWER
14499029,Is naturopathy as effective as conventional th...,[Although the use of alternative medicine in t...,"[BACKGROUND, OBJECTIVE, DESIGN, SETTING, PATIE...","[Anxiety, Cohort Studies, Confidence Intervals...",2003,Naturopathy appears to be an effective alterna...
14499049,Can randomised trials rely on existing electro...,"[To estimate the feasibility, utility and reso...","[OBJECTIVES, DATA SOURCES, REVIEW METHODS, RES...","[Arthroplasty, Replacement, Knee, Bias, Blood ...",2003,Routine data have the potential to support hea...
14499672,Is laparoscopic radical prostatectomy better t...,[To compare morbidity in two groups of patient...,"[OBJECTIVE, PATIENTS AND METHODS, RESULTS]","[Aged, Follow-Up Studies, Humans, Italy, Lapar...",2003,The results of our non-randomized study show t...
14499773,Does bacterial gastroenteritis predispose peop...,[Irritable bowel syndrome (IBS) might develop ...,"[OBJECTIVES, METHODS, RESULTS]","[Adolescent, Adult, Age Distribution, Aged, Ag...",2003,Symptoms consistent with IBS and functional di...
14499777,Is early colonoscopy after admission for acute...,[Urgent colonoscopy has been proposed for the ...,"[OBJECTIVES, METHODS, RESULTS]","[Acute Disease, Aged, Aged, 80 and over, Cohor...",2003,No significant association is apparent between...
...,...,...,...,...,...,...
10632750,Diversion colitis in children: an iatrogenic a...,"[Diversion colitis (DC) is a localized, relati...","[AIMS, METHODS AND RESULTS]","[Appendix, Child, Preschool, Colitis, Colitis,...",2000,Histological features of DC in children are ve...
10632796,Raising research awareness among midwives and ...,[The primary aim of the study was to evaluate ...,"[OBJECTIVE, DESIGN, SAMPLE, ETHICS, DATA COLLE...","[Attitude of Health Personnel, Chi-Square Dist...",2000,The introduction of clinical governance challe...
10632828,Delivery of primary care to women. Do women's ...,[Women's health centers have been increasing i...,"[OBJECTIVE, DESIGN, SETTING, PARTICIPANTS, MEA...","[Adolescent, Adult, Aged, Cross-Sectional Stud...",2000,"These results suggest that, at least in this s..."
10633786,Analysis of failures after whole abdominal irr...,[To evaluate failures and to investigate the n...,"[BACKGROUND, PATIENTS AND METHOD, RESULTS]","[Abdomen, Adult, Aged, Aged, 80 and over, Comb...",1999,General prophylactic enclosure of the inguinal...


In [24]:
directory = "dataset/QAs/"

In [25]:
trec_qa_train_1 = parse_nlm_questions(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Train-1.xml")
trec_qa_train_2 = parse_nlm_questions(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Train-2.xml")
trec_qa_test = parse_nlm_questions_test(directory + "/LiveQA/TREC-2017-LiveQA-Medical-Test.xml")

In [32]:
trec_qa_train_1

Unnamed: 0,qid,subject,question,focus,type,answer
0,,,Literature on Cardiac amyloidosis. Please let...,cardiac amyloidosis,information,[Cardiac amyloidosis is a disorder caused by d...
1,,treatment options versus migraine types,Migraine seems to be a spectrum of conditions ...,migraine,treatment,[There is no specific cure for migraine headac...
2,,,DO I USE PYRIDOXINE TABLETS EVEN IF IM PREGNANT?,pyridoxine,contraindication,"[Before taking pyridoxine, tell your do..."
3,,cramp,i have lymphoma what causes cramp after chemo ...,cramp,cause,[Muscle cramps are common and often occur when...
4,,Retina,I wonder of new research and testing on macula...,macular degeneration,treatment,[These resources address the diagnosis or mana...
...,...,...,...,...,...,...
249,,Duloxetine Hcl Dr,When is the best time of day to take Duloxetin...,Duloxetine,usage,[Duloxetine comes as a delayed-release (releas...
250,,Peptic Ulcers & Tylenol,"I have a history of peptic ulcers, so I don't ...",Tylenol,contraindication,[Liver warning: This product contains acetamin...
251,,,What is Nephrotic Syndrome. What are its caus...,Nephrotic Syndrome,information,[Nephrotic syndrome is a collection of symptom...
252,,,What is Nephrotic Syndrome. What are its caus...,Nephrotic Syndrome,cause,[Nephrotic syndrome can be caused by diseases ...
