
## Reading the CSV with all PMCIDs and creating a Python list of all PMCIDs

In [None]:
import pandas as pd

In [None]:
# Change the path to the CSV file with the path on the server
all_pmcid = pd.read_csv(
    "/Users/zeynepkorkmaz/Desktop/Master 1/Systems Biomedicine/NLP in biomedical research/all_pmcid.csv",
    # Read the ID columns as text so the `.str` accessor below works even when
    # every cell of a column holds a single number.
    dtype={"PMC_IDs_TP": str, "PMC_IDs_FP": str, "PMC_IDs_R": str},
)


def _flatten_id_column(id_column):
    """Return one flat list with every ID stored in ``id_column``.

    Each non-empty cell is split on ", " and the pieces are concatenated in
    row order. Empty (NaN) cells are skipped, and a column without any IDs
    yields an empty list.
    """
    return id_column.dropna().str.split(", ").explode().tolist()


# Concatenate all lists under the same column into one list
PMC_IDs_TP_list = _flatten_id_column(all_pmcid["PMC_IDs_TP"])
PMC_IDs_FP_list = _flatten_id_column(all_pmcid["PMC_IDs_FP"])
PMC_IDs_R_list = _flatten_id_column(all_pmcid["PMC_IDs_R"])

#print(PMC_IDs_R_list)

#### The lists above contain only the numeric part of each ID. Here we add the prefix "PMC" to the beginning of every element.

In [None]:
# Build the three prefixed lists in one pass: every numeric ID gets "PMC" put in front of it.
prefixed_PMC_ID_TP, prefixed_PMC_ID_FP, prefixed_PMC_ID_R = (
    ["PMC" + numeric_id for numeric_id in id_list]
    for id_list in (PMC_IDs_TP_list, PMC_IDs_FP_list, PMC_IDs_R_list)
)

In [None]:
#print(prefixed_PMC_ID_FP)

## Option 1: This section uses PMCIDs and PMIDs to search for XML files

### Keep in mind that this option requires a list of PMIDs in addition to the list of PMCIDs, and the order of the IDs in the two lists should match

In [None]:
from lxml import etree as ET
import os
import shutil

In [None]:

def extract_xml_from_pmcid_and_pmid(input_list_pmids, input_list_pmcids, input_dir, output_dir):
    """Save every XML article under ``input_dir`` whose PMID or PMCID is wanted.

    ``input_dir`` is walked recursively. Each file ending in ``.xml`` is parsed
    and its ``article-id`` elements are inspected: an element with
    ``pub-id-type="pmid"`` is compared against ``input_list_pmids`` and one
    with ``pub-id-type="pmc"`` against ``input_list_pmcids``. Whitespace around
    the ID text is ignored. On the first match, the parsed tree is written to
    ``output_dir`` under the original file name (UTF-8, with XML declaration,
    pretty-printed) and the rest of that file's IDs are not checked.

    Every ID is matched at most once: after a file has been saved because of a
    given PMID (or PMCID), later files carrying the same ID are not saved for
    that ID again. Files with the same name found in different sub-directories
    are written to the same output path, so the later one overwrites the
    earlier one.

    Args:
        input_list_pmids: Iterable of PMID strings to look for.
        input_list_pmcids: Iterable of PMCID strings to look for, in the exact
            form in which they appear in the XML files.
        input_dir: Root directory that is searched recursively for XML files.
        output_dir: Directory that receives the matching files; it is created
            if it does not exist.

    Returns:
        None. Files that cannot be parsed are reported with ``print`` and skipped.
    """
    # Sets make each membership test O(1) instead of scanning a list.
    wanted_pmids = set(input_list_pmids)
    wanted_pmcids = set(input_list_pmcids)
    found_pmids = set()
    found_pmcids = set()

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    for subdir, _, files in os.walk(input_dir):
        for file in files:
            if not file.endswith('.xml'):
                continue

            xml_file_path = os.path.join(subdir, file)
            try:
                # no_network=True: never fetch external resources referenced by
                # the document while parsing article files.
                parser = ET.XMLParser(recover=True, no_network=True)
                tree = ET.parse(xml_file_path, parser)
            except ET.ParseError as e:
                print(f"Skipping file {xml_file_path} due to parse error: {e}")
                continue

            root = tree.getroot()
            if root is None:
                # A recovering parser can hand back a tree without a root element.
                print(f"Skipping file {xml_file_path} due to parse error: no root element")
                continue

            for element in root.xpath('.//article-id'):
                pub_id_type = element.get('pub-id-type')
                if pub_id_type == 'pmid':
                    wanted, found = wanted_pmids, found_pmids
                elif pub_id_type == 'pmc':
                    wanted, found = wanted_pmcids, found_pmcids
                else:
                    continue

                text = element.text.strip() if element.text else None
                if text is None or text not in wanted or text in found:
                    continue

                # Save the XML file in the output directory
                output_file_path = os.path.join(output_dir, file)
                tree.write(output_file_path, encoding='utf-8', xml_declaration=True, pretty_print=True)

                # Remember the ID so that it is not matched a second time
                found.add(text)
                break  # One matching 'article-id' is enough for this file


## Option 2: This section only uses PMCIDs and checks the file names of XML files in the input directory.

In [None]:
import os
import shutil

def extract_xml_from_PMCID(PMC_input_list, input_all_xml, output_dir_for_set):
    """Copy the files in ``input_all_xml`` whose base name is a wanted PMCID.

    Only the top level of ``input_all_xml`` is inspected. An entry is selected
    when its name without the extension is contained in ``PMC_input_list``;
    the extension itself is not checked. Selected regular files are copied
    with ``shutil.copy2`` into ``output_dir_for_set`` under the same name.
    Sub-directories are skipped even if their name matches an ID.

    Args:
        PMC_input_list: Iterable of PMCID strings, written exactly like the
            file base names (e.g. with the "PMC" prefix).
        input_all_xml: Directory that contains all XML files.
        output_dir_for_set: Directory that receives the copies; it is created
            if it does not exist.

    Returns:
        None.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_dir_for_set, exist_ok=True)

    # A set makes the per-file lookup O(1) instead of scanning the whole list.
    wanted_ids = set(PMC_input_list)

    for entry in os.listdir(input_all_xml):
        # Base name = file name without its extension
        if os.path.splitext(entry)[0] not in wanted_ids:
            continue

        source_path = os.path.join(input_all_xml, entry)
        # copy2 only handles files; a directory named like an ID would raise.
        if not os.path.isfile(source_path):
            continue

        # Copy the matching file to the output folder
        shutil.copy2(source_path, os.path.join(output_dir_for_set, entry))

## Run the function to create different directories for each set. This saves matching XML files to the corresponding directories

In [None]:
# Directory that contains all XML files; it is passed to extract_xml_from_PMCID
# as the folder to search.
# NOTE(review): this is a machine-specific absolute path — update it before
# running the notebook anywhere else.
input_dir = '/Users/zeynepkorkmaz/Downloads/All_Articles/PMC000xxxxxx' # Change according to where we keep all XML files

# One output directory per ID set (TP, FP, R); matching XML files are copied into them.
# NOTE(review): the "/add/folder/dir/in/server" prefix looks like a placeholder —
# replace it with the real location on the server.
output_dir_TP = "/add/folder/dir/in/server/TP_all_XML"
output_dir_FP = "/add/folder/dir/in/server/FP_all_XML"
output_dir_R = "/add/folder/dir/in/server/R_all_XML"

In [None]:
# Copy the XML files of each ID set into that set's own output directory
for pmc_id_set, set_output_dir in (
    (prefixed_PMC_ID_TP, output_dir_TP),
    (prefixed_PMC_ID_FP, output_dir_FP),
    (prefixed_PMC_ID_R, output_dir_R),
):
    extract_xml_from_PMCID(pmc_id_set, input_dir, set_output_dir)