# Extract SQ_TP, SQ_FP and SQ_R papers from oa_comm database

In [30]:
from Bio import Entrez
import csv
import os
import xml.etree.ElementTree as ET
import copy

## Function that searches for papers using our SQs and returns a list of PubMed IDs

In [4]:
def search_pubmed_for_ids(query, max_results=13):
    """Search PubMed with *query* and return the matching PubMed IDs.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        max_results: Upper bound on the number of IDs requested
            (passed as ``retmax``).

    Returns:
        The ``"IdList"`` entry of the parsed esearch response.
    """
    Entrez.email = "zeynep.korkmaz@tum.de"  # Set email address

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    return record["IdList"]

## Function that reads the keywords and SQs from a directory with CSV files and returns a dictionary

In [5]:

def read_keywords_from_directory(directory):
    """Collect keywords and search queries from every CSV file in *directory*.

    Each CSV is read row by row. The first cell of a row is its label:

    * ``Pub Title`` starts a new publication; the second cell is the title.
    * ``Keywords``, ``SQ_TP``, ``SQ_FP`` and ``SQ_R`` add their non-empty
      remaining cells to the current publication.
    * Rows whose first cell is purely numeric, and rows with any other
      label, are ignored.

    Files are decoded as ``utf-8-sig`` so that a leading byte-order mark does
    not end up glued to the first ``Pub Title`` label; with the BOM attached
    the label comparison fails and the publication is silently dropped.

    Args:
        directory: Path of the directory to scan (not recursive). Only files
            ending in ``.csv`` are read, in sorted filename order.

    Returns:
        A dict keyed by publication title. Each value is a dict with the keys
        ``"Pub Title"``, ``"Keywords"``, ``"SQ_TP"``, ``"SQ_FP"`` and
        ``"SQ_R"``. If the same title occurs more than once, the entry read
        last replaces the earlier one.
    """
    list_fields = ("Keywords", "SQ_TP", "SQ_FP", "SQ_R")
    keywords_dict = {}

    def store(title, lists):
        # Rows collected before any (non-empty) title cannot be keyed.
        if title:
            keywords_dict[title] = {"Pub Title": title, **lists}

    # Sorted so the result does not depend on the file system's listing order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        csv_file = os.path.join(directory, filename)

        # newline="" is what the csv module expects for correct handling of
        # line breaks inside quoted cells.
        with open(csv_file, "r", encoding="utf-8-sig", newline="") as file:
            current_title = None
            current = {field: [] for field in list_fields}

            for row in csv.reader(file):
                row = [item.strip(", ") for item in row]
                if not row or row[0].isdigit():
                    continue

                label = row[0]
                if label == "Pub Title":
                    # A new publication begins: save the one collected so far.
                    store(current_title, current)
                    # Tolerate a "Pub Title" row without a title cell.
                    current_title = row[1] if len(row) > 1 else None
                    current = {field: [] for field in list_fields}
                elif label in current:
                    current[label].extend(item for item in row[1:] if item)

            # Save the last publication of the file.
            store(current_title, current)

    return keywords_dict



## Example usage of read_keywords_from_directory

In [17]:
# Directory that holds the keyword CSV files
input_dir = "Keyword_CSVs"
#input_dir = "less_keywords"

# Build the publication dictionary from all CSV files in that directory
keywords_dict = read_keywords_from_directory(input_dir)

separator = "\n############# \n"

print(separator)

# Why is this only 99? Should be ~140
print(len(keywords_dict))

print(separator)

# Dump every entry, one field per line
for pub_title, data in keywords_dict.items():
    print(f"Pub Title: {data['Pub Title']}")
    for field in ("Keywords", "SQ_TP", "SQ_FP", "SQ_R"):
        print(f"{field}: {', '.join(data[field])}")
    print("\n" + "=" * 80 + "\n")  # Separator between entries


############# 

99

############# 

Pub Title: JAK-STAT1 Signaling Pathway Is an Early Response to Helicobacter pylori Infection and Contributes to Immune Escape and Gastric Carcinogenesis.
Keywords: gastric cancer, GC, STAT1, PD-L1, Helicobacter pylori, H. pylori, immune escape, mouse model
SQ_TP: STAT1 AND PD-L1 AND GC
SQ_FP: (GC OR gastric cancer) NOT H.pylori[Organism], (GC OR gastric cancer) NOT mouse model
SQ_R: 


Pub Title: Downregulation of the vitamin D receptor expression during acute gastrointestinal graft versus host disease is associated with poor outcome after allogeneic stem cell transplantation.
Keywords: vitamin D receptor, VDR, HSCT, GI-GvHD
SQ_TP: Reduced VDR expression AND acute GI-GvHD
SQ_FP: 
SQ_R: 


Pub Title: Microbiome risk profiles as biomarkers for inflammatory and metabolic disorders.
Keywords: inflammatory bowel disease, IBD, type 2 diabetes mellitus, T2DM, bacteria-derived metabolites, review, gut dysbiosis
SQ_TP: 
SQ_FP: 
SQ_R: 


Pub Title: Biosynthet

## Function that takes keyword_dict/input_dict and returns a dict with lists of PubMed IDs based on the SQs

In [18]:
def dict_to_pubmed_id(input_dict):
    """Return a copy of *input_dict* enriched with PubMed IDs per search query.

    For every publication, each query string in ``SQ_TP``, ``SQ_FP`` and
    ``SQ_R`` is sent to ``search_pubmed_for_ids`` and the returned IDs are
    concatenated, in query order, under ``PubMed_IDs_TP``, ``PubMed_IDs_FP``
    and ``PubMed_IDs_R`` respectively.

    Args:
        input_dict: Mapping of publication title to a dict that contains at
            least the list-valued keys ``SQ_TP``, ``SQ_FP`` and ``SQ_R``.

    Returns:
        A new dict with the same titles; each value is a shallow copy of the
        input entry plus the three ID lists. The input entries themselves get
        no new keys.
    """
    # (field holding the queries, field receiving the IDs), in lookup order
    query_to_id_field = (
        ('SQ_TP', 'PubMed_IDs_TP'),
        ('SQ_FP', 'PubMed_IDs_FP'),
        ('SQ_R', 'PubMed_IDs_R'),
    )

    enriched = {}
    for title, entry in input_dict.items():
        record = entry.copy()

        for query_field, id_field in query_to_id_field:
            found_ids = []
            for query in entry[query_field]:
                found_ids.extend(search_pubmed_for_ids(query))
            record[id_field] = found_ids

        enriched[title] = record

    return enriched



In [19]:
# Look up PubMed IDs for the search queries of every entry in keywords_dict.
result_dict = dict_to_pubmed_id(keywords_dict)
# Bare expression: as the last line of a notebook cell it displays the dict.
result_dict

{'JAK-STAT1 Signaling Pathway Is an Early Response to Helicobacter pylori Infection and Contributes to Immune Escape and Gastric Carcinogenesis.': {'Pub Title': 'JAK-STAT1 Signaling Pathway Is an Early Response to Helicobacter pylori Infection and Contributes to Immune Escape and Gastric Carcinogenesis.',
  'Keywords': ['gastric cancer',
   'GC',
   'STAT1',
   'PD-L1',
   'Helicobacter pylori',
   'H. pylori',
   'immune escape',
   'mouse model'],
  'SQ_TP': ['STAT1 AND PD-L1 AND GC'],
  'SQ_FP': ['(GC OR gastric cancer) NOT H.pylori[Organism]',
   '(GC OR gastric cancer) NOT mouse model'],
  'SQ_R': [],
  'PubMed_IDs_TP': ['37483517',
   '35456965',
   '35267483',
   '33782030',
   '33598422',
   '30567732',
   '29259270'],
  'PubMed_IDs_FP': ['38213964',
   '38213922',
   '38213731',
   '38213725',
   '38213713',
   '38213590',
   '38213537',
   '38213535',
   '38213508',
   '38213249',
   '38213117',
   '38212905',
   '38212810',
   '38213964',
   '38213922',
   '38213731',
   '38

In [9]:
# Print only the PubMed IDs found via the SQ_TP queries, one list per
# publication. For a full dump while debugging, print the other fields of
# `data` as well ('Pub Title', 'Keywords', 'SQ_TP', 'SQ_FP', 'SQ_R',
# 'PubMed_IDs_FP', 'PubMed_IDs_R').
for pub_title, data in result_dict.items():
    tp_ids = data['PubMed_IDs_TP']
    print(tp_ids)



['38084837', '37578396', '37167706', '35720495', '35169117', '35158099', '34977371', '34700029', '34549599', '34467981', '34289550', '34160355', '33906557', '38176135', '38161383', '38136224', '38116552', '38103512', '38078655', '38069446', '38030696', '38030136', '37996004', '37978573', '37967787', '37947686']
['32615090']
[]
['37870244', '37470692', '35530162', '35449204', '35069696', '34286518', '33122847', '33039730', '32587776', '32212793', '31117253', '31115521', '31113936', '34913280', '34826601', '33310890', '33122847', '32913197', '32276600', '32189414', '32156781', '32081857', '31662324', '30668360', '30663221', '29360439', '32913197', '25564254', '23243017', '20227039']
['37932372', '33259205', '32951817', '32943788', '27076635', '26604261', '26299961', '25739981', '25073739', '24677795', '37932372', '37832807', '36239538', '34975788', '34531089', '34419450', '33563785', '33508040', '33259205', '32123090', '28031489', '27482742', '27138431', '33259205']
['32619440', '3156951

## Function that takes the dict with lists of PubMed IDs for the SQs, searches the oa_comm db for the corresponding XML papers and combines them all into one large XML file

In [48]:
def extract_xml_files(input_dict, input_dir, output_file="output.xml"):
    """Bundle the XML articles whose PMID was found by a search query.

    Every ``.xml`` file below *input_dir* is parsed once. An article is
    selected for a query group when any of its ``article-id`` elements with
    ``pub-id-type="pmid"`` carries an ID from that group. The output has the
    layout::

        <root>
        <SQ_TP_IDs>...articles...</SQ_TP_IDs>
        <SQ_FP_IDs>...articles...</SQ_FP_IDs>
        <SQ_R_IDs>...articles...</SQ_R_IDs>
        </root>

    Args:
        input_dict: Mapping of publication title to a dict with the
            list-valued keys ``PubMed_IDs_TP``, ``PubMed_IDs_FP`` and
            ``PubMed_IDs_R``.
        input_dir: Directory that is searched recursively for ``.xml`` files.
        output_file: Path of the combined XML file to write.

    Raises:
        xml.etree.ElementTree.ParseError: If one of the XML files is not
            well-formed.
    """
    # (element name in the output, field of input_dict holding the IDs)
    id_fields = (
        ("SQ_TP_IDs", "PubMed_IDs_TP"),
        ("SQ_FP_IDs", "PubMed_IDs_FP"),
        ("SQ_R_IDs", "PubMed_IDs_R"),
    )

    # Sets give constant-time membership tests for the many files checked.
    wanted = {
        tag: {pmid for data in input_dict.values() for pmid in data[field]}
        for tag, field in id_fields
    }
    sq_roots = {tag: ET.Element(tag) for tag, _ in id_fields}

    output_path = os.path.abspath(output_file)

    for root_dir, dirs, files in os.walk(input_dir):
        dirs.sort()  # fixes the traversal order, hence the article order
        for xml_file in sorted(files):
            if not xml_file.endswith(".xml"):
                continue
            xml_file_path = os.path.join(root_dir, xml_file)
            # Never treat the combined file of a previous run as an article.
            if os.path.abspath(xml_file_path) == output_path:
                continue

            article = ET.parse(xml_file_path).getroot()
            article_pmids = {
                (element.text or "").strip()
                for element in article.iter("article-id")
                if element.attrib.get("pub-id-type") == "pmid"
            }

            # One append per group at most, however many IDs match. The same
            # element may sit under several groups: ElementTree elements keep
            # no parent reference, so no copy is needed for serialisation.
            for tag, ids in wanted.items():
                if not article_pmids.isdisjoint(ids):
                    sq_roots[tag].append(article)

    # Written only after the walk so a failure does not leave a partial file.
    with open(output_file, "wb") as f:
        f.write(b"<root>\n")
        for sq_root in sq_roots.values():
            f.write(ET.tostring(sq_root, encoding="utf-8"))
        f.write(b"</root>")

In [52]:
# Combine the matching XML articles found below the given directory into output.xml.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
extract_xml_files(result_dict, '/Users/tillohlendorf/Downloads/Extracted_XML')