## Task Force A: Fetching PubMed IDs from queries

In [4]:
from Bio import Entrez
import csv

### Step 1: Define a function that searches for papers and returns their PubMed IDs

In [5]:
def search_pubmed_for_ids(query, max_results=13):
    """Search PubMed for ``query`` and return the matching PubMed IDs.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        max_results: Upper bound on returned IDs, passed as ``retmax``.

    Returns:
        The ``"IdList"`` entry of the parsed search result.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the handle is
        closed before the exception propagates.
    """
    Entrez.email = "zeynep.korkmaz@tum.de"  # Set email address

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails, so it is never leaked
        handle.close()

    return record["IdList"]

In [7]:
# Example query
# NOTE(review): the field tag below is spelled "[Organsim]" — presumably
# "[Organism]" was intended; confirm which tag PubMed actually accepts.
search_pubmed_for_ids("Helicobacter[Organsim] NOT IBD NOT intestinal microbes")

['38179804', '38179545', '38177692', '38176660', '38176008', '38175580', '38173317', '38173305', '38172074', '38171342', '38171101', '38169417', '38167662', '38167394', '38166953', '38165856', '38164932', '38162866', '38162864', '38162071', '38162007', '38161000', '38160708']

### Step 2: Create the keyword and query list from the csv file

In [6]:
def read_keywords_from_csv(csv_file):
    """Parse a keyword/query CSV file into a dictionary keyed by publication title.

    The first cell of each row decides how the row is handled:

    * ``Pub Title`` starts a new entry; the second cell is the title.
    * ``Keywords``, ``SQ_TP``, ``SQ_FP`` and ``SQ_R`` append their remaining
      non-empty cells to the matching list of the current entry.
    * Empty rows, rows whose first cell is purely numeric and rows with any
      other label are ignored.

    Leading/trailing commas and spaces are stripped from every cell. Rows that
    appear before the first title, or below a blank/missing title, are
    discarded. If a title occurs more than once, the last occurrence wins.

    Args:
        csv_file: Path of the CSV file to read (expected to be UTF-8, with or
            without a byte-order mark).

    Returns:
        dict: ``{title: {"Pub Title": title, "Keywords": [...], "SQ_TP": [...],
        "SQ_FP": [...], "SQ_R": [...]}}``.
    """
    list_fields = ("Keywords", "SQ_TP", "SQ_FP", "SQ_R")
    keywords_dict = {}
    current_entry = None  # entry being filled; None until the first title row

    # newline='' is what the csv module expects from its input file, and
    # 'utf-8-sig' reads UTF-8 regardless of the platform's default encoding
    # while dropping a BOM that would otherwise glue itself to the first label.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.reader(file):
            # Remove trailing commas/spaces from each element in the row
            row = [item.strip(', ') for item in row]

            if not row or row[0].isdigit():  # Skip empty and numeric rows
                continue

            label = row[0]
            if label == "Pub Title":
                # A title row without a title cell is treated like a blank title
                title = row[1] if len(row) > 1 else ""
                current_entry = {"Pub Title": title}
                current_entry.update((field, []) for field in list_fields)
                if title:
                    # Register right away; the lists are filled in place below
                    keywords_dict[title] = current_entry
            elif label in list_fields and current_entry is not None:
                # Add the non-empty values to the list of the matching key
                current_entry[label].extend(item for item in row[1:] if item)

    return keywords_dict

##### How the dictionary looks:

In [7]:
# Path to the keywords CSV file
# NOTE(review): absolute, machine-specific path — adjust it before running on another machine.
input_csv = "/Users/tillohlendorf/Documents/MBT/Module/Systems BioMedicine/NLP/TFA_repo/sysbiomed_nlp_project/Keywords/keywords_Till.csv" 

In [8]:
# Build the dictionary (keyed by publication title) from the CSV file
keywords_dict = read_keywords_from_csv(input_csv)

In [9]:
# Show the parsed dictionary as the cell output
keywords_dict

{'Mitochondrial impairment drives intestinal stem cell transition into dysfunctional Paneth cells predicting Crohn’s disease recurrence.': {'Pub Title': 'Mitochondrial impairment drives intestinal stem cell transition into dysfunctional Paneth cells predicting Crohn’s disease recurrence.',
  'Keywords': ['paneth cells',
   "Crohn's disease",
   'LGR5+ intestinal stemm cells',
   'organoids',
   'TNFΔARE mice',
   'ISC morphological appearance',
   'PC morphological appearance',
   'mitochondrial dysfunction',
   'ileal tissue samples',
   'PC granularity',
   'surgical resection',
   'CD recurrence',
   'CD',
   'PC',
   'intestinal stemm cells'],
  'SQ_TP': ["(PC or paneth cells) AND (CD or Crohn's disease) AND ISC morphological appearance",
   "(LGR5+) AND (CD or Crohn's disease)",
   "((PC or paneth cells) granularity) AND (CD or Crohn's disease)",
   "(CD or Crohn's disease) AND mitochondrial dysfunction"],
  'SQ_FP': ["(PC or paneth cells) NOT ((CD or Crohn's disease) OR IBD)",
  

# Display the dictionary as the cell output
keywords_dict

##### Other representation of the dictionary for testing and debugging purposes

In [10]:
# Dump every entry field by field; the list fields are joined with ", "
separator = "\n" + "=" * 80 + "\n"  # Separator between entries
for data in keywords_dict.values():
    print(f"Pub Title: {data['Pub Title']}")
    for field in ("Keywords", "SQ_TP", "SQ_FP", "SQ_R"):
        print(f"{field}: {', '.join(data[field])}")
    print(separator)

Pub Title: Mitochondrial impairment drives intestinal stem cell transition into dysfunctional Paneth cells predicting Crohn’s disease recurrence.
Keywords: paneth cells, Crohn's disease, LGR5+ intestinal stemm cells, organoids, TNFΔARE mice, ISC morphological appearance, PC morphological appearance, mitochondrial dysfunction, ileal tissue samples, PC granularity, surgical resection, CD recurrence, CD, PC, intestinal stemm cells
SQ_TP: (PC or paneth cells) AND (CD or Crohn's disease) AND ISC morphological appearance, (LGR5+) AND (CD or Crohn's disease), ((PC or paneth cells) granularity) AND (CD or Crohn's disease), (CD or Crohn's disease) AND mitochondrial dysfunction
SQ_FP: (PC or paneth cells) NOT ((CD or Crohn's disease) OR IBD), organoids NOT ((CD or Crohn's disease) OR IBD), surgical resection NOT ((CD or Crohn's disease) OR IBD)
SQ_R: ((PC or paneth cells) OR (CD or Crohn's disease)) AND organoids, ((PC or paneth cells) OR (CD or Crohn's disease)) AND TNFΔARE mice, (PC or paneth 

### Step 3: Use the dictionary with queries as input and fetch the PubMed IDs
#### Option 1: This option is more comprehensive since it also saves the used queries in the dictionary.

In [11]:
def dict_to_pubmed_id(input_dict):
    """Run every stored search query and attach the PubMed IDs that were found.

    Args:
        input_dict: Mapping of publication title to a dict holding the query
            lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        dict: For each title, a shallow copy of its input dict extended by
        ``'PubMed_IDs_TP'``, ``'PubMed_IDs_FP'`` and ``'PubMed_IDs_R'``. Each
        is the concatenation, in query order, of the IDs returned by
        ``search_pubmed_for_ids`` for the queries of the matching list.
    """
    # (key of the query list, key that receives the IDs), in search order
    query_to_id_keys = (
        ('SQ_TP', 'PubMed_IDs_TP'),
        ('SQ_FP', 'PubMed_IDs_FP'),
        ('SQ_R', 'PubMed_IDs_R'),
    )

    result_dict = {}
    for pub_title, pub_data in input_dict.items():
        # Work on a copy so the caller's entry does not gain the ID keys
        enriched = pub_data.copy()
        for _, id_key in query_to_id_keys:
            enriched[id_key] = []

        for query_key, id_key in query_to_id_keys:
            for query in pub_data[query_key]:
                enriched[id_key].extend(search_pubmed_for_ids(query))

        result_dict[pub_title] = enriched

    return result_dict


In [12]:
# Run all stored queries; each entry keeps its queries next to the PubMed IDs found
result_dict = dict_to_pubmed_id(keywords_dict)
result_dict

{'Mitochondrial impairment drives intestinal stem cell transition into dysfunctional Paneth cells predicting Crohn’s disease recurrence.': {'Pub Title': 'Mitochondrial impairment drives intestinal stem cell transition into dysfunctional Paneth cells predicting Crohn’s disease recurrence.',
  'Keywords': ['paneth cells',
   "Crohn's disease",
   'LGR5+ intestinal stemm cells',
   'organoids',
   'TNFΔARE mice',
   'ISC morphological appearance',
   'PC morphological appearance',
   'mitochondrial dysfunction',
   'ileal tissue samples',
   'PC granularity',
   'surgical resection',
   'CD recurrence',
   'CD',
   'PC',
   'intestinal stemm cells'],
  'SQ_TP': ["(PC or paneth cells) AND (CD or Crohn's disease) AND ISC morphological appearance",
   "(LGR5+) AND (CD or Crohn's disease)",
   "((PC or paneth cells) granularity) AND (CD or Crohn's disease)",
   "(CD or Crohn's disease) AND mitochondrial dysfunction"],
  'SQ_FP': ["(PC or paneth cells) NOT ((CD or Crohn's disease) OR IBD)",
  

#### Option 2: This option stores only the PubMed IDs in the dictionary, not the queries that were used

def dict_to_pubmed_id_reduced(input_dict):
    """Run every stored search query and return only titles and PubMed IDs.

    Args:
        input_dict: Mapping of publication title to a dict holding the query
            lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        dict: For each title, a dict with ``'Pub Title'`` (the mapping key)
        plus ``'PubMed_IDs_TP'``, ``'PubMed_IDs_FP'`` and ``'PubMed_IDs_R'``.
        Each list is the concatenation, in query order, of the IDs returned
        by ``search_pubmed_for_ids``. The queries themselves are not copied.
    """
    def collect_ids(queries):
        # Concatenate the IDs of all queries, keeping query order
        found = []
        for query in queries:
            found.extend(search_pubmed_for_ids(query))
        return found

    return {
        pub_title: {
            'Pub Title': pub_title,
            'PubMed_IDs_TP': collect_ids(pub_data['SQ_TP']),
            'PubMed_IDs_FP': collect_ids(pub_data['SQ_FP']),
            'PubMed_IDs_R': collect_ids(pub_data['SQ_R']),
        }
        for pub_title, pub_data in input_dict.items()
    }


# Run all stored queries; each entry holds only the title and the PubMed IDs found
result2_dict = dict_to_pubmed_id_reduced(keywords_dict)
result2_dict

### Step 4: Export the dictionaries as XML and JSON files

In [20]:
import json
import xml.etree.ElementTree as ET

In [21]:
# Convert to JSON
json_data = json.dumps(result_dict, indent=2)

# Save to a JSON file
with open("output_ID2.json", "w") as json_file:
    json_file.write(json_data)

In [22]:
# Convert to XML
def save_result_dict_to_xml(result_dict, xml_file_path):
    """Write the titles and PubMed IDs of ``result_dict`` to an XML file.

    The document has a ``result_dict`` root with one ``entry`` per
    publication. Each entry holds a ``PubTitle`` element followed by the
    containers ``PubMed_IDs_TP``, ``PubMed_IDs_FP`` and ``PubMed_IDs_R``,
    each with one ``PubMed_ID`` child per stored ID.

    Args:
        result_dict: Mapping whose values provide ``'Pub Title'`` and the
            three ``'PubMed_IDs_*'`` lists; the mapping keys are not written.
        xml_file_path: Destination passed to ``ElementTree.write``.
    """
    # Container tags double as the keys of the ID lists, in output order
    id_list_tags = ("PubMed_IDs_TP", "PubMed_IDs_FP", "PubMed_IDs_R")

    root = ET.Element("result_dict")
    for pub_result in result_dict.values():
        entry = ET.SubElement(root, "entry")
        title_node = ET.SubElement(entry, "PubTitle")
        title_node.text = pub_result['Pub Title']

        for tag in id_list_tags:
            container = ET.SubElement(entry, tag)
            for pubmed_id in pub_result[tag]:
                id_node = ET.SubElement(container, "PubMed_ID")
                id_node.text = pubmed_id

    # Wrap the root in a tree and save it to the specified file
    ET.ElementTree(root).write(xml_file_path)

In [23]:
# Specify the path for the XML file
xml_file_path = "output.xml"

# Save the result_dict to an XML file
save_result_dict_to_xml(result_dict, xml_file_path)