## Task Force A: Fetching PubMed IDs from queries

In [33]:
from Bio import Entrez
import csv

### Step 1: Define a function that searches PubMed for papers and returns their PubMed IDs

In [34]:
def search_pubmed_for_ids(query, max_results=5):
    """Search PubMed with a query string and return the matching PubMed IDs.

    Args:
        query: Search term passed to Entrez.esearch as ``term``
            (e.g. "ibd AND acid stress").
        max_results: Upper bound on the number of IDs requested, passed to
            Entrez.esearch as ``retmax``.

    Returns:
        The "IdList" entry of the parsed search result.

    Raises:
        Whatever Entrez.esearch or Entrez.read raise; the handle is closed
        before the exception propagates.
    """
    Entrez.email = "zeynep.korkmaz@tum.de"  # Contact address sent with the request

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails, so it is never leaked
        handle.close()

    return record["IdList"]

In [47]:
# Example query: fetch the PubMed IDs for one search term
# (at most 5 IDs, the default max_results of search_pubmed_for_ids)
search_pubmed_for_ids("ibd AND acid stress")

['38110964', '38078699', '38069446', '38068850', '38039189']

### Step 2: Build the keyword and query dictionary from the CSV file

In [36]:
def read_keywords_from_csv(csv_file):
    """Parse the keyword/query CSV into a dictionary keyed by publication title.

    The first cell of each row is its label. A "Pub Title" row starts a new
    entry whose key is the second cell; the following "Keywords", "SQ_TP",
    "SQ_FP" and "SQ_R" rows append their non-empty cells to that entry.
    Rows whose first cell is purely numeric, rows with any other label, and
    rows that appear before the first titled entry are ignored.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        dict mapping each publication title to
        {"Pub Title": str, "Keywords": list, "SQ_TP": list, "SQ_FP": list,
        "SQ_R": list}. If a title occurs more than once, the last block wins.
    """
    list_fields = ("Keywords", "SQ_TP", "SQ_FP", "SQ_R")

    keywords_dict = {}
    # Entry currently being filled; None while no titled block is open
    current_entry = None

    # newline='' lets the csv module do its own newline handling
    with open(csv_file, 'r', newline='') as file:
        for row in csv.reader(file):
            # Remove leading/trailing commas and spaces from each cell
            row = [item.strip(', ') for item in row]

            if not row or row[0].isdigit():  # Skip blank and numeric rows
                continue

            label = row[0]
            if label == "Pub Title":
                # A title row without a value cell must not crash the parse;
                # it opens an untitled block whose rows are discarded.
                title = row[1] if len(row) > 1 else ""
                if title:
                    current_entry = {"Pub Title": title}
                    current_entry.update((field, []) for field in list_fields)
                    keywords_dict[title] = current_entry
                else:
                    current_entry = None
            elif label in list_fields and current_entry is not None:
                # Append the values of this row, dropping empty cells
                current_entry[label].extend(item for item in row[1:] if item)

    return keywords_dict

##### What the dictionary looks like:

In [45]:
# Path of the input CSV file that read_keywords_from_csv parses
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere
input_csv = "/Users/zeynepkorkmaz/Downloads/keywords.csv" 

In [46]:
# Parse the CSV into a dictionary keyed by publication title/DOI
keywords_dict = read_keywords_from_csv(input_csv)

In [39]:
# Display the parsed dictionary (the cell's last expression is rendered as output)
keywords_dict

{'https://dx.doi.org/10.1016/j.jhep.2022.12.028': {'Pub Title': 'https://dx.doi.org/10.1016/j.jhep.2022.12.028',
  'Keywords': ['keyword1', 'keyword2', 'keyword3'],
  'SQ_TP': ['ibd AND acid stress', 'E.coli AND ibd'],
  'SQ_FP': ['acid resistance NOT ibd', 'E.coli NOT ibd'],
  'SQ_R': ['inflammatory AND acid stress', 'E.coli AND inflammatory']},
 'https://dx.doi.org/10.1016/j.chom.2022.09.011': {'Pub Title': 'https://dx.doi.org/10.1016/j.chom.2022.09.011',
  'Keywords': [],
  'SQ_TP': ['ibd AND acid stress', 'E.coli AND ibd'],
  'SQ_FP': ['cripspr AND ibd NOT microbiome',
   'in vivo screening AND ibd NOT microbiome',
   'mouse AND ibd NOT microbiome'],
  'SQ_R': ['inflammatory AND acid stress', 'E.coli AND inflammatory']},
 'https://dx.doi.org/10.1080/19490976.2022.2107386': {'Pub Title': 'https://dx.doi.org/10.1080/19490976.2022.2107386',
  'Keywords': ['keyword1', 'keyword2'],
  'SQ_TP': [],
  'SQ_FP': ['Glycolipids AND ibd AND microbiome',
   'a-Gal syndrome  AND ibd AND microbiom

##### Alternative representation of the dictionary for testing and debugging

In [40]:
# Plain-text dump of every entry: one labelled line per field, then a separator
for pub_title, data in keywords_dict.items():
    entry_lines = [
        f"Pub Title: {data['Pub Title']}",
        f"Keywords: {', '.join(data['Keywords'])}",
        f"SQ_TP: {', '.join(data['SQ_TP'])}",
        f"SQ_FP: {', '.join(data['SQ_FP'])}",
        f"SQ_R: {', '.join(data['SQ_R'])}",
        "\n" + "=" * 80 + "\n",  # Separator between entries
    ]
    # A single print of the joined lines emits the same text as one print per line
    print("\n".join(entry_lines))

Pub Title: https://dx.doi.org/10.1016/j.jhep.2022.12.028
Keywords: keyword1, keyword2, keyword3
SQ_TP: ibd AND acid stress, E.coli AND ibd
SQ_FP: acid resistance NOT ibd, E.coli NOT ibd
SQ_R: inflammatory AND acid stress, E.coli AND inflammatory


Pub Title: https://dx.doi.org/10.1016/j.chom.2022.09.011
Keywords: 
SQ_TP: ibd AND acid stress, E.coli AND ibd
SQ_FP: cripspr AND ibd NOT microbiome, in vivo screening AND ibd NOT microbiome, mouse AND ibd NOT microbiome
SQ_R: inflammatory AND acid stress, E.coli AND inflammatory


Pub Title: https://dx.doi.org/10.1080/19490976.2022.2107386
Keywords: keyword1, keyword2
SQ_TP: 
SQ_FP: Glycolipids AND ibd AND microbiome, a-Gal syndrome  AND ibd AND microbiome, red meat allergy  AND ibd AND microbiome
SQ_R: glycolipids AND gastrointestinal, alpha-gal AND gastrointestinal, red meat AND gastrointestinal




### Step 3: Use the dictionary of queries as input and fetch PubMed IDs
#### Option 1: This option is more comprehensive because it also keeps the queries used in the resulting dictionary.

In [41]:

def dict_to_pubmed_id(input_dict, max_results=5):
    """Run every stored search query against PubMed and attach the found IDs.

    Args:
        input_dict: Mapping of publication title/DOI to an entry dict holding
            the query lists "SQ_TP", "SQ_FP" and "SQ_R".
        max_results: Maximum number of IDs requested per query; forwarded to
            search_pubmed_for_ids. The default of 5 matches that function's
            own default, so existing calls behave as before.

    Returns:
        A new dict with the same keys. Each value is a copy of the input entry
        extended by "PubMed_IDs_TP", "PubMed_IDs_FP" and "PubMed_IDs_R", each
        the concatenation of the IDs returned for the corresponding queries
        (IDs returned by several queries appear several times).

    Raises:
        KeyError: If an entry lacks one of the three query lists.
    """
    # Query list in the entry -> key under which its PubMed IDs are stored
    id_key_by_query_key = {
        'SQ_TP': 'PubMed_IDs_TP',
        'SQ_FP': 'PubMed_IDs_FP',
        'SQ_R': 'PubMed_IDs_R',
    }

    result_dict = {}

    for pub_title, pub_data in input_dict.items():
        # Copy the entry including its lists, so that editing the result can
        # never modify input_dict through a shared list object
        pub_result = {
            key: list(value) if isinstance(value, list) else value
            for key, value in pub_data.items()
        }

        # Start every ID list empty, also for entries without queries
        for id_key in id_key_by_query_key.values():
            pub_result[id_key] = []

        # Collect the IDs of all queries, one query category after the other
        for query_key, id_key in id_key_by_query_key.items():
            for query in pub_data[query_key]:
                pub_result[id_key].extend(
                    search_pubmed_for_ids(query, max_results=max_results)
                )

        result_dict[pub_title] = pub_result

    return result_dict


In [42]:
# Run all stored queries against PubMed and display the entries with their IDs added
dict_to_pubmed_id(keywords_dict)

{'https://dx.doi.org/10.1016/j.jhep.2022.12.028': {'Pub Title': 'https://dx.doi.org/10.1016/j.jhep.2022.12.028',
  'Keywords': ['keyword1', 'keyword2', 'keyword3'],
  'SQ_TP': ['ibd AND acid stress', 'E.coli AND ibd'],
  'SQ_FP': ['acid resistance NOT ibd', 'E.coli NOT ibd'],
  'SQ_R': ['inflammatory AND acid stress', 'E.coli AND inflammatory'],
  'PubMed_IDs_TP': ['38110964',
   '38078699',
   '38069446',
   '38068850',
   '38039189',
   '38094217',
   '38078655',
   '38059748',
   '38038481',
   '38038385'],
  'PubMed_IDs_FP': ['38112098',
   '38111913',
   '38111646',
   '38111637',
   '38111307',
   '38111791',
   '38111644',
   '38111640',
   '38111598',
   '38111572'],
  'PubMed_IDs_R': ['38111669',
   '38110964',
   '38110193',
   '38109473',
   '38107532',
   '38111640',
   '38109839',
   '38108386',
   '38106073',
   '38104079']},
 'https://dx.doi.org/10.1016/j.chom.2022.09.011': {'Pub Title': 'https://dx.doi.org/10.1016/j.chom.2022.09.011',
  'Keywords': [],
  'SQ_TP': ['ibd 

#### Option 2: This option stores only the PubMed IDs in the dictionary, not the queries used to obtain them

In [43]:
def dict_to_pubmed_id_reduced(input_dict, max_results=5):
    """Run every stored search query against PubMed and return only the IDs.

    Args:
        input_dict: Mapping of publication title/DOI to an entry dict holding
            the query lists "SQ_TP", "SQ_FP" and "SQ_R".
        max_results: Maximum number of IDs requested per query; forwarded to
            search_pubmed_for_ids. The default of 5 matches that function's
            own default, so existing calls behave as before.

    Returns:
        A new dict with the same keys. Each value holds "Pub Title" (the key
        of the entry) plus "PubMed_IDs_TP", "PubMed_IDs_FP" and
        "PubMed_IDs_R", each the concatenation of the IDs returned for the
        corresponding queries. Keywords and queries are not carried over.

    Raises:
        KeyError: If an entry lacks one of the three query lists.
    """
    # Query list in the entry -> key under which its PubMed IDs are stored
    id_key_by_query_key = {
        'SQ_TP': 'PubMed_IDs_TP',
        'SQ_FP': 'PubMed_IDs_FP',
        'SQ_R': 'PubMed_IDs_R',
    }

    result_dict = {}

    for pub_title, pub_data in input_dict.items():
        # Only the title and the ID lists are kept in the reduced result
        pub_result = {'Pub Title': pub_title}

        # Start every ID list empty, also for entries without queries
        for id_key in id_key_by_query_key.values():
            pub_result[id_key] = []

        # Collect the IDs of all queries, one query category after the other
        for query_key, id_key in id_key_by_query_key.items():
            for query in pub_data[query_key]:
                pub_result[id_key].extend(
                    search_pubmed_for_ids(query, max_results=max_results)
                )

        result_dict[pub_title] = pub_result

    return result_dict


In [44]:
# Run all stored queries against PubMed and display only the titles and the IDs found
dict_to_pubmed_id_reduced(keywords_dict)

{'https://dx.doi.org/10.1016/j.jhep.2022.12.028': {'Pub Title': 'https://dx.doi.org/10.1016/j.jhep.2022.12.028',
  'PubMed_IDs_TP': ['38110964',
   '38078699',
   '38069446',
   '38068850',
   '38039189',
   '38094217',
   '38078655',
   '38059748',
   '38038481',
   '38038385'],
  'PubMed_IDs_FP': ['38112098',
   '38111913',
   '38111646',
   '38111637',
   '38111307',
   '38111791',
   '38111644',
   '38111640',
   '38111598',
   '38111572'],
  'PubMed_IDs_R': ['38111669',
   '38110964',
   '38110193',
   '38109473',
   '38107532',
   '38111640',
   '38109839',
   '38108386',
   '38106073',
   '38104079']},
 'https://dx.doi.org/10.1016/j.chom.2022.09.011': {'Pub Title': 'https://dx.doi.org/10.1016/j.chom.2022.09.011',
  'PubMed_IDs_TP': ['38110964',
   '38078699',
   '38069446',
   '38068850',
   '38039189',
   '38094217',
   '38078655',
   '38059748',
   '38038481',
   '38038385'],
  'PubMed_IDs_FP': ['38110791',
   '38049007',
   '37990349',
   '37989058',
   '37976806',
   '381109