In [15]:
from Bio import Entrez
import csv
import os
import xml.etree.ElementTree as ET
import copy

In [16]:
# Function that searches for papers using our SQs and returns a list of PubMed IDs
def search_pubmed_for_ids(query, max_results=15):
    """Run an ESearch against the ``pubmed`` database and return the hit IDs.

    Args:
        query: Search term, forwarded unchanged as the ESearch ``term``.
        max_results: Forwarded as the ESearch ``retmax`` argument.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the response
        handle is closed before the exception propagates.
    """
    Entrez.email = "sabrina.klotz@tum.de"  # Set email address

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the response even when parsing fails, so a bad reply
        # does not leak the underlying connection.
        handle.close()

    return record["IdList"]

In [33]:
def search_pmc_for_ids(query, max_results=15):
    """Run an ESearch against the ``pmc`` database and return the hit IDs.

    Args:
        query: Search term, forwarded unchanged as the ESearch ``term``.
        max_results: Forwarded as the ESearch ``retmax`` argument.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the response
        handle is closed before the exception propagates.
    """
    Entrez.email = "sabrina.klotz@tum.de"

    handle = Entrez.esearch(db="pmc", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the response even when parsing fails, so a bad reply
        # does not leak the underlying connection.
        handle.close()

    # Return the list of PMCIDs
    return record["IdList"]

In [25]:
def read_keywords_from_csv(csv_file, encoding=None):
    """Parse a label-per-row CSV into one dictionary entry per publication.

    Each row is identified by its first cell:

    * ``Pub Title`` starts a new publication; the second cell is its title.
    * ``Keywords``, ``SQ_TP``, ``SQ_FP`` and ``SQ_R`` append their remaining
      non-empty cells to the matching list of the current publication.
    * Empty rows, rows whose first cell is all digits, and rows with any
      other label are ignored.

    Every cell is stripped of leading/trailing commas and spaces before it
    is interpreted. Rows that appear before the first titled publication, or
    after a ``Pub Title`` row whose title is missing or empty, are discarded.
    If a title occurs more than once, the last occurrence wins.

    Args:
        csv_file: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        dict mapping each publication title to
        ``{"Pub Title", "Keywords", "SQ_TP", "SQ_FP", "SQ_R"}``, where all
        values except ``"Pub Title"`` are lists of strings.
    """
    list_fields = ("Keywords", "SQ_TP", "SQ_FP", "SQ_R")
    keywords_dict = {}
    current_entry = None  # entry being filled; None while no titled publication is open

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files handed to csv.reader.
    with open(csv_file, 'r', newline='', encoding=encoding) as file:
        for row in csv.reader(file):
            # Remove stray commas/spaces around each cell
            row = [item.strip(', ') for item in row]

            if not row or row[0].isdigit():  # Skip empty and numeric rows
                continue

            label = row[0]
            if label == "Pub Title":
                # A title row without a title cell used to raise IndexError;
                # treat it like an empty title (following rows are dropped).
                title = row[1] if len(row) > 1 else ""
                if title:
                    current_entry = {"Pub Title": title}
                    for field in list_fields:
                        current_entry[field] = []
                    keywords_dict[title] = current_entry
                else:
                    current_entry = None
            elif label in list_fields and current_entry is not None:
                current_entry[label].extend(item for item in row[1:] if item)

    return keywords_dict

In [26]:
# Function that takes keyword_dict/input_dict and returns dict with list of PubMed IDs based on SQs
def dict_to_pubmed_id(input_dict):
    """Extend every publication entry with PubMed IDs found for its queries.

    Args:
        input_dict: Mapping of publication title to an entry dict holding the
            query lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        A new dict with the same titles. Each value is a shallow copy of the
        input entry plus ``'PubMed_IDs_TP'``, ``'PubMed_IDs_FP'`` and
        ``'PubMed_IDs_R'``: the concatenated results of
        ``search_pubmed_for_ids`` for each query of the matching list, in
        query order. The input entries themselves are not modified.
    """
    # (query list key, result list key), processed in this order
    query_to_id_key = (
        ('SQ_TP', 'PubMed_IDs_TP'),
        ('SQ_FP', 'PubMed_IDs_FP'),
        ('SQ_R', 'PubMed_IDs_R'),
    )

    enriched = {}
    for title, entry in input_dict.items():
        record = entry.copy()
        for query_key, id_key in query_to_id_key:
            record[id_key] = [
                found_id
                for query in entry[query_key]
                for found_id in search_pubmed_for_ids(query)
            ]
        enriched[title] = record

    return enriched

In [27]:
# Function that takes keyword_dict/input_dict and returns dict with list of PMC IDs based on SQs
def dict_to_pmc_id(input_dict):
    """Extend every publication entry with PMC IDs found for its queries.

    Args:
        input_dict: Mapping of publication title to an entry dict holding the
            query lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        A new dict with the same titles. Each value is a shallow copy of the
        input entry plus ``'PMC_IDs_TP'``, ``'PMC_IDs_FP'`` and
        ``'PMC_IDs_R'``: the concatenated results of ``search_pmc_for_ids``
        for each query of the matching list, in query order. The input
        entries themselves are not modified.
    """

    def collect_ids(queries):
        # One PMC search per query; hits are appended in query order.
        found = []
        for query in queries:
            found.extend(search_pmc_for_ids(query))
        return found

    enriched = {}
    for title, entry in input_dict.items():
        record = entry.copy()
        record['PMC_IDs_TP'] = collect_ids(entry['SQ_TP'])
        record['PMC_IDs_FP'] = collect_ids(entry['SQ_FP'])
        record['PMC_IDs_R'] = collect_ids(entry['SQ_R'])
        enriched[title] = record

    return enriched

In [28]:
def dict_to_pubmed_id_reduced(input_dict):
    """Build a title-and-IDs-only view of the PubMed search results.

    Args:
        input_dict: Mapping of publication title to an entry dict holding the
            query lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        dict mapping each title to ``{'Pub Title', 'PubMed_IDs_TP',
        'PubMed_IDs_FP', 'PubMed_IDs_R'}``. ``'Pub Title'`` is the mapping
        key; each ID list concatenates the ``search_pubmed_for_ids`` results
        for the queries of the matching list, in query order. Keywords and
        the queries themselves are not carried over.
    """

    def ids_for(queries):
        # Flatten the per-query hit lists into a single list.
        return [
            found_id
            for query in queries
            for found_id in search_pubmed_for_ids(query)
        ]

    return {
        title: {
            'Pub Title': title,
            'PubMed_IDs_TP': ids_for(entry['SQ_TP']),
            'PubMed_IDs_FP': ids_for(entry['SQ_FP']),
            'PubMed_IDs_R': ids_for(entry['SQ_R']),
        }
        for title, entry in input_dict.items()
    }

In [29]:
def dict_to_pmc_id_reduced(input_dict):
    """Build a title-and-IDs-only view of the PMC search results.

    Args:
        input_dict: Mapping of publication title to an entry dict holding the
            query lists ``'SQ_TP'``, ``'SQ_FP'`` and ``'SQ_R'``.

    Returns:
        dict mapping each title to ``{'Pub Title', 'PMC_IDs_TP',
        'PMC_IDs_FP', 'PMC_IDs_R'}``. ``'Pub Title'`` is the mapping key;
        each ID list concatenates the ``search_pmc_for_ids`` results for the
        queries of the matching list, in query order.
    """
    # (query list key, result list key), processed in this order
    query_to_id_key = (
        ('SQ_TP', 'PMC_IDs_TP'),
        ('SQ_FP', 'PMC_IDs_FP'),
        ('SQ_R', 'PMC_IDs_R'),
    )

    result_dict = {}

    for pub_title, pub_data in input_dict.items():
        pub_result = {'Pub Title': pub_title}

        for query_key, id_key in query_to_id_key:
            pub_result[id_key] = []
            for query in pub_data[query_key]:
                # All three lists go through search_pmc_for_ids; the SQ_R
                # branch previously called a misspelled, undefined name and
                # raised NameError as soon as an entry had an SQ_R query.
                pub_result[id_key].extend(search_pmc_for_ids(query))

        result_dict[pub_title] = pub_result

    return result_dict

In [39]:
def save_pubmed_dict_to_csv(input_dict, output_csv):
    """Write PubMed search results to a UTF-8 CSV file, one row per publication.

    Columns: ``Pub Title``, ``Keywords``, ``SQ_TP``, ``SQ_FP``, ``SQ_R``,
    ``PubMed_IDs_TP``, ``PubMed_IDs_FP``, ``PubMed_IDs_R``. Every list-valued
    column is flattened into a single cell by joining its items with ``', '``.

    Entries that lack some of the list columns (for example the output of
    ``dict_to_pubmed_id_reduced``) are written with empty cells for them
    instead of raising ``KeyError``; a missing ``'Pub Title'`` falls back to
    the dictionary key.

    Args:
        input_dict: Mapping of publication title to its entry dict.
        output_csv: Path of the CSV file to create or overwrite.
    """
    list_columns = ['Keywords', 'SQ_TP', 'SQ_FP', 'SQ_R', 'PubMed_IDs_TP', 'PubMed_IDs_FP', 'PubMed_IDs_R']
    fieldnames = ['Pub Title'] + list_columns

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for pub_title, data in input_dict.items():
            row = {'Pub Title': data.get('Pub Title', pub_title)}
            for column in list_columns:
                # str() keeps the join working should an ID list hold non-str items.
                row[column] = ', '.join(str(item) for item in data.get(column, []))
            writer.writerow(row)

In [40]:
def save_pmc_dict_to_csv(input_dict, output_csv):
    """Write PMC search results to a UTF-8 CSV file, one row per publication.

    Columns: ``Pub Title``, ``Keywords``, ``SQ_TP``, ``SQ_FP``, ``SQ_R``,
    ``PMC_IDs_TP``, ``PMC_IDs_FP``, ``PMC_IDs_R``. Every list-valued column
    is flattened into a single cell by joining its items with ``', '``.

    Entries that lack some of the list columns (for example the output of
    ``dict_to_pmc_id_reduced``) are written with empty cells for them instead
    of raising ``KeyError``; a missing ``'Pub Title'`` falls back to the
    dictionary key.

    Args:
        input_dict: Mapping of publication title to its entry dict.
        output_csv: Path of the CSV file to create or overwrite.
    """
    list_columns = ['Keywords', 'SQ_TP', 'SQ_FP', 'SQ_R', 'PMC_IDs_TP', 'PMC_IDs_FP', 'PMC_IDs_R']
    fieldnames = ['Pub Title'] + list_columns

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for pub_title, data in input_dict.items():
            row = {'Pub Title': data.get('Pub Title', pub_title)}
            for column in list_columns:
                # str() keeps the join working should an ID list hold non-str items.
                row[column] = ', '.join(str(item) for item in data.get(column, []))
            writer.writerow(row)

In [37]:
# Example usage: end-to-end run of the pipeline defined in the cells above
# (read the query CSV -> search PubMed and PMC -> write one CSV per database).

# Parse the keyword/search-query CSV; the relative path is resolved against
# the current working directory.
csv_input_file = 'Sabrina.csv'
keywords_dict = read_keywords_from_csv(csv_input_file)

# Fetch PubMed IDs for every search query (SQ_TP / SQ_FP / SQ_R) of each
# publication; this issues one Entrez search per query.
pubmed_ids_dict = dict_to_pubmed_id(keywords_dict)

# Same queries again, this time against the PMC database.
pmc_ids_dict = dict_to_pmc_id(keywords_dict)

# Save the PubMed results (PMIDs) to their own CSV file.
output_csv = "PMID_Sabrina.csv"
save_pubmed_dict_to_csv(pubmed_ids_dict, output_csv)

# Save the PMC results (PMCIDs) to a second CSV file.
# NOTE: `output_csv` is rebound here, so after this cell it names the PMC file.
output_csv = "PMC_Sabrina.csv"
save_pmc_dict_to_csv(pmc_ids_dict, output_csv)