In [1]:
import requests
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [2]:
# create function to query pubmed
def get_citation_count(protein_id, timeout=30):
    """Return the number of PubMed records matching a search term.

    Sends one ESearch request to NCBI E-utilities and reads the hit count from
    the first ``<Count>`` element of the reply.

    Parameters
    ----------
    protein_id : str
        PubMed search term. Despite the name, callers also pass full query
        strings such as ``"TP53 AND cancer"``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    int or None
        The hit count, or ``None`` when the request fails or the reply does
        not contain a parsable ``<Count>`` element. A message is printed in
        the failure case.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Passing the term through `params` lets requests percent-encode it, so
    # characters such as '&', '#', '+' or brackets cannot corrupt the query string.
    params = {"db": "pubmed", "term": protein_id, "retmax": 1, "usehistory": "y"}

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        # A dropped connection or timeout is reported like any other failed
        # lookup instead of aborting the caller's whole batch.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", error)
        return None

    try:
        return int(response.text.split("<Count>")[1].split("</Count>")[0])
    except (IndexError, ValueError):
        # IndexError: no <Count> tag in the reply; ValueError: tag content is not an integer.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", response.text)
        return None

In [3]:
# read in data from RNA-seq pipeline
protein_df = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/RNA-Seq_pipeline/latest/MOC/results/MOC_PCSF_drugability.csv")

# Unique gene symbols in first-seen order.
# - dropna() keeps rows without a symbol from being searched as the literal text "nan".
# - unique() returns values in order of appearance, so the list (and therefore the
#   row order of the results) is the same on every run, unlike list(set(...)).
protein_list = protein_df['external_gene_name'].dropna().unique().tolist()
len(protein_list)

302

In [1]:
# MeSH headings used to restrict the PubMed search; add one entry per heading
mesh_terms = [
    "Neoplasms",
]

In [2]:
# Build a single parenthesised PubMed clause: tag every heading with [MH],
# then OR the tagged headings together
mesh_terms_with_mh = list(map(lambda heading: heading + "[MH]", mesh_terms))
mesh_query = " OR ".join(mesh_terms_with_mh)
full_query = f"({mesh_query})"
print(full_query)

(Neoplasms[MH])


In [None]:
# get counts
# Count columns in output order. The title-only search is stored under
# 'title_count', the same key that is written for every row below.
count_columns = ['base_count', 'cancer_count', 'MeSH_count', 'title_count', 'ovarian_count']

# Collect one dict per gene and build the DataFrame once after the loop;
# calling pd.concat on every iteration re-copies all earlier rows each time.
records = []

for protein in tqdm(protein_list):
    # get base count for gene term
    base_count = get_citation_count(protein)

    # get count with MeSH term(s)
    MeSH_query_full = f"{protein} AND Human[MH] AND {full_query}"
    MeSH_count = get_citation_count(MeSH_query_full)

    # get count with associated keyword: 'cancer'
    cancer_query = f"{protein} AND cancer"
    cancer_count = get_citation_count(cancer_query)

    # get count for ovarian cancer associated citations
    ovarian_query = f"{protein} AND cancer AND (ovary OR ovarian)"
    ovarian_count = get_citation_count(ovarian_query)

    # search title only
    title_query = f"{protein}[Title] AND cancer"
    title_count = get_citation_count(title_query)

    records.append({'gene_id': protein,
                    'base_count': base_count,
                    'MeSH_count': MeSH_count,
                    'cancer_count': cancer_count,
                    'title_count': title_count,
                    'ovarian_count': ovarian_count})

results = pd.DataFrame(records, columns=['gene_id'] + count_columns)
# Nullable integer dtype: failed requests (None) become <NA> while successful
# counts stay integers rather than being coerced to floats.
results = results.astype({column: 'Int64' for column in count_columns})


In [None]:
# function will check for missing values as a result of failed pubmed requests and re-run them
def update_nan_values(df):
    """Re-run the PubMed query for every missing count in ``df``.

    Each missing cell is refilled using the query that belongs to its column.
    Each cell is retried once; a lookup that fails again stays missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``gene_id`` column; any of the count columns listed in
        ``query_builders`` below may be present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    # Report on the frame that was passed in, not on a global of the same shape.
    print(f"Number of NaN values: {df.isna().sum().sum()}")

    # One query builder per count column, so every column is refilled with its
    # own search rather than falling back to the bare gene-name search.
    # 'title_query' is kept as an alias because result frames may carry the
    # title-search count under that column name.
    query_builders = {
        'base_count': lambda protein: protein,
        'MeSH_count': lambda protein: f"{protein} AND Human[MH] AND {full_query}",
        'cancer_count': lambda protein: f"{protein} AND cancer",
        'ovarian_count': lambda protein: f"{protein} AND cancer AND (ovary OR ovarian)",
        'title_count': lambda protein: f"{protein}[Title] AND cancer",
        'title_query': lambda protein: f"{protein}[Title] AND cancer",
    }
    columns_to_check = [col for col in query_builders if col in df.columns]

    for index, row in df.iterrows():
        protein = row['gene_id']
        if pd.isnull(protein):
            # Without a gene name there is nothing meaningful to search for.
            continue
        for col in columns_to_check:
            if pd.isnull(row[col]):
                df.at[index, col] = get_citation_count(query_builders[col](protein))
    return df

# Retry the failed lookups; the function returns the frame it was given,
# so `results` is updated as well.
updated_results = update_nan_values(df=results)

In [None]:
# Write the citation counts to the pipeline's intermediate folder (no index column)
citation_scores_csv = "/Users/talgalper/Documents/GitHub/PhD-MOC/RNA-Seq_pipeline/latest/MOC/intermediate/citation_scores_3.0.csv"
updated_results.to_csv(citation_scores_csv, index=False)