In [None]:
import requests
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
# query PubMed (NCBI E-utilities ESearch) for the number of matching articles
def get_citation_count(protein_id):
    """Return how many PubMed records match a search term.

    Parameters
    ----------
    protein_id : str
        The PubMed search expression. Despite the name it is sent verbatim as
        the ESearch ``term``, so it may be a full boolean query.

    Returns
    -------
    int or None
        The value of the first ``<Count>`` element in the ESearch response, or
        ``None`` when the request fails or the response has no usable count.
        Failures are reported on stdout instead of being raised so that a long
        batch of lookups is not aborted by one bad request.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Let requests URL-encode the term: interpolating it into the URL by hand
    # breaks the query whenever the term contains '&', '#', '+' and similar.
    params = {
        "db": "pubmed",
        "term": protein_id,
        "retmax": 1,
        "usehistory": "y",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Request failed:", exc)
        return None

    try:
        # The first <Count> in the document is the total for the whole query.
        return int(response.text.split("<Count>")[1].split("</Count>")[0])
    except (IndexError, ValueError):
        # IndexError: no <Count> tag present; ValueError: tag content not an integer.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", response.text)
        return None

In [None]:
# read in target data
genes_raw = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/data/PCSF_results.csv")

# drop rows without a gene symbol before deriving the search term
genes_named = genes_raw.dropna(subset=['external_gene_name'])

# replace any terms with 2 or less characters with description
# (very short symbols are too ambiguous as free-text PubMed queries)
# NOTE(review): if the description column carries a trailing "[Source:...]"
# annotation, the quoted phrase search is unlikely to match anything - confirm
# against the CSV and strip it if so.
search_terms = genes_named['external_gene_name'].where(
    genes_named['external_gene_name'].str.len() > 2,
    genes_named['description'],
)

# A short symbol with no description yields NaN; drop it so that the literal
# string "nan" is never sent to PubMed. De-duplicate, and sort so the query
# order (and therefore the output row order) is the same on every run.
target_genes = sorted(set(search_terms.dropna()))
len(target_genes)

In [None]:
# MeSH headings that every counted article must be indexed under
mesh_terms = ["Neoplasms", "Humans"]

# Append the [MH] field tag to each heading, AND the tagged headings together
# and wrap the whole expression in parentheses so it can be combined with
# further clauses.
mesh_terms_with_mh = [f"{heading}[MH]" for heading in mesh_terms]
mesh_query = "({})".format(" AND ".join(mesh_terms_with_mh))
print(mesh_query)

In [None]:
# get counts
# Accumulate into plain lists and build the DataFrame once at the end:
# concatenating a one-row frame per iteration copies the whole table each time.
gene_ids = []
mesh_counts = []

for gene in tqdm(target_genes):
    # restrict to the MeSH filter and require the gene term as an exact phrase
    MeSH_query = f'{mesh_query} AND "{gene}"'
    gene_ids.append(gene)
    mesh_counts.append(get_citation_count(MeSH_query))

# Nullable Int64 keeps successful counts as integers while failed lookups
# (None) become <NA>, which .isna() still reports as missing.
results = pd.DataFrame({
    'gene_id': gene_ids,
    'MeSH_count': pd.array(mesh_counts, dtype='Int64'),
})


In [None]:
# report how many genes have no count yet (lookup returned nothing)
missing_mask = results['MeSH_count'].isna()
print(f"Number of NaN values: {missing_mask.sum()}")

In [None]:
def update_nan_values(df):
    """Re-query PubMed once for every row whose ``MeSH_count`` is missing.

    The frame is modified in place and the same object is returned. Rows whose
    second lookup also fails stay missing. The remaining number of missing
    values is printed before returning.
    """
    # Snapshot the gene ids of the rows that need a retry before touching df.
    pending = df.loc[df['MeSH_count'].isna(), 'gene_id']

    for row_label, gene in pending.items():
        MeSH_query = f'{mesh_query} AND "{gene}"'
        # write the fresh count straight back into the caller's frame
        df.at[row_label, 'MeSH_count'] = get_citation_count(MeSH_query)

    still_missing = df['MeSH_count'].isna().sum()
    print(f"Number of NaN values after update: {still_missing}")
    return df

# Retry the lookups that failed on the first pass. update_nan_values writes
# into the frame it is given (via df.at) and returns that same frame, so after
# this call `updated_results` and `results` refer to the same DataFrame.
updated_results = update_nan_values(results)

In [None]:
# save the gene / citation-count table as CSV, leaving out the row index
output_csv = "/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/results/PCSF_citation_counts.csv"
updated_results.to_csv(output_csv, index=False)