In [None]:
import requests
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
# create function to query pubmed
def get_citation_count(protein_id, timeout=30):
    """Return the number of PubMed records matching a search term.

    Sends one ESearch request to NCBI E-utilities and reads the first
    ``<Count>`` element out of the XML response.

    Parameters
    ----------
    protein_id : str
        The PubMed search term. Despite the name, callers in this notebook
        pass a full boolean query (MeSH filter AND quoted synonyms).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the calling loop indefinitely.

    Returns
    -------
    int or None
        The record count, or ``None`` when the request fails or the response
        does not contain a parsable ``<Count>``. Failures are printed, not
        raised, so a batch run can continue and retry the gaps afterwards.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Let requests percent-encode the term: characters such as '&' or '#'
    # inside a synonym would otherwise end the `term=` parameter early.
    params = {
        "db": "pubmed",
        "term": protein_id,
        "retmax": 1,
        "usehistory": "y",
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure: report it the same way as a bad response.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", exc)
        return None

    try:
        return int(response.text.split("<Count>")[1].split("</Count>")[0])
    except (IndexError, ValueError):
        # IndexError: no <Count> tag (e.g. an error payload);
        # ValueError: the tag content is not an integer.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", response.text)
        return None

In [None]:
# read in synonym data and keep only rows with no missing values
# (the former null-row selection was computed and thrown away: it was not the
# last expression of the cell, so it was never displayed)
gene_syno = pd.read_csv(
    "/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/data/human_syno.csv"
).dropna()

In [None]:
def collect_synonyms(group):
    """Build a quoted, OR-joined search clause for one reference term.

    Written for ``groupby('ref_term').apply(...)``: ``group`` is the sub-frame
    of one group and ``group.name`` is that group's key.

    Parameters
    ----------
    group : pandas.DataFrame
        Needs a ``'synonym'`` column and a ``name`` attribute holding the
        reference term.

    Returns
    -------
    str
        The reference term and its synonyms, each wrapped in double quotes,
        joined with ``' OR '`` and enclosed in parentheses. Missing synonyms
        are skipped and repeated terms appear only once.
    """
    # Cast the key to str just like the synonyms, so the membership test
    # compares like with like and the quoting below cannot hit a TypeError.
    ref_term = str(group.name)
    synonyms = group['synonym'].dropna().astype(str).tolist()
    # Put the reference term first unless it is already listed as a synonym
    if ref_term not in synonyms:
        synonyms.insert(0, ref_term)
    # dict.fromkeys drops repeats while keeping first-seen order, so the
    # query is not padded with redundant OR operands.
    unique_terms = list(dict.fromkeys(synonyms))
    quoted_terms = [f'"{term}"' for term in unique_terms]
    return '(' + ' OR '.join(quoted_terms) + ')'

# One row per ref_term, carrying the search string built by collect_synonyms
df_grouped = (
    gene_syno
    .groupby('ref_term')
    .apply(collect_synonyms)
    .reset_index(name='search_string')
)


In [None]:
# read in target data: unique, non-missing gene names as a plain list
target_genes = list(set(
    pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/data/df_subnetNeighs.csv")
    ['external_gene_name']
    .dropna()
    .tolist()
))
len(target_genes)

In [None]:
# match target genes to ref_term & synonym (if not in ref term)
#
# gene_syno can hold several rows for the same ref_term (collect_synonyms
# gathers them per group), so both lookups are de-duplicated before merging.
# Otherwise every matched gene is repeated once per synonym row.

# Step 1: genes whose name is itself a reference term
merged_ref = pd.DataFrame({'gene': target_genes}).merge(
    gene_syno[['ref_term']].drop_duplicates(),
    left_on='gene',
    right_on='ref_term',
    how='left'
)

# Step 2: genes with no direct hit are looked up among the synonyms instead
unmatched_ref = merged_ref.loc[merged_ref['ref_term'].isnull(), 'gene']

merged_syn = unmatched_ref.to_frame(name='gene').merge(
    gene_syno[['ref_term', 'synonym']].drop_duplicates(),
    left_on='gene',
    right_on='synonym',
    how='left'
)

# Stack both lookups; ref_term stays NaN for genes matched by neither step.
# ignore_index gives the stacked frame unique row labels.
combined = pd.concat(
    [
        merged_ref[['gene', 'ref_term']],
        merged_syn[['gene', 'ref_term']],
    ],
    ignore_index=True
)

In [None]:
# keep only the search strings whose ref_term was matched to a target gene
target_geneSyno = df_grouped.loc[
    df_grouped['ref_term'].isin(combined['ref_term'])
]

In [None]:
# select MeSH terms to filter search
mesh_terms = ["Neoplasms", "Humans"]

# Tag every term with the [MH] field qualifier, then AND them together
# inside one pair of parentheses
mesh_terms_with_mh = list(map("{}[MH]".format, mesh_terms))
mesh_query = f"({' AND '.join(mesh_terms_with_mh)})"
print(mesh_query)

In [None]:
results_list = []

# One PubMed query per reference gene: the MeSH filter ANDed with the gene's
# synonym clause. A failed query is recorded with a count of None.
for record in tqdm(target_geneSyno.itertuples(index=False), total=len(target_geneSyno)):
    MeSH_query = f"{mesh_query} AND {record.search_string}"
    results_list.append({
        'gene_id': record.ref_term,
        'search_string': record.search_string,
        'MeSH_count': get_citation_count(MeSH_query),
    })

In [None]:
# Turn the collected records into a table and report how many queries
# came back without a count
results = pd.DataFrame(data=results_list)
print(f"Number of NaN values: {results['MeSH_count'].isnull().sum()}")

In [None]:
def update_nan_values(df, mesh_query):
    """Re-query PubMed once for every row whose 'MeSH_count' is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'search_string' and 'MeSH_count' columns. It is modified in
        place.
    mesh_query : str
        The MeSH filter clause that is ANDed in front of each search string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the retry pass. Rows whose retry fails
        again remain missing; the remaining count is printed.
    """
    # Snapshot the rows to retry before writing anything back into df
    pending = list(df.loc[df['MeSH_count'].isna(), 'search_string'].items())

    for row_label, search_string in pending:
        retry_query = f"{mesh_query} AND {search_string}"
        df.at[row_label, 'MeSH_count'] = get_citation_count(retry_query)

    remaining = df['MeSH_count'].isna().sum()
    print(f"Number of NaN values after update: {remaining}")
    return df

# Retry the failed queries once; `results` itself is updated and returned
updated_results = update_nan_values(df=results, mesh_query=mesh_query)


In [None]:
# Save the citation counts (the row index is written as the first column)
updated_results.to_csv(
    path_or_buf="/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/results/HHnetNeighs_citation_counts.csv"
)