In [None]:
import requests
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import words

In [None]:
# create function to query pubmed
def get_citation_count(protein_id, timeout=30):
    """Return the number of PubMed records matching a search term.

    Parameters
    ----------
    protein_id : str
        The PubMed search term. Despite the name, callers in this notebook
        pass complete query strings (field tags, quotes, boolean operators).
    timeout : float, optional
        Seconds to wait for the E-utilities server before giving up.

    Returns
    -------
    int or None
        The value of the first ``<Count>`` element in the ESearch response,
        or ``None`` when the request fails or no count can be parsed.
    """
    # Let requests build the query string so that characters such as
    # '&', '#', '+', quotes and spaces inside the term are percent-encoded
    # instead of being interpreted as part of the URL.
    params = {"db": "pubmed", "term": protein_id, "retmax": 1, "usehistory": "y"}

    try:
        response = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and return None so one transient network/HTTP
        # error does not abort a long loop; None marks the row for a retry.
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Request failed:", exc)
        return None

    try:
        # The first <Count> element of the response is taken as the total.
        return int(response.text.split("<Count>")[1].split("</Count>")[0])
    except (IndexError, ValueError):
        print(f"Error: Unable to retrieve citation count for protein {protein_id}")
        print("Response:", response.text)
        return None

In [None]:
# read in target data
target_genes = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/data/PCSF_results.csv")

# drop rows without a gene name before any search term is derived from them
target_genes = target_genes.dropna(subset=['external_gene_name'])

# Gene names of 2 or fewer characters are replaced by their description.
# The description is only used when it is present: a missing description
# would otherwise become a NaN search term, which the query f-strings
# further down format as the literal text "nan".
gene_names = target_genes['external_gene_name']
use_description = (gene_names.str.len() <= 2) & target_genes['description'].notna()
search_terms = gene_names.where(~use_description, target_genes['description'])

# if a search term contains a "," keep only the text before the first ","
target_genes = target_genes.assign(PubMed_search=search_terms.str.split(",").str[0])

In [None]:
# select MeSh terms to filter search
mesh_terms = ["Neoplasms", "Humans"]

# Tag each term with the [MH] field, AND the tagged terms together and
# wrap the result in parentheses so it can be appended to a longer query
mesh_terms_with_mh = list(map(lambda heading: heading + "[MH]", mesh_terms))
mesh_query = '(' + " AND ".join(mesh_terms_with_mh) + ')'
print(mesh_query)

In [None]:
# get counts
# Collect one record per gene and build the DataFrame once after the loop;
# concatenating onto the DataFrame inside the loop copies it on every
# iteration (quadratic work).
records = []

for index, row in tqdm(target_genes.iterrows(), total=len(target_genes)):
    gene = row['PubMed_search']
    # Quote the gene term and leave the MeSH filter unquoted, the same query
    # form the retry and follow-up passes in this notebook use. Wrapping the
    # whole parenthesised MeSH expression in double quotes turns it into a
    # phrase instead of a boolean filter.
    MeSH_query = f'"{gene}"[TIAB] AND {mesh_query}'
    records.append({
        'external_gene_name': row['external_gene_name'],
        'description': row['description'],
        'search_id': gene,
        'MeSH_count': get_citation_count(MeSH_query),  # None when the lookup failed
    })

# columns= keeps the expected layout even if there were no genes to search
results = pd.DataFrame(records, columns=['external_gene_name', 'description', 'search_id', 'MeSH_count'])

In [None]:
# report how many genes have no citation count after the first pass
missing_count = results['MeSH_count'].isna().sum()
print(f"Number of NaN values: {missing_count}")

In [None]:

# Check for missing entries and update score
def update_nan_values(df):
    """Re-query PubMed for every row whose ``MeSH_count`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``search_id`` and ``MeSH_count``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is updated in place.
    """
    missing_rows = df[df['MeSH_count'].isna()]

    # total is the number of rows actually retried, so the progress bar
    # reaches 100% instead of stalling at (missing / all rows).
    for index, row in tqdm(missing_rows.iterrows(), total=len(missing_rows)):
        gene = row['search_id']
        MeSH_query = f'"{gene}"[TIAB] AND {mesh_query}'
        new_value = get_citation_count(MeSH_query)

        # A failed retry returns None; leave the cell missing in that case
        if new_value is not None:
            df.at[index, 'MeSH_count'] = new_value

    print(f"Number of NaN values after update: {df['MeSH_count'].isna().sum()}")
    return df

# Use the function to update NaN values
updated_results = update_nan_values(results)
# Nullable Int64 keeps rows whose retry also failed as <NA>; a plain int cast
# raises as soon as a single missing value remains. This also matches the
# dtype used for the other count columns later in the notebook.
updated_results['MeSH_count'] = updated_results['MeSH_count'].astype('Int64')

In [None]:
# save checkpoint
# Take a real copy: a plain assignment only creates a second name for the
# same DataFrame, so the columns added to results_TIAB in later cells would
# also appear in updated_results and the checkpoint could not be restored.
results_TIAB = updated_results.copy()

In [None]:
    
# Function to check if searched gene is an English word
# Build an upper-cased lookup set from the NLTK "words" corpus.
try:
    english_words = set(words.words())
except LookupError:
    # The corpus is not installed in this environment: fetch it once and retry
    nltk.download('words', quiet=True)
    english_words = set(words.words())
english_words = {word.upper() for word in english_words}

def is_english_word(word):
    """Return True if ``word`` is a string whose upper-cased form is in ``english_words``.

    Non-string values (for example NaN or None) always give False.
    """
    return isinstance(word, str) and word.upper() in english_words
    
# Search terms that are also English words. is_english_word() is used for the
# count as well as the warnings, so a non-string search_id (e.g. NaN) is
# skipped instead of raising AttributeError on .upper().
english_word_genes = [gene for gene in results_TIAB['search_id'] if is_english_word(gene)]
print(f"Number of english words in set: {len(english_word_genes)}")
for gene in english_word_genes:
    print(f"Warning: {gene} appears to be an English word")
        



In [None]:
# First pass: for search terms that are English words, query PubMed again
# with the word "gene" appended to the term
results_TIAB['andGene_count'] = np.nan
for row_label, search_term in tqdm(results_TIAB['search_id'].items(), total=len(results_TIAB)):
    # Terms that are not English words keep their NaN placeholder
    if not is_english_word(search_term):
        continue

    try:
        # Phrase search for "<term> gene" in title/abstract, restricted by the MeSH filter
        gene_phrase_query = f'"{search_term} gene"[TIAB] AND {mesh_query}'
        results_TIAB.loc[row_label, 'andGene_count'] = get_citation_count(gene_phrase_query)
    except Exception as e:
        print(f"Error processing gene {search_term}: {str(e)}")
        results_TIAB.loc[row_label, 'andGene_count'] = np.nan

# nullable integer dtype so rows that were not queried stay missing
results_TIAB['andGene_count'] = results_TIAB['andGene_count'].astype('Int64')

In [None]:
results_TIAB['description_count'] = np.nan
# Second pass: Use descriptions for English words
for idx, row in tqdm(results_TIAB.iterrows(), total=len(results_TIAB)):
    gene = row['search_id']

    if is_english_word(gene):
        description = row['description']
        # A missing description would be formatted into the query as the
        # literal text "nan" and return an unrelated count; leave the row
        # as missing instead of searching for it.
        if not isinstance(description, str) or not description.strip():
            continue

        try:
            # Construct query with description
            desc_query = f'"{description}"[TIAB] AND {mesh_query}'
            count = get_citation_count(desc_query)
            results_TIAB.loc[idx, 'description_count'] = count

        except Exception as e:
            print(f"Error processing description for gene {gene}: {str(e)}")
            results_TIAB.loc[idx, 'description_count'] = np.nan

# nullable integer dtype so rows that were not queried stay missing
results_TIAB['description_count'] = results_TIAB['description_count'].astype('Int64')

In [None]:
# read in COSMIC hallmark genes
hallmark_table = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/data/Cosmic_CancerGeneCensusHallmarksOfCancer_v101_GRCh38.tsv", sep="\t")
# de-duplicate the gene symbols; the order of the list is arbitrary because
# it passes through a set
cancer_hallmark = list(set(hallmark_table['GENE_SYMBOL'].tolist()))
len(cancer_hallmark)

In [None]:
# Get "best score"
def get_best_score(row):
    """Choose the citation count to report as a row's best score.

    Rows whose ``external_gene_name`` is in ``cancer_hallmark`` use
    ``MeSH_count``. All other rows use the larger of the non-missing
    ``andGene_count`` / ``description_count`` values, and fall back to
    ``MeSH_count`` when both of those are missing.
    """
    if row['external_gene_name'] in cancer_hallmark:
        return row['MeSH_count']

    # keep only the refined counts that were actually computed for this row
    refined_counts = [
        row[column]
        for column in ('andGene_count', 'description_count')
        if not pd.isna(row[column])
    ]
    if not refined_counts:
        return row['MeSH_count']
    return max(refined_counts)

# Add the new column to results_TIAB
best_scores = results_TIAB.apply(get_best_score, axis=1)
results_TIAB['best_score'] = best_scores

In [None]:
# write the final citation-count table to disk without the DataFrame index
output_csv = "/Users/talgalper/Documents/GitHub/PhD-MOC/Citation_search/results/PCSF_citation_counts[TIAB].csv"
results_TIAB.to_csv(output_csv, index=False)