### **Function Definitions**

#### *Given requirements (for the data retrieved):*
- For queries with gene names: It is preferred to have the **gene name** in title and/or abstract. Use search tag *[tiab]* for this in the PubMed database. Note: For the PubMed Central (PMC) database, *[tiab]* is an unknown field as it doesn't exist in that DB. 
- The publication must be well cited: at least **25** other articles must cite it.
- For a query that includes "Ivermectin", exclude articles relating to COVID-19 with *"NOT covid-19"* in the query.

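Obtain the top-k article IDs for the results that meet the criteria above. Here, "top-k" means the first k qualifying articles in the search's relevance order, not the k most-cited articles.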

PubMed was chosen as the primary database to search. PMC was then searched as a secondary source of relevant articles.

In [128]:
import time
from Bio import Entrez
import pandas as pd
import numpy as np


## Function to get query counts for PubMed and PMC databases only
def global_db_search(query_list, genes_list=None):
    """Append Entrez EGQuery hit counts for each query to 'global_query_res.csv'.

    Parameters
    ----------
    query_list : sequence of str
        Query strings. When ``genes_list`` is non-empty each entry is used as a
        prefix (e.g. "GABA AND ") to which a gene symbol and "[tiab]" are appended.
    genes_list : sequence of str, optional
        Gene symbols to combine with every query prefix. ``None`` or an empty
        sequence means the queries are sent as-is.

    Returns
    -------
    None
        Results are appended (without a header row) to 'global_query_res.csv'.
    """
    # ``None`` replaces the former mutable ``[]`` default; both mean "no genes".
    if genes_list is not None and len(genes_list) != 0:
        queries = [prefix + gene + "[tiab]" for prefix in query_list for gene in genes_list]
    else:
        queries = list(query_list)

    for query in queries:
        handle = Entrez.egquery(term=query)
        record = Entrez.read(handle)
        # Keep only the first two result rows
        # (presumably the PubMed and PMC entries — TODO confirm against EGQuery output).
        df = pd.DataFrame(record["eGQueryResult"]).head(2)
        df["Query"] = query
        append_data(df, 'global_query_res.csv', False)
        time.sleep(0.11)
    return


## Function for reading in the df "summary" results
def read_in_results(file_name):
    """Load a "summary" CSV and turn its stringified list columns back into lists.

    Parameters
    ----------
    file_name : str or file-like
        CSV with the columns MainID_List, P_Dates, P_Years, LinkedID_List and
        Query_Count (passed straight to ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The list columns hold Python lists of *strings* (no numeric conversion is
        done, so a missing ID comes back as the text ``'nan'``); Query_Count is int.
    """

    def parse_list_cell(cell):
        # Each list was written as its Python repr, e.g. "['1', '2']" or "[nan, PMC7]".
        inner = cell.strip("[]").replace("'", "")
        # "".split(", ") would give [''], so map an empty list back to [].
        return inner.split(", ") if inner else []

    list_columns = ("MainID_List", "P_Dates", "P_Years", "LinkedID_List")
    converters = {column: parse_list_cell for column in list_columns}
    converters["Query_Count"] = int

    res_df = pd.read_csv(file_name, converters=converters)
    return res_df


def esummary_info(in_webenv_key, in_query_key, db_name):
    """Fetch DocSums for a result set stored on the Entrez History server.

    Parameters
    ----------
    in_webenv_key, in_query_key
        The WebEnv and QueryKey values returned by a previous ``esearch`` call.
    db_name : str
        Database the result set belongs to ("pubmed" or "pmc" in this notebook).

    Returns
    -------
    tuple
        ``(publication dates, publication years, linked IDs)``. The linked IDs
        are PMCIDs when ``db_name`` is "pubmed" and PMIDs otherwise.
        NOTE(review): callers treat these lists as position-aligned with the
        esearch IdList — confirm for result sets larger than the esearch retmax.
    """
    docsums = Entrez.read(
        Entrez.esummary(db=db_name, webenv=in_webenv_key, query_key=in_query_key)
    )

    dates, years = get_published_dates(docsums)

    # PubMed records are cross-referenced to PMC; everything else back to PubMed.
    linked_ids = get_pmcids(docsums) if db_name == "pubmed" else get_pmids(docsums)

    return dates, years, linked_ids


def get_published_dates(esummary_rec):
    """Collect the publication date string and year of every DocSum.

    Parameters
    ----------
    esummary_rec : iterable of mapping
        Parsed ESummary records; each must provide a "PubDate" entry.

    Returns
    -------
    tuple of (list, list)
        The raw "PubDate" strings and, in the same order, the publication years
        as ints. A year is ``np.nan`` when "PubDate" is empty or does not start
        with a number, so one malformed record no longer aborts the retrieval.
    """
    retr_dates = []
    retr_years = []

    for article in esummary_rec:
        pub_date = article["PubDate"]
        retr_dates.append(pub_date)

        # "PubDate" is often of the form '2021 Nov 26': the year is the first token.
        try:
            p_year = int(pub_date.split()[0])
        except (IndexError, ValueError):
            # Empty date (no tokens) or a non-numeric first token.
            p_year = np.nan
        retr_years.append(p_year)

    return retr_dates, retr_years


def get_pmcids(esummary_rec):
    """Return the PMCID of every DocSum, or ``np.nan`` where there is none.

    Parameters
    ----------
    esummary_rec : iterable of mapping
        Parsed ESummary records; each must provide an "ArticleIds" mapping.

    Returns
    -------
    list
        One entry per record, in input order.
    """
    pmcids_list = []
    for article in esummary_rec:
        article_ids = article["ArticleIds"]
        # A "pmc" entry means the article also exists in the PubMed Central db.
        # np.nan (not np.NaN, which NumPy 2.0 removed) marks a missing PMCID.
        pmcids_list.append(article_ids["pmc"] if "pmc" in article_ids else np.nan)

    return pmcids_list


def get_pmids(esummary_rec):
    """Return the PMID of every DocSum, or ``np.nan`` where there is none.

    Parameters
    ----------
    esummary_rec : iterable of mapping
        Parsed ESummary records; each must provide ``["ArticleIds"]["pmid"]``.

    Returns
    -------
    list
        One entry per record, in input order.
    """
    pmids_list = []
    for article in esummary_rec:
        pmid = article["ArticleIds"]["pmid"]
        # '0' means that the article has no PMID (i.e., it's not found in the PubMed db).
        # np.nan (not np.NaN, which NumPy 2.0 removed) marks a missing PMID.
        pmids_list.append(np.nan if pmid == '0' else pmid)

    return pmids_list


## Function that retrieves summary results from a given set of queries (which don't require a gene list)
def get_query_info_no_genes(query_in, db_name):
    """Run one gene-free query against ``db_name`` and summarise its hits.

    Parameters
    ----------
    query_in : str
        Complete Entrez query string.
    db_name : str
        Entrez database to search ("pubmed" or "pmc" in this notebook).

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with the columns Query, Db_Name, Query_Count,
        MainID_List, P_Dates, P_Years and LinkedID_List, or ``None`` when the
        search returns no IDs.
    """
    # sort="relevance" requests relevance ranking; usehistory="y" makes esearch
    # return the WebEnv/QueryKey pair that esummary_info() needs.
    hits = Entrez.read(
        Entrez.esearch(db=db_name, term=query_in, sort="relevance", retmax=5000, usehistory="y")
    )

    # Nothing matched: there is no result set to summarise.
    if not len(hits["IdList"]):
        return None

    dates, years, linked_ids = esummary_info(hits["WebEnv"], hits["QueryKey"], db_name)

    time.sleep(0.11)

    summary_row = [query_in, db_name, hits['Count'], hits['IdList'], dates, years, linked_ids]
    header = ['Query', 'Db_Name', 'Query_Count', 'MainID_List', 'P_Dates', 'P_Years', 'LinkedID_List']
    return pd.DataFrame([summary_row], columns=header)


## Function that retrieves summary results from a given set of queries (which requires a gene list)
def get_query_info(query_in, genes, db_name):
    """Run ``query_in`` once per gene against ``db_name`` and summarise the hits.

    Parameters
    ----------
    query_in : str
        Query prefix that a gene symbol is appended to (e.g. "GABA AND ").
    genes : iterable of str
        Gene symbols; one search is issued per gene.
    db_name : str
        Entrez database to search. For "pubmed" the gene is restricted to
        Title/Abstract with "[tiab]"; any other database gets the bare gene.

    Returns
    -------
    pandas.DataFrame
        One row per gene whose search returned at least one ID, with the columns
        Query, Db_Name, Query_Count, MainID_List, P_Dates, P_Years and
        LinkedID_List. Empty (columns only) when nothing matched.
    """
    header = ['Query', 'Db_Name', 'Query_Count', 'MainID_List', 'P_Dates', 'P_Years', 'LinkedID_List']

    # PubMed's search field tag for Title/Abstract; other databases get no tag.
    field_tag = "[tiab]" if db_name == "pubmed" else ""

    rows = []
    for gene in genes:
        term = query_in + gene + field_tag

        # sort="relevance" requests relevance ranking; usehistory="y" makes esearch
        # return the WebEnv/QueryKey pair that esummary_info() needs.
        hits = Entrez.read(
            Entrez.esearch(db=db_name, term=term, sort="relevance", retmax=5000, usehistory="y")
        )

        # Only queries that actually returned IDs are summarised.
        if len(hits["IdList"]) > 0:
            dates, years, linked_ids = esummary_info(hits["WebEnv"], hits["QueryKey"], db_name)
            rows.append([term, db_name, hits['Count'], hits['IdList'], dates, years, linked_ids])
            time.sleep(0.11)

    return pd.DataFrame(rows, columns=header)


## Function for obtaining citation counts for the set of IDs found in the "summary" df
def cited_cnt_table(df_summary, db_name, no_dupes_col=False):
    """
    Build a table of citation counts for every ID listed in a "summary" DataFrame.

    One ELink request is issued per ID, using the link name
    "pubmed_pubmed_citedin" for db_name == "pubmed" and "pmc_pmc_citedby"
    otherwise ("pmc" is the other db_name in this use case).

    By default, no_dupes_col is False and the IDs are read from "MainID_List".

    If no_dupes_col is specified as True, then we refer to the DataFrame with results from the PMC database
    and read the IDs from its "NoDupes_PMCIDs" column, i.e. the PMC IDs left after removing, per query,
    the articles already pulled from the PubMed results (PubMed being the first database, and then PMC).

    Returns a DataFrame with the columns Query, Db_Name, Id_List (one ID per row)
    and Citation_Cnts, in the order the IDs were visited.
    """
    link_name = "pubmed_pubmed_citedin" if db_name == "pubmed" else "pmc_pmc_citedby"
    id_col = "NoDupes_PMCIDs" if no_dupes_col is True else "MainID_List"

    elink_data = []
    for query_term, id_list in zip(df_summary["Query"], df_summary[id_col]):
        # enumerate() supplies the running position directly; the previous
        # list.index() lookup was quadratic and wrong for repeated IDs.
        for position, id_num in enumerate(id_list, start=1):
            record = Entrez.read(Entrez.elink(id=id_num, dbfrom=db_name, db=db_name, linkname=link_name))

            link_sets = record[0]["LinkSetDb"]
            # 'LinkSetDb' is an empty list when an article has no citation counts.
            cited_counts = len(link_sets[0]["Link"]) if len(link_sets) != 0 else 0
            elink_data.append([query_term, db_name, id_num, cited_counts])

            # Short pause after every third request.
            if position % 3 == 0:
                time.sleep(0.11)

    return pd.DataFrame(elink_data, columns=["Query", "Db_Name", "Id_List", "Citation_Cnts"])


## Function that returns the Top-k results (pass in k as an argument to the function, input by the user)
def get_top_k(df, k_val, min_citations=25):
    """Pick, per query, the first ``k_val`` IDs that are cited often enough.

    Rows are taken in the order they appear in ``df``, so "top" means the first
    qualifying rows, not the highest citation counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Citation table with the columns Query, Id_List and Citation_Cnts.
    k_val : int
        Maximum number of IDs to keep per query. Values <= 0 keep nothing.
    min_citations : int, optional
        Minimum citation count an article needs to qualify (default 25).

    Returns
    -------
    pandas.DataFrame
        Columns Query, "Top_<k_val>_Ids" (list of int IDs) and Citation_Cnts
        (list of the matching counts). Queries with no qualifying article are
        omitted; queries keep their order of first appearance in ``df``.
    """
    q_top_k = []

    if k_val > 0:
        well_cited = df[df["Citation_Cnts"] >= min_citations]

        for q in df["Query"].unique():
            top_hits = well_cited[well_cited["Query"] == q].iloc[:k_val]
            if len(top_hits) == 0:
                continue
            matches_ids = [int(id_num) for id_num in top_hits["Id_List"]]
            counts = top_hits["Citation_Cnts"].tolist()
            q_top_k.append([q, matches_ids, counts])

    top_k_df = pd.DataFrame(q_top_k, columns=["Query", "Top_"+str(k_val)+"_Ids", "Citation_Cnts"])

    return top_k_df


## Function that appends DataFrame rows to a CSV file
def append_data(df, file_name, is_new_file):
    """Write ``df`` to the CSV ``file_name``, either fresh or as extra rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to write (the index is never written).
    file_name : str
        Target CSV path.
    is_new_file : bool
        True  -> (over)write the file including the header row.
        False -> append the rows to the file without a header row.
    """
    if not is_new_file:
        # Existing CSV: add the rows at the end and do not repeat the header.
        df.to_csv(file_name, mode='a', index=False, header=False)
        return

    # New CSV: header plus rows.
    df.to_csv(file_name, index=False)
    return


---

In [8]:
# Placeholder address: replace it with a real contact e-mail before running any query.
Entrez.email = "Your.Email.Here@example.org"  # Always tell NCBI who you are

In [60]:
# Gene symbols that get_query_info() appends, one at a time, to each query prefix.
genes = ['GABRA1', 'GABRA2', 'GABRA3', 'GABRA4', 'GABRA5']

# Query prefixes: each ends with "AND " because the gene symbol is concatenated directly after it.
query_vers_with_genes = ["GABA AND ", "Zebrafish AND ", "GABA AND Ivermectin NOT covid-19 AND ", "GABA AND Ivermectin NOT covid-19 AND Zebrafish AND "]
# Complete queries that are sent as-is (no gene symbol is appended).
query_vers_no_genes = ["GABA AND Ivermectin NOT covid-19", "GABA AND Ivermectin NOT covid-19 AND Zebrafish"]

##### PubMed information retrieval

In [25]:
# First query prefix ("GABA AND ") against PubMed, one search per gene.
df_q = get_query_info(query_vers_with_genes[0], genes, "pubmed")

# Display the summary rows for inspection.
df_q

Unnamed: 0,Query,Db_Name,Query_Count,MainID_List,P_Dates,P_Years,LinkedID_List
0,GABA AND GABRA1[tiab],pubmed,169,"[33825593, 30324621, 32047208, 28535436, 24613...","[2021 Mar, 2018 Nov, 2020 Feb 11, 2017 Jul, 20...","[2021, 2018, 2020, 2017, 2014, 2014, 2015, 202...","[nan, nan, PMC7012862, nan, PMC6294571, PMC392..."
1,GABA AND GABRA2[tiab],pubmed,173,"[32565954, 24613745, 18005236, 24136292, 34174...","[2020 Jul, 2014 May, 2008 Jun, 2014 Mar, 2021 ...","[2020, 2014, 2008, 2014, 2021, 2008, 2018, 201...","[PMC7286117, PMC6294571, nan, PMC3924525, PMC8..."
2,GABA AND GABRA3[tiab],pubmed,71,"[32565954, 19084931, 24040174, 19087248, 34174...","[2020 Jul, 2008 Dec 21, 2013, 2008 Dec 16, 202...","[2020, 2008, 2013, 2008, 2021, 2008, 2008, 201...","[PMC7286117, PMC2776875, PMC3764027, PMC261575..."
3,GABA AND GABRA4[tiab],pubmed,56,"[29299688, 29720720, 29445327, 26405827, 29151...","[2018 Apr, 2018 Jul, 2018, 2016 Apr, 2018 Jan,...","[2018, 2018, 2018, 2016, 2018, 2011, 2008, 200...","[nan, nan, PMC5797743, PMC4821055, PMC5792317,..."
4,GABA AND GABRA5[tiab],pubmed,105,"[33442857, 30815456, 21070817, 22383672, 12661...","[2021 Nov, 2019 Feb, 2011 Mar 1, 2012 Apr, 200...","[2021, 2019, 2011, 2012, 2003, 2021, 2009, 201...","[nan, PMC6388437, nan, nan, nan, PMC8278801, n..."


In [26]:
# is_new_file=True: (re)creates pubmed_raw_res.csv with a header row.
append_data(df_q, "pubmed_raw_res.csv", True)

In [9]:
# Remaining gene-based query prefixes (the first one created the CSV above): append their rows.
for query_prefix in query_vers_with_genes[1:]:
    df_q = get_query_info(query_prefix, genes, "pubmed")
    append_data(df_q, "pubmed_raw_res.csv", False)

In [10]:
# Gene-free queries against PubMed: append their summary rows to the same CSV.
for query in query_vers_no_genes:
    df_q = get_query_info_no_genes(query, "pubmed")
    # get_query_info_no_genes() returns None when the search has no hits;
    # passing None on to append_data() would raise AttributeError.
    if df_q is None:
        continue
    append_data(df_q, "pubmed_raw_res.csv", False)

##### PubMed Central (PMC) information retrieval

In [11]:
# Same first query prefix against PMC; for non-PubMed databases no "[tiab]" tag is added.
df_q = get_query_info(query_vers_with_genes[0], genes, "pmc")
# is_new_file=True: (re)creates pmc_raw_res.csv with a header row.
append_data(df_q, "pmc_raw_res.csv", True)

In [12]:
# Remaining gene-based query prefixes (the first one created the CSV above): append their rows.
for query_prefix in query_vers_with_genes[1:]:
    df_q = get_query_info(query_prefix, genes, "pmc")
    append_data(df_q, "pmc_raw_res.csv", False)

In [13]:
# Gene-free queries against PMC: append their summary rows to the same CSV.
for query in query_vers_no_genes:
    df_q = get_query_info_no_genes(query, "pmc")
    # get_query_info_no_genes() returns None when the search has no hits;
    # passing None on to append_data() would raise AttributeError.
    if df_q is None:
        continue
    append_data(df_q, "pmc_raw_res.csv", False)

---
#### Now, getting the table of citation counts for all the IDs and then obtaining the top-5 per query with the functions ***cited_cnt_table(df_summary, db_name, no_dupes_col)*** and ***get_top_k(df, k_val)***, respectively.

In [5]:
# Our PubMed summary results, re-read from disk; the list columns come back as lists of strings.
df_summary = read_in_results("pubmed_raw_res.csv")

In [6]:
# Display the parsed PubMed summary table.
df_summary

Unnamed: 0,Query,Db_Name,Query_Count,MainID_List,P_Dates,P_Years,LinkedID_List
0,GABA AND GABRA1[tiab],pubmed,169,"[33825593, 30324621, 32047208, 28535436, 24613...","[2021 Mar, 2018 Nov, 2020 Feb 11, 2017 Jul, 20...","[2021, 2018, 2020, 2017, 2014, 2014, 2015, 202...","[nan, nan, PMC7012862, nan, PMC6294571, PMC392..."
1,GABA AND GABRA2[tiab],pubmed,173,"[32565954, 24613745, 18005236, 24136292, 34174...","[2020 Jul, 2014 May, 2008 Jun, 2014 Mar, 2021 ...","[2020, 2014, 2008, 2014, 2021, 2008, 2018, 201...","[PMC7286117, PMC6294571, nan, PMC3924525, PMC8..."
2,GABA AND GABRA3[tiab],pubmed,71,"[32565954, 19084931, 24040174, 19087248, 34174...","[2020 Jul, 2008 Dec 21, 2013, 2008 Dec 16, 202...","[2020, 2008, 2013, 2008, 2021, 2008, 2008, 201...","[PMC7286117, PMC2776875, PMC3764027, PMC261575..."
3,GABA AND GABRA4[tiab],pubmed,56,"[29299688, 29720720, 29445327, 26405827, 29151...","[2018 Apr, 2018 Jul, 2018, 2016 Apr, 2018 Jan,...","[2018, 2018, 2018, 2016, 2018, 2011, 2008, 200...","[nan, nan, PMC5797743, PMC4821055, PMC5792317,..."
4,GABA AND GABRA5[tiab],pubmed,105,"[33442857, 30815456, 21070817, 22383672, 12661...","[2021 Nov, 2019 Feb, 2011 Mar 1, 2012 Apr, 200...","[2021, 2019, 2011, 2012, 2003, 2021, 2009, 201...","[nan, PMC6388437, nan, nan, nan, PMC8278801, n..."
5,Zebrafish AND GABRA1[tiab],pubmed,8,"[28535436, 32205311, 34411917, 32753576, 34925...","[2017 Jul, 2020 Apr 13, 2021 Oct, 2020 Aug 4, ...","[2017, 2020, 2021, 2020, 2021, 2019, 2018, 2020]","[nan, PMC7197724, nan, PMC7403336, PMC8672801,..."
6,Zebrafish AND GABRA2[tiab],pubmed,3,"[32753576, 32725455, 29124181]","[2020 Aug 4, 2021 Oct, 2015 Sep]","[2020, 2021, 2015]","[PMC7403336, nan, PMC5668850]"
7,GABA AND Ivermectin NOT covid-19,pubmed,200,"[27543424, 27742867, 33069391, 29055807, 28656...","[2016 Oct, 2017 Jan, 2021 Mar, 2017 Dec, 2017 ...","[2016, 2017, 2021, 2017, 2017, 2021, 2010, 202...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
8,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pubmed,3,"[25733401, 28479061, 24040509]","[2015 May-Jun, 2017 Jun 6, 2013 Sep 10]","[2015, 2017, 2013]","[nan, nan, PMC3771564]"


In [7]:
# Check column dtypes: Query_Count should be int, the list columns are object.
df_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Query          9 non-null      object
 1   Db_Name        9 non-null      object
 2   Query_Count    9 non-null      int64 
 3   MainID_List    9 non-null      object
 4   P_Dates        9 non-null      object
 5   P_Years        9 non-null      object
 6   LinkedID_List  9 non-null      object
dtypes: int64(1), object(6)
memory usage: 632.0+ bytes


In [71]:
# cited_cnt_table() sends one ELink request per ID in MainID_List.
df_c = cited_cnt_table(df_summary, db_name="pubmed")  # obtaining the citation counts for the PMIDs in the df_summary

In [72]:
# Display the per-ID citation counts (one row per query/ID pair).
df_c

Unnamed: 0,Query,Db_Name,Id_List,Citation_Cnts
0,GABA AND GABRA1[tiab],pubmed,33825593,0
1,GABA AND GABRA1[tiab],pubmed,30324621,20
2,GABA AND GABRA1[tiab],pubmed,32047208,3
3,GABA AND GABRA1[tiab],pubmed,28535436,3
4,GABA AND GABRA1[tiab],pubmed,24613745,14
...,...,...,...,...
783,GABA AND Ivermectin NOT covid-19,pubmed,13678839,33
784,GABA AND Ivermectin NOT covid-19,pubmed,11714703,63
785,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pubmed,25733401,11
786,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pubmed,28479061,22


In [122]:
# is_new_file=True: (re)creates pubmed_citation_cnts.csv with a header row.
append_data(df_c, "pubmed_citation_cnts.csv", True)

In [123]:
# get_top_k() function with k_val=5: per query, the first five IDs (in table order) with >= 25 citations.
df_top_5 = get_top_k(df_c, 5)

In [124]:
# Display the PubMed top-5 table; queries with no qualifying article are absent.
df_top_5

Unnamed: 0,Query,Top_5_Ids,Citation_Cnts
0,GABA AND GABRA1[tiab],"[24136292, 24361861, 20438718, 27521439, 25194...","[48, 65, 40, 41, 48]"
1,GABA AND GABRA2[tiab],"[18005236, 24136292, 18482426, 29961870, 22501...","[34, 48, 27, 27, 30]"
2,GABA AND GABRA3[tiab],"[18334916, 18821008, 23169495, 18550761, 29078...","[34, 203, 25, 51, 49]"
3,GABA AND GABRA4[tiab],"[18334916, 18482426, 16901909, 16080114, 25124...","[34, 27, 67, 140, 27]"
4,GABA AND GABRA5[tiab],"[22383672, 19809285, 29961870, 16452257, 23886...","[26, 31, 27, 110, 54]"
5,GABA AND Ivermectin NOT covid-19,"[12421359, 23133688, 7957605, 21575687, 11082440]","[28, 34, 33, 29, 44]"


In [125]:
# is_new_file=True: (re)creates pubmed_top5_25cited.csv with a header row.
append_data(df_top_5, "pubmed_top5_25cited.csv", True)

#### Now, for the **PMC** database results, before getting the top-5, we want to remove any duplicate IDs (*on a per query basis*) that were already retrieved from the PubMed database.

In [3]:
# PMC summary results, re-read from disk; the list columns come back as lists of strings.
pmc_df = read_in_results("pmc_raw_res.csv")

In [14]:
# Display the parsed PMC summary table (MainID_List = PMCIDs, LinkedID_List = PMIDs).
pmc_df

Unnamed: 0,Query,Db_Name,Query_Count,MainID_List,P_Dates,P_Years,LinkedID_List
0,GABA AND GABRA1,pmc,1214,"[6214766, 6543741, 7012862, 7197724, 7697095, ...","[2018 Sep 7, 2019 Jun, 2020 Feb 11, 2020 Apr 2...","[2018, 2019, 2020, 2020, 2020, 2017, 2019, 201...","[30103280, 30571139, 32047208, 32205311, 33187..."
1,GABA AND GABRA2,pmc,1055,"[6061692, 6081406, 5668850, 7697095, 6449455, ...","[2018 Jun 28, 2018 Aug 7, 2015 Aug 6, 2020 Nov...","[2018, 2018, 2015, 2020, 2019, 2014, 2020, 201...","[29961870, 30087324, 29124181, 33187258, 30984..."
2,GABA AND GABRA3,pmc,475,"[7697095, 2776875, 4754346, 7286117, 6180030, ...","[2020 Nov 11, 2008 Dec 21, 2016 Feb 12, 2020 A...","[2020, 2008, 2016, 2020, 2018, 2020, 2020, 201...","[33187258, 19084931, 26869349, 32565954, 30305..."
3,GABA AND GABRA4,pmc,488,"[7007694, 7697095, 8316187, 6180030, 7698927, ...","[2020 Feb 7, 2020 Nov 11, 2021 Jun 21, 2018 Oc...","[2020, 2020, 2021, 2018, 2020, 2016, 2018, 202...","[32033586, 33187258, 34152447, 30305619, 33218..."
4,GABA AND GABRA5,pmc,714,"[8447520, 7697095, 4307650, 6214766, 6388437, ...","[2021 Sep 16, 2020 Nov 11, 2015 Jan 19, 2018 S...","[2021, 2020, 2015, 2018, 2019, 1997, 2018, 201...","[34530807, 33187258, 25653499, 30103280, 30815..."
5,Zebrafish AND GABRA1,pmc,142,"[7197724, 5466539, 7403336, 6435997, 5922542, ...","[2020 Apr 28, 2017 Apr 5, 2020 Aug 4, 2019 Mar...","[2020, 2017, 2020, 2019, 2018, 2021, 2021, 201...","[32205311, 28217866, 32753576, 30949046, 29702..."
6,Zebrafish AND GABRA2,pmc,78,"[5668850, 7405782, 7336760, 7403336, 5922542, ...","[2015 Aug 6, 2020 May 23, 2020 Apr 29, 2020 Au...","[2015, 2020, 2020, 2020, 2018, 2008, 2016, 202...","[29124181, 32446246, 32347641, 32753576, 29702..."
7,Zebrafish AND GABRA3,pmc,55,"[5922542, 7973766, 6894506, 5515482, 3860818, ...","[2018 Apr 27, 2021 Mar 18, 2019 Dec 5, 2016 Au...","[2018, 2021, 2019, 2016, 2011, 2020, 2016, 201...","[29702678, 33737538, 31806011, 28730152, 22207..."
8,Zebrafish AND GABRA4,pmc,43,"[5922542, 5515482, 7973766, 8462739, 3860818, ...","[2018 Apr 27, 2016 Aug 27, 2021 Mar 18, 2021 S...","[2018, 2016, 2021, 2021, 2011, 2020, 2021, 201...","[29702678, 28730152, 33737538, 34559810, 22207..."
9,Zebrafish AND GABRA5,pmc,56,"[5922542, 7973766, 5515482, 5110251, 4478541, ...","[2018 Apr 27, 2021 Mar 18, 2016 Aug 27, 2016 M...","[2018, 2021, 2016, 2016, 2015, 2011, 2019, 201...","[29702678, 33737538, 28730152, 27857842, 25840..."


In [78]:
# List the column names of the PMC summary table.
pmc_df.columns

Index(['Query', 'Db_Name', 'Query_Count', 'MainID_List', 'P_Dates', 'P_Years',
       'LinkedID_List'],
      dtype='object')

In [36]:
# Preview the PubMed queries with the "[tiab]" tag removed, so they can be compared with
# the PMC queries. A raw string is used because "\[" is not a valid escape in a normal
# string literal; the regex itself is unchanged.
df_summary["Query"].replace(r"\[tiab\]", "", regex=True)

0                                   GABA AND GABRA1
1                                   GABA AND GABRA2
2                                   GABA AND GABRA3
3                                   GABA AND GABRA4
4                                   GABA AND GABRA5
5                              Zebrafish AND GABRA1
6                              Zebrafish AND GABRA2
7                  GABA AND Ivermectin NOT covid-19
8    GABA AND Ivermectin NOT covid-19 AND Zebrafish
Name: Query, dtype: object

In [81]:
# PubMed queries without the "[tiab]" tag; built once because it does not depend on the row.
queries_pubmed = df_summary["Query"].replace(r"\[tiab\]", "", regex=True).to_list()

for i in range(len(pmc_df)):

    pmc_query = pmc_df.iloc[i]["Query"]
    # Use the stripped text for BOTH the membership test and the lookup; stripping only
    # one of them could raise ValueError in .index() for a query with stray whitespace.
    pmc_query_key = pmc_query.strip()

    # "PubMed DataFrame idx" refers to the respective index where that query is found (if it exists) in the df_summary
    if pmc_query_key in queries_pubmed:
        idx = queries_pubmed.index(pmc_query_key)
        print(pmc_query, "\tPubMed DataFrame idx:", idx)
    else:
        print(pmc_query, "\tPubMed DataFrame idx: none")

GABA AND GABRA1 	Pubmed DataFrame idx: 0
GABA AND GABRA2 	Pubmed DataFrame idx: 1
GABA AND GABRA3 	Pubmed DataFrame idx: 2
GABA AND GABRA4 	Pubmed DataFrame idx: 3
GABA AND GABRA5 	Pubmed DataFrame idx: 4
Zebrafish AND GABRA1 	Pubmed DataFrame idx: 5
Zebrafish AND GABRA2 	Pubmed DataFrame idx: 6
Zebrafish AND GABRA3 	Pubmed DataFrame idx: none
Zebrafish AND GABRA4 	Pubmed DataFrame idx: none
Zebrafish AND GABRA5 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 AND GABRA1 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 AND GABRA2 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 AND GABRA3 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 AND GABRA4 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 AND GABRA5 	Pubmed DataFrame idx: none
GABA AND Ivermectin NOT covid-19 	Pubmed DataFrame idx: 7
GABA AND Ivermectin NOT covid-19 AND Zebrafish 	Pubmed DataFrame idx: 8


In [116]:
# Creating List: we remove the tag "[tiab]" in the PubMed "Query" column so we can check for
# existence (based on the strings). Built once because it does not depend on the row.
queries_pubmed = df_summary["Query"].replace(r"\[tiab\]", "", regex=True).to_list()


def _has_no_pmid(linked_id):
    """Return True when a LinkedID_List entry stands for "this PMC article has no PMID"."""
    # After a CSV round trip through read_in_results() the marker is the text 'nan';
    # a frame that was never written to disk holds a float NaN (NaN != NaN).
    return linked_id != linked_id or str(linked_id).strip().lower() == "nan"


index_list = []
for i in range(len(pmc_df)):

    pmc_query = pmc_df.iloc[i]["Query"]
    linked_pmids = pmc_df.iloc[i]["LinkedID_List"]

    # Per query. First, we need to see if PubMed returned any results for that query.
    if pmc_query in queries_pubmed:
        idx = queries_pubmed.index(pmc_query)
        pubmed_ids = set(df_summary.iloc[idx]["MainID_List"])
    else:
        # No PubMed results for this query: nothing can be a duplicate, keep everything.
        pubmed_ids = set()

    # Positions to keep: articles without a PMID, or whose PMID PubMed did not return.
    # enumerate() gives each entry its own position; list.index() would return the
    # first occurrence for repeated values (every 'nan' entry mapped to the same slot).
    index_list.append([position for position, pmid in enumerate(linked_pmids)
                       if _has_no_pmid(pmid) or pmid not in pubmed_ids])


no_dupes_list = []
for row in range(len(pmc_df)):
    # Given the list of indexes, retrieving the corresponding PMCIDs, while still preserving order.
    main_ids = pmc_df.iloc[row]["MainID_List"]
    no_dupes_list.append([main_ids[indx] for indx in index_list[row]])

# Align on pmc_df's own index so the assignment is positional whatever that index is.
pmc_df["NoDupes_PMCIDs"] = pd.Series(no_dupes_list, index=pmc_df.index)


In [120]:
# Display the PMC table including the new "NoDupes_PMCIDs" column.
pmc_df

Unnamed: 0,Query,Db_Name,Query_Count,MainID_List,P_Dates,P_Years,LinkedID_List,NoDupes_PMCIDs
0,GABA AND GABRA1,pmc,1214,"[6214766, 6543741, 7012862, 7197724, 7697095, ...","[2018 Sep 7, 2019 Jun, 2020 Feb 11, 2020 Apr 2...","[2018, 2019, 2020, 2020, 2020, 2017, 2019, 201...","[30103280, 30571139, 32047208, 32205311, 33187...","[6214766, 6543741, 7697095, 6463728, 6180030, ..."
1,GABA AND GABRA2,pmc,1055,"[6061692, 6081406, 5668850, 7697095, 6449455, ...","[2018 Jun 28, 2018 Aug 7, 2015 Aug 6, 2020 Nov...","[2018, 2018, 2015, 2020, 2019, 2014, 2020, 201...","[29961870, 30087324, 29124181, 33187258, 30984...","[7697095, 4454465, 6180030, 7698927, 8380214, ..."
2,GABA AND GABRA3,pmc,475,"[7697095, 2776875, 4754346, 7286117, 6180030, ...","[2020 Nov 11, 2008 Dec 21, 2016 Feb 12, 2020 A...","[2020, 2008, 2016, 2020, 2018, 2020, 2020, 201...","[33187258, 19084931, 26869349, 32565954, 30305...","[7697095, 6180030, 7698927, 5538121, 8380214, ..."
3,GABA AND GABRA4,pmc,488,"[7007694, 7697095, 8316187, 6180030, 7698927, ...","[2020 Feb 7, 2020 Nov 11, 2021 Jun 21, 2018 Oc...","[2020, 2020, 2021, 2018, 2020, 2016, 2018, 202...","[32033586, 33187258, 34152447, 30305619, 33218...","[7697095, 8316187, 6180030, 7698927, 8380214, ..."
4,GABA AND GABRA5,pmc,714,"[8447520, 7697095, 4307650, 6214766, 6388437, ...","[2021 Sep 16, 2020 Nov 11, 2015 Jan 19, 2018 S...","[2021, 2020, 2015, 2018, 2019, 1997, 2018, 201...","[34530807, 33187258, 25653499, 30103280, 30815...","[7697095, 4307650, 6214766, 5972534, 6180030, ..."
5,Zebrafish AND GABRA1,pmc,142,"[7197724, 5466539, 7403336, 6435997, 5922542, ...","[2020 Apr 28, 2017 Apr 5, 2020 Aug 4, 2019 Mar...","[2020, 2017, 2020, 2019, 2018, 2021, 2021, 201...","[32205311, 28217866, 32753576, 30949046, 29702...","[5466539, 5922542, 7973766, 5515482, 6899022, ..."
6,Zebrafish AND GABRA2,pmc,78,"[5668850, 7405782, 7336760, 7403336, 5922542, ...","[2015 Aug 6, 2020 May 23, 2020 Apr 29, 2020 Au...","[2015, 2020, 2020, 2020, 2018, 2008, 2016, 202...","[29124181, 32446246, 32347641, 32753576, 29702...","[7405782, 7336760, 5922542, 2656604, 5515482, ..."
7,Zebrafish AND GABRA3,pmc,55,"[5922542, 7973766, 6894506, 5515482, 3860818, ...","[2018 Apr 27, 2021 Mar 18, 2019 Dec 5, 2016 Au...","[2018, 2021, 2019, 2016, 2011, 2020, 2016, 201...","[29702678, 33737538, 31806011, 28730152, 22207...","[5922542, 7973766, 6894506, 5515482, 3860818, ..."
8,Zebrafish AND GABRA4,pmc,43,"[5922542, 5515482, 7973766, 8462739, 3860818, ...","[2018 Apr 27, 2016 Aug 27, 2021 Mar 18, 2021 S...","[2018, 2016, 2021, 2021, 2011, 2020, 2021, 201...","[29702678, 28730152, 33737538, 34559810, 22207...","[5922542, 5515482, 7973766, 8462739, 3860818, ..."
9,Zebrafish AND GABRA5,pmc,56,"[5922542, 7973766, 5515482, 5110251, 4478541, ...","[2018 Apr 27, 2021 Mar 18, 2016 Aug 27, 2016 M...","[2018, 2021, 2016, 2016, 2015, 2011, 2019, 201...","[29702678, 33737538, 28730152, 27857842, 25840...","[5922542, 7973766, 5515482, 5110251, 4478541, ..."


In [126]:
# is_new_file=True: (re)creates pmc_summary_nodupes.csv with a header row.
append_data(pmc_df, "pmc_summary_nodupes.csv", True)

#### "NoDupes_PMCIDs" column: **for each of the queries**, we only grabbed the article IDs that were *not* found in the (previously obtained) PubMed search results.
----

#### Now, based on what we currently have in **pmc_df**, we can get the top-5 results per query!

In [129]:
# no_dupes_col=True makes cited_cnt_table() read the IDs from "NoDupes_PMCIDs";
# it sends one ELink request per ID.
df_cnt = cited_cnt_table(pmc_df, db_name="pmc", no_dupes_col=True)
# is_new_file=True: (re)creates pmc_citation_cnts.csv with a header row.
append_data(df_cnt, "pmc_citation_cnts.csv", True)

In [130]:
# Display the per-ID citation counts for the de-duplicated PMC articles.
df_cnt

Unnamed: 0,Query,Db_Name,Id_List,Citation_Cnts
0,GABA AND GABRA1,pmc,6214766,7
1,GABA AND GABRA1,pmc,6543741,12
2,GABA AND GABRA1,pmc,7697095,0
3,GABA AND GABRA1,pmc,6463728,8
4,GABA AND GABRA1,pmc,6180030,43
...,...,...,...,...
5112,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pmc,4278187,9
5113,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pmc,8953458,0
5114,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pmc,8233170,3
5115,GABA AND Ivermectin NOT covid-19 AND Zebrafish,pmc,6174339,34


In [131]:
# Per query, the first five IDs (in table order) with >= 25 citations.
top_5_df = get_top_k(df_cnt, 5)

# Below, the Top_5_Ids column contains PMCIDs only.
top_5_df

Unnamed: 0,Query,Top_5_Ids,Citation_Cnts
0,GABA AND GABRA1,"[6180030, 5195897, 6061692, 3412149, 5771312]","[43, 63, 28, 252, 34]"
1,GABA AND GABRA2,"[6180030, 6870297, 3181829, 3717553, 4933285]","[43, 34, 48, 29, 27]"
2,GABA AND GABRA3,"[6180030, 6061692, 5545734, 3412149, 4239297]","[43, 28, 32, 252, 30]"
3,GABA AND GABRA4,"[6180030, 4477717, 5301472, 3924525, 6061692]","[43, 160, 31, 44, 28]"
4,GABA AND GABRA5,"[4307650, 6180030, 2075237, 4477717, 3412149]","[46, 43, 143, 160, 252]"
5,Zebrafish AND GABRA1,"[5466539, 5515482, 5027381, 4786103, 3484860]","[27, 25, 33, 33, 50]"
6,Zebrafish AND GABRA2,"[2656604, 5515482, 3860818, 2577853, 4159132]","[52, 25, 42, 103, 34]"
7,Zebrafish AND GABRA3,"[5515482, 3860818, 6180030, 4053846, 6757416]","[25, 42, 43, 93, 27]"
8,Zebrafish AND GABRA4,"[5515482, 3860818, 6180030, 2802832, 2577853]","[25, 42, 43, 54, 103]"
9,Zebrafish AND GABRA5,"[5515482, 3860818, 5168934, 6673626, 6180030]","[25, 42, 158, 78, 43]"


In [132]:
# is_new_file=True: (re)creates pmc_top5_25cited.csv with a header row.
append_data(top_5_df, "pmc_top5_25cited.csv", True)