### Week 5 - Biological Databases - PubMed
- October 2023
- [https://github.com/tisimpson/bioinformatics1](https://github.com/tisimpson/bioinformatics1)
- [ian.simpson@ed.ac.uk](mailto:ian.simpson@ed.ac.uk)

In [None]:
import pandas as pd
import urllib as ul
import numpy as np

In [None]:
%pip install biopython

1. Plot a graph of how many papers were in PubMed in each year for the last 10 years
2. How many papers are there relating to Cadherin-7 and human disease? What diseases are mentioned?
    - Try three different search strategies, which one do you think is best and why?
3. Can you use PubMed to work out how the popularity of single-cell RNA sequencing for measuring gene expression has changed over the last 10 years?

In [None]:
from Bio import Entrez

Entrez.email = "A.N.Other@example.com" # You should replace this with your e-mail address

# Maps each year to the number of PubMed records whose publication date ([dp])
# lies between 1900 and that year, i.e. a running total rather than the number
# of papers published in that single year.
year_counts = {}

# Query PubMed directly with esearch instead of the global egquery search:
# esearch reports the total number of hits in "Count", and retmax=0 asks the
# server not to send back any record ids since only the count is needed.
for i in range(2012,2023,1):
    handle = Entrez.esearch(db="pubmed", term='1900:'+str(i)+'[dp]', retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # always release the connection, even if parsing the reply fails
        handle.close()
    year_counts[i] = int(record["Count"])

print(year_counts)

In [None]:
# One-column table: the index holds the keys of year_counts and the
# 'counts' column holds the corresponding paper totals.
papers_by_year = pd.DataFrame.from_dict(
    year_counts,
    orient='index',
    columns=['counts'],
)

# Bar chart on a linear y-axis with no legend (there is only one series).
# For counts this large matplotlib typically labels the axis using a
# scientific-notation multiplier.
ax = papers_by_year.plot.bar(logy=False, legend=False)
ax.set_ylabel('Number of Papers')
ax.set_xlabel('Year')
ax.set_title('PubMed Paper Content by Year')

In [None]:
from Bio import Entrez

Entrez.email = "A.N.Other@example.com" # You should replace this with your e-mail address

# Maps each decade boundary (1910, 1920, ..., 2020) to the number of PubMed
# records whose publication date ([dp]) lies between 1900 and that year,
# i.e. a running total up to the boundary year.
decade_counts = {}

# Query PubMed directly with esearch instead of the global egquery search:
# esearch reports the total number of hits in "Count", and retmax=0 asks the
# server not to send back any record ids since only the count is needed.
for i in range(1910,2023,10):
    handle = Entrez.esearch(db="pubmed", term='1900:'+str(i)+'[dp]', retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # always release the connection, even if parsing the reply fails
        handle.close()
    decade_counts[i] = int(record["Count"])

print(decade_counts)

In [None]:
# Use a dedicated name for the per-decade table so that the yearly
# `papers_by_year` frame built earlier is not silently overwritten when this
# cell runs.
papers_by_decade = pd.DataFrame.from_dict(decade_counts,orient='index',columns=['counts'])

# Bar chart on a linear y-axis with no legend (there is only one series).
# For counts this large matplotlib typically labels the axis using a
# scientific-notation multiplier.
ax = papers_by_decade.plot(kind='bar',logy=False,legend=False)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
ax.set_title('PubMed Paper Content by Decade')

In [None]:
#how many Cadherin-7 disease papers?

from Bio import Medline

#search 1
search_term = "Cadherin-7 AND disease"

# A single esearch call against PubMed returns both the total number of hits
# ("Count") and the ids of the first `retmax` matching papers ("IdList"), so
# no separate global egquery request is needed to obtain the count.
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=100)
try:
    search_result = Entrez.read(handle)
finally:
    handle.close()

print('Using',search_term)
print(search_result["Count"]+' papers\n')

# we're going to use the biopython Medline parser to parse the records
# https://biopython.org/docs/1.75/api/Bio.Medline.html
# print out the paper titles
pubmed_ids = search_result['IdList']
if pubmed_ids:
    # fetch every paper in one efetch request (comma-separated ids) rather
    # than issuing one network request per paper
    handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
    try:
        for article in Medline.parse(handle):
            # a record without a title field would otherwise raise KeyError
            print(article.get('TI', '[no title available]'))
    finally:
        handle.close()

In [None]:
#search 2

# design some simple function to make this a bit more scalable - count the number of papers
def paperCount(search_term):
    """Print the number of PubMed records matching ``search_term``.

    The count is taken from the "Count" field of an Entrez esearch against the
    pubmed database; ``retmax=0`` is passed because no record ids are needed.

    Parameters
    ----------
    search_term : str
        Query string in PubMed search syntax.

    Returns
    -------
    None
        The result is printed rather than returned.
    """
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # always release the connection, even if parsing the reply fails
        handle.close()
    print('Using',search_term,'returned',record["Count"],'papers')

# design some simple function to make this a bit more scalable - fetch the papers and print the titles
def fetchPapers(search_term, retmax=100):
    """Print the titles of PubMed papers matching ``search_term``.

    Searches PubMed with Entrez esearch, downloads the matching records in
    MEDLINE text format with a single efetch request, and prints the title
    (``TI`` field) of each record using the biopython Medline parser
    (https://biopython.org/docs/1.75/api/Bio.Medline.html).

    Parameters
    ----------
    search_term : str
        Query string in PubMed search syntax.
    retmax : int, optional
        Maximum number of paper ids requested from esearch (default 100).

    Returns
    -------
    None
        Titles are printed rather than returned.
    """
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax)
    try:
        search_result = Entrez.read(handle)
    finally:
        handle.close()

    pubmed_ids = search_result['IdList']
    if not pubmed_ids:
        # nothing matched: skip efetch, which needs at least one id
        return

    # fetch every paper in one request (comma-separated ids) rather than
    # issuing one network request per paper
    handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="medline", retmode="text")
    try:
        for article in Medline.parse(handle):
            # a record without a title field would otherwise raise KeyError
            print(article.get('TI', '[no title available]'))
    finally:
        handle.close()

# Second search strategy: use the term CDH7 in place of Cadherin-7, reporting
# the hit count first and then the matching titles.
cdh7_query = 'CDH7 AND disease'
paperCount(cdh7_query)
fetchPapers(cdh7_query)

In [None]:
#single cell RNA sequencing
# Maps each year to the number of PubMed records published in that year
# ([dp]) that match the phrase "single cell RNA sequencing".
scrna_sequencing_counts = {}

# Query PubMed directly with esearch instead of the global egquery search:
# esearch reports the total number of hits in "Count", and retmax=0 asks the
# server not to send back any record ids since only the count is needed.
for i in range(2010,2022,1):
    handle = Entrez.esearch(db="pubmed", term=str(i)+'[dp] AND "single cell RNA sequencing"', retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # always release the connection, even if parsing the reply fails
        handle.close()
    scrna_sequencing_counts[i] = int(record["Count"])

# One-column table: the index holds the years and 'counts' holds the number
# of matching papers found for each year.
scrnaseq_papers_by_year = pd.DataFrame.from_dict(
    scrna_sequencing_counts,
    orient='index',
    columns=['counts'],
)

# Line chart of the yearly counts; the legend is hidden because there is
# only one series.
scrnaseq_papers_by_year.plot(
    kind='line',
    xlabel='year',
    ylabel='paper count',
    title='single cell RNA sequencing papers by year',
    legend=False,
)