In [1]:
from Bio import Entrez
import pickle
from lib.Publications import Publication

# Always tell NCBI who you are
Entrez.email = input("Email: ")

Email:  mats.topel@marine.gu.se


# Custom functions

In [2]:
# Store the search results locally
record_cash = {}

# Download individual records
def get_record(pubmedID):
    # Check if the publication has already been downloaded
    try:
        return record_cash[pubmedID]
    except KeyError:
        handle = Entrez.efetch(db="pubmed", \
                               rettype="text", \
                               id=pubmedID)
        result = Entrez.read(handle)
        
        # Update the cashed results
        record_cash[pubmedID] = result
        return result

# Print progress
def progress(x = None, y = None):
    # Print progress
    prog = str(x) + "/" + str(y)
    print(prog, end="\r")

# Query string

In [3]:
# Some usefull search tags to use:
# https://pubmed.ncbi.nlm.nih.gov/help/#search-tags
#
# All Fields [all]
# Author [au]
# Journal [ta]
# PMID [pmid]
# Publication Type [pt]
# Text Words [tw]
# Title [ti]
# Title/Abstract [tiab]


# [Author]
# [Journal]
# [Keyword]
# [Title]
# [pt] (Pulication type)
# Publication types: https://pubmed.ncbi.nlm.nih.gov/help/#publication-types
#query = "reproducible AND eDNA"
#query = "eDNA AND metabarcoding"
query = "eDNA[tiab] AND metabarcoding"

# Search PubMed

In [4]:
# DevNote: Check the default retmax value
handle = Entrez.esearch(db="pubmed", \
                        term=query, \
                        retmax=100000)

record = Entrez.read(handle)

# Explore the result

In [6]:
for pubmedID in record["IdList"]:
    x = get_record(pubmedID)
    break
    
    
# Title
print(x["PubmedArticle"][0]["MedlineCitation"]["Article"]["ArticleTitle"])

# Keywords
keywords = []
for i in x["PubmedArticle"][0]["MedlineCitation"]["KeywordList"][0]:
    keywords.append(str(i))
print(keywords)

# Abstract
x["PubmedArticle"][0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]

# DOI
x["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"][0].title()

# eid

# pii

# Journal
print(x["PubmedArticle"][0]["MedlineCitation"]["Article"]["Journal"]["Title"])

# Date
y = x["PubmedArticle"][0]["MedlineCitation"]["Article"]["ArticleDate"][0]
print(y["Year"] + "-" + y["Month"] + "-" + y["Day"])

Effects of sampling strategies and DNA extraction methods on eDNA metabarcoding: A case study of estuarine fish diversity monitoring.
['DNA extraction', 'Estuarine ecosystem', 'Fish diversity', 'Sampling strategies', 'eDNA metabarcoding']
Zoological research


IndexError: list index out of range

In [8]:
# Display the number of results found for this search string
print(f"Number of records: {len(record['IdList'])}")

Number of records: 330


# Explore the result in Pandas

In [9]:
import pandas as pd

# Explore keywords

In [10]:
keywords = []
prog = 1

for pubmedID in record["IdList"]:
    x = get_record(pubmedID)
    try:
        for i in x["PubmedArticle"][0]["MedlineCitation"]["KeywordList"][0]:
            keywords.append(str(i))
    except IndexError:
        pass
    # Print progress
    progress(prog, len(record["IdList"]))
    prog += 1
    
# Count the keywords
count = {}
for i in keywords:
    i = str(i)
    try:
        count[i] += 1
    except:
        count[i] = 1

# Turn the dictionary of keywords into a Pandas dataframe
kw = pd.DataFrame.from_dict(count, orient='index')

# Display the 20 top keywords
kw.sort_values(0, ascending=False).head(20)

330/330

Unnamed: 0,0
metabarcoding,77
eDNA,77
environmental DNA,63
Metabarcoding,31
eDNA metabarcoding,27
biodiversity,23
biomonitoring,21
Environmental DNA,20
high-throughput sequencing,12
Biomonitoring,12


# Explore abstracts in html format

In [13]:
publications = []

prog = 1
with open("original_abstracts_PubMed.html", "w") as file:
    for pubmedID in record["IdList"]:
        
        progress(prog, len(record["IdList"]))
        
        paper = get_record(pubmedID)
        
        # Title
        title = "<h1>" + paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["ArticleTitle"] + "</h1>" + "\n"
        title_obj= paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["ArticleTitle"]
        
        # DOI
        try:  
            doi = "<p><a href=\"https://doi.org/" + \
                  paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"][0].title() + \
                  "\">" + \
                  "doi:" + \
                  paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"][0].title() + \
                  "</a></p>" + \
                  "\n"
            doi_obj = paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"][0].title()
        except:
            doi = "<p>No DOI</p>"
            doi_obj = None
        
        # Keywords
        try:
            keywords = []
            for i in paper["PubmedArticle"][0]["MedlineCitation"]["KeywordList"][0]:
                keywords.append(str(i))
            #keywords = paper["PubmedArticle"][0]["MedlineCitation"]["KeywordList"][0]
        except:
            keywords = "<p>No keywords</p>"
        
        # Abstract
        try:
            abstract = "<p>" + \
                       paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0] + \
                       "</p>" + \
                       "\n"
            abstract_obj = paper["PubmedArticle"][0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]
        except KeyError:
            abstract = "<p>No Abstract</p>"
            abstract_obj = None
            
        string = title + doi + str(keywords) + abstract

        publications.append(Publication(title = title_obj, doi = doi_obj, abstract = abstract_obj))

        prog += 1
        file.write(string)

330/330

# Save result to binary file

In [14]:
# Save the result to a binary file, and analyse it together with data from other searches.
pickle.dump(publications, open("pubmed_eDNA_metabarcoding.p" ,"wb"))