# BASE

In [1]:
# To start using the BASE API, a request for access (unblocking of IP-address or IP-address range) 
# must first be sent to https://www.base-search.net/about/en/contact.php. More information can be 
# found at https://www.base-search.net/about/en/about_develop.php#chap02
# The API is described at https://www.base-search.net/about/download/base_interface.pdf

import html
import json
import pickle
import time

import requests

from lib.Publications import Publication

# Query string

In [27]:
# Multi keyword search
#
# `query` is inserted unchanged into the request URL by search_db(), so it
# has to be written in URL-ready form (the examples below use "+" where a
# space is meant).
#
# "eDNA+metabarcoding"
# "algebra+AND+(linear+OR+numerical)"
# "(operator +OR+ algorithms)+ AND+ (linear +OR+ numerical)"
# "linear+algebra+NOT+numerical"
# More examples can be found at https://www.base-search.net/about/download/base_interface.pdf

# Earlier query variants, kept for reference. Uncomment exactly one `query = ...` line.
#query = '("eDNA" OR "environmental DNA" OR "metabarcoding" OR "eRNA" OR "environmental RNA") AND ("biodiversity" OR "species richness" OR "monitoring" OR "biomonitoring") AND ("high throughput sequencing" OR "HTS" OR "throughput")'
#query = '(eDNA+OR+environmental+DNA+OR+metabarcoding+OR+eRNA+OR+environmental+RNA+)+AND+(biodiversity+OR+species+richness+OR+monitoring+OR+biomonitoring+)+AND+(high+throughput+sequencing+OR+HTS+OR+throughput)+dctype:11'
#query = "dctitle:eDNA+monitoring"
#query = "dctitle:eDNA+monitoring+dctype:*"
#query = "dctitle:eDNA+monitoring+dctype:+11*+14" 
#query = "dctitle:eDNA+monitoring+dctype:+14"

#query = "dctitle:eDNA+monitoring" #+dctype:+1*" 
# Active query: title search restricted with a dctype filter.
# NOTE(review): the files written further down are named "...Reports..." /
# "reports_BASE.p" - confirm that dctype:+1 is the document type intended here.
query = "dctitle:eDNA+monitoring+dctype:+1" 



# Percent-encoded example; decodes to: eDNA doctype:(17 1A)
#        eDNA+doctype%3A%2817+1A%29
#query = "eDNA+diatom+"


# Search BASE

In [28]:
def search_db(query, offset, hits=120):
    """Send one PerformSearch request to the BASE HTTP interface.

    Interface description:
    https://www.base-search.net/about/download/base_interface.pdf

    Parameters
    ----------
    query : str
        Search expression. It is placed in the URL unchanged, so it must
        already be URL-ready.
    offset : int
        Value of the ``offset`` URL parameter (paging position).
    hits : int, optional
        Value of the ``hits`` URL parameter, i.e. the number of records
        requested. Defaults to 120, the value this function always used.

    Returns
    -------
    requests.Response
        The HTTP response; the body is requested in JSON (``format=json``).

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the server does not answer in time.
    """
    # Build the URL once so the logged URL is exactly the one requested
    # (the log used to claim hits=100 while the request asked for 120).
    url = (
        "https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi"
        f"?func=PerformSearch&query={query}&hits={hits}&offset={offset}&format=json"
    )
    print(url)
    # Without a timeout a stalled connection would block the notebook forever.
    return requests.get(url, timeout=60)

In [29]:
# At most 120 records can be retrieved per request, so larger result sets
# are downloaded page by page. The maximum offset is 999; stepping by 120
# the last offset used is 960, so this loop downloads at most
# 9 * 120 = 1080 records.

PAGE_SIZE = 120    # must match the hits= value that search_db() requests
LAST_OFFSET = 960  # largest multiple of PAGE_SIZE that stays below 999

offset = 0
results = []
done = False

while not done:

    r = search_db(query, offset)
    # Stop with the HTTP status right away instead of failing later with a
    # less helpful JSON/KeyError when the server returned an error page.
    r.raise_for_status()

    results.append(r)
    print(f"Offset: {offset}")
    # Rate limiting: One request per second (1 QPS).
    time.sleep(1)

    # Check how many hits were found on this page.
    hits = len(r.json()['response']['docs'])
    print(f"Number of matches: {hits}")

    if hits != PAGE_SIZE:
        # A page that is not full is the last page.
        done = True
    elif offset < LAST_OFFSET:
        offset += PAGE_SIZE
    else:
        # Full page at the last reachable offset: there may be more records
        # that cannot be fetched, so say so instead of stopping silently.
        done = True
        print("Offset limit reached; the result set may be truncated.")

https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi?func=PerformSearch&query=dctitle:eDNA+monitoring+dctype:+1&hits=100&offset=0&format=json
Offset: 0
Number of matches: 75


In [25]:
# Number of publications
#
# The loop variable is deliberately not named `query`: that would replace
# the search string with a response object and break a re-run of the search.
n = 0
for response in results:
    docs = response.json()['response']['docs']
    n += len(docs)
    if docs:
        # Keep the last record available as `pub`; the exploration cell
        # further down prints fields of `pub`.
        pub = docs[-1]
print(f"Number of records: {n}")

Number of records: 2


# Explore the result

In [26]:
# Pick one record to look at: the last one that was downloaded. It is looked
# up here explicitly so that this cell does not depend on a loop variable
# left behind by another cell.
docs = [doc
        for response in results
        for doc in response.json()['response']['docs']]

if not docs:
    print("No records to explore")
else:
    pub = docs[-1]

    # Tip: print(pub.keys()) lists every field available for the record.

    # Printed in this order: URL, title, abstract, DOI, keywords.
    # A record may lack some of these fields (the DOI in particular), so a
    # missing field is skipped instead of raising a KeyError.
    for field in ('dcidentifier', 'dctitle', 'dcdescription', 'dcdoi', 'dcsubject'):
        if field in pub:
            print(pub[field])
    


['http://hdl.handle.net/10251/65124', 'https://doi.org/10.1371/journal.pone.0119311']
The Use of Genus-Specific Amplicon Pyrosequencing to Assess Phytophthora Species Diversity Using eDNA from Soil and Water in Northern Spain
[EN] Phytophthora is one of the most important and aggressive plant pathogenic genera in agriculture and forestry. Early detection and identification of its pathways of infection and spread are of high importance to minimize the threat they pose to natural ecosystems. eDNA was extracted from soil and water from forests and plantations in the north of Spain. Phytophthora-specific primers were adapted for use in high-throughput Sequencing (HTS). Primers were tested in a control reaction containing eight Phytophthora species and applied to water and soil eDNA samples from northern Spain. Different score coverage threshold values were tested for optimal Phytophthora species separation in a custom-curated database and in the control reaction. Clustering at 99% was the 

In [20]:
publications = []

# Writes one <h1>/<p> group per record to the HTML file and collects a
# Publication object per record in `publications`.
#
# - The text comes from a remote service, so it is HTML-escaped before it is
#   written; the Publication objects keep the raw, unescaped text.
# - The file is written as UTF-8 explicitly; the platform default encoding
#   may not be able to represent every character in titles and abstracts.
# - The outer loop variable is not named `query`, so the search string
#   defined earlier is not overwritten.
with open("original_abstracts_Reports_BASE.html", "w", encoding="utf-8") as file:
    for response in results:
        for pub in response.json()['response']['docs']:

            title_text = str(pub['dctitle'])
            title = "<h1>" + html.escape(title_text) + "</h1>" + "\n"

            # The first entry of 'dcdoi' is used. The field can be missing,
            # and an empty list is treated the same way.
            try:
                doi_text = str(pub['dcdoi'][0])
            except (KeyError, IndexError):
                doi_text = None

            if doi_text is None:
                doi = "<p>No DOI</p>"
            else:
                doi_html = html.escape(doi_text)
                doi = "<p><a href=\"https://doi.org/" + \
                        doi_html + \
                        "\">" + \
                        "doi:" + \
                        doi_html + \
                        "</a></p>" + \
                        "\n"

            # First make sure there is a description for this item,
            # then concatenate the list of descriptions if needed.
            description = pub.get('dcdescription')

            if description is None:
                description = ""
                abstract = "<p>No Abstract</p>"
            else:
                if isinstance(description, (list, tuple)):
                    description = " ".join(str(part) for part in description)
                description = str(description)
                abstract = "<p>" + \
                            html.escape(description) + \
                            "</p>" + \
                            "\n"

            # `doi` is only passed when the record has one.
            if doi_text is None:
                publications.append(Publication(title = title_text,
                                                abstract = description))
            else:
                publications.append(Publication(title = title_text,
                                                doi = doi_text,
                                                abstract = description))

            file.write(title + doi + abstract)

# Save results to binary file

In [21]:
# Save the result to a binary file, and analyse it together with data from other searches.
# The `with` block closes the file once the data is written, even if
# pickling fails part-way.
with open("reports_BASE.p", "wb") as pickle_file:
    pickle.dump(publications, pickle_file)