In [7]:
import requests
import urllib.parse
import configparser

# Path to the project configuration file, relative to the notebook's working directory.
config_path = './input/config.ini'
## general config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail early with a clear message instead of an
# opaque KeyError on the section lookup below.
if not config.read(config_path):
    raise FileNotFoundError("Could not read configuration file: " + config_path)
# Project name; used to build the knowledge-graph API host name and output file names.
project_name = config['GENERAL-settings']['project_name']

def write_results_to_csv(prefix_filename, topic, number_of_results, label, field, operator, dict_entities):
    """Query the knowledge-graph API for top articles and write them to a file.

    One request is sent to the ``top_n_articles_for_label`` endpoint for all
    terms in ``dict_entities``. The result is written to
    ``<prefix_filename><topic>.csv`` with ``|`` as the column separator: one
    header line (the keys of the first result entry plus ``term_label``)
    followed by one line per returned entry.

    Parameters
    ----------
    prefix_filename : str
        Leading part of the output file name.
    topic : str
        Trailing part of the output file name (before ``.csv``).
    number_of_results : int
        Value sent as the ``count`` query parameter.
    label : str
        Value sent as the ``label`` query parameter (inserted as-is).
    field : str
        Value sent as the ``field`` query parameter (inserted as-is).
    operator : str
        Value sent as the ``operator`` query parameter. It is inserted as-is,
        so the caller must URL-quote it beforehand.
    dict_entities : dict
        Maps a display label to the term sent to the API. Each entry's
        ``"term"`` value is mapped back to its display label for the
        ``term_label`` column.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the response has no ``results`` key, an entry has no ``term`` key,
        or a returned term is not one of the values of ``dict_entities``.
    """
    # The API receives all terms as a single comma-separated, URL-quoted parameter.
    terms_url = urllib.parse.quote(",".join(dict_entities.values()))
    # Reverse lookup: term as returned by the API -> display label.
    dict_entities_inv = {v: k for k, v in dict_entities.items()}

    api_url = 'http://knowledge-graph-api-'+project_name+':5000/api/v1/top_n_articles_for_label?count='+str(number_of_results)+'&norm_by_age=true&label='+label+'&field='+field+'&operator='+operator+'&terms='+terms_url

    r = requests.get(api_url)
    literature_results = r.json()['results']

    # With at most one term the result is handled as a single group, so wrap it
    # to get the same "one group per term" structure as in the multi-term case.
    if len(dict_entities) <= 1:
        literature_results = [literature_results]

    # Take the column names from the first real result entry. A group without
    # results is expected to yield "message" instead of entry dicts (see the
    # check in the row loop below), so such groups are skipped here rather
    # than crashing on literature_results[0][0].
    header_keys = []
    for group in literature_results:
        first_entry = next(iter(group), None)
        if isinstance(first_entry, dict):
            header_keys = [str(key) for key in first_entry]
            break

    lines = ["|".join(header_keys + ["term_label"])]
    for i in range(len(dict_entities)):
        for entry in literature_results[i]:
            ## if there are no results, the entry "message: ... no results" appears
            if entry == "message":
                break
            # "|" is the column separator, so replace it inside values.
            cells = [str(entry[key]).replace("|", "/") for key in entry]
            cells.append(str(dict_entities_inv[entry["term"]]))
            lines.append("|".join(cells))

    # Open the file only once all rows are built, so a failure above does not
    # leave a truncated file behind; the context manager guarantees it is closed.
    with open(prefix_filename+topic+'.csv', 'w') as f:
        f.write("\n".join(lines))

In [8]:
# Fetch and show the statistics reported by the knowledge-graph API of this project.
statistics_url = 'http://knowledge-graph-api-' + project_name + ':5000/api/v1/statistics'
r = requests.get(statistics_url)
print(r.json())

{'results': [{'labels': {'Article': 67008, 'GO_BP': 11173, 'GO_CC': 1414, 'GO_MF': 3161, 'Keyword': 52, 'cellline': 117, 'chemical': 4114, 'disease': 4041, 'gene': 9096, 'pathway': 2047, 'pathway_biocarta': 251, 'pathway_kegg': 635, 'pathway_netpath': 35, 'pathway_pid': 210, 'pathway_reactome': 6301, 'pathway_wikipathways': 808, 'species': 1248}}]}


## Manual version (POC for ALS)

In [10]:
# Output file name is <prefix_filename><topic>.csv
prefix_filename = project_name + '_literature_'

# Value passed as the result count of the query
number_of_results = 2
topic = 'cirrhosis_auoimmune_liver_diseases'

# Display label -> term sent to the API
dict_diseases = {
    # Disabled entries from the ALS proof of concept:
    #   "ALS": "Disease:MESH:D000690",
    #   "TDP-43-linked ALS": "Disease:MESH:D057177",
    "cirrhosis": "Disease:MESH:D005355",
    "autoimmune liver disease": "Disease:MESH:D008107",
}

label = 'disease'
field = 'name'
# The operator is placed into the request URL unchanged, so it is URL-quoted here.
operator = urllib.parse.quote("=")

## process query and write output-file
write_results_to_csv(
    prefix_filename=prefix_filename,
    topic=topic,
    number_of_results=number_of_results,
    label=label,
    field=field,
    operator=operator,
    dict_entities=dict_diseases,
)

# Automatic version
For each label class (e.g. disease, gene, article, chemical, Keyword), retrieve the top entities. Then, for each of those top entities, retrieve the most relevant articles from the graph.

In [13]:
from datetime import datetime

## timing: start of the complete run
now_start = datetime.now()

prefix_filename = 'literature_'
# label classes to process (add "Keyword" if there are special Keywords)
label_classes = [
    "disease",
    "gene",
    "chemical",
    "pathway_kegg",
    "pathway_reactome",
    "GO_BP",
    "species",
]
# number of top entities requested per label class
top_count_entities = 30
# number of articles requested for each of those top entities
number_of_results = 5
# the field "name" identifies an entity
field = 'name'
# exact match "=" instead of "CONTAINS"
operator = urllib.parse.quote("=")

for label_class in label_classes:
    now_start_class = datetime.now()

    # 1) ask the API for the top entities of this label class
    api_top_entities_for_label = (
        "http://knowledge-graph-api-" + project_name
        + ":5000/api/v1/top_entities?count=" + str(top_count_entities)
        + "&label=" + label_class
    )
    r = requests.get(api_top_entities_for_label)
    label_results = r.json()['results']

    # 2) map each entity's 'label' to its 'name' (the name is used as the query term)
    dict_top_entities = {}
    for entry in label_results:
        dict_top_entities[entry['label']] = entry['name']

    # 3) fetch the articles for these entities and write one file per label class
    write_results_to_csv(
        prefix_filename=prefix_filename,
        topic=label_class,
        number_of_results=number_of_results,
        label=label_class,
        field=field,
        operator=operator,
        dict_entities=dict_top_entities,
    )

    now_end_class = datetime.now()
    print(
        "Class = " + label_class
        + " took " + str(now_end_class - now_start_class)
        + " (hours:minutes:seconds.milliseconds)"
    )

now_end = datetime.now()
print(
    "The API took " + str(now_end - now_start)
    + " (hours:minutes:seconds.milliseconds) in order to get " + str(number_of_results)
    + " for each of " + str(top_count_entities)
    + " entities for the classes: " + str(label_classes)
)

Class = disease took 0:00:02.523625 (hours:minutes:seconds.milliseconds)
Class = gene took 0:00:01.727235 (hours:minutes:seconds.milliseconds)
Class = chemical took 0:00:02.211099 (hours:minutes:seconds.milliseconds)
Class = pathway_kegg took 0:00:10.550480 (hours:minutes:seconds.milliseconds)
Class = pathway_reactome took 0:00:07.743417 (hours:minutes:seconds.milliseconds)
Class = GO_BP took 0:00:07.743251 (hours:minutes:seconds.milliseconds)
Class = species took 0:00:02.621825 (hours:minutes:seconds.milliseconds)
The API took 0:00:35.122058 (hours:minutes:seconds.milliseconds) in order to get 5 for each of 30 entities for the classes: ['disease', 'gene', 'chemical', 'pathway_kegg', 'pathway_reactome', 'GO_BP', 'species']
