In [1]:
# Install the third-party packages for this notebook: mygene, neo4j and flask.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH; consider
# `%pip install ...` so the install targets the running kernel's environment,
# and pin the versions for reproducibility -- confirm before changing.
!pip install mygene neo4j flask

Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting neo4j
  Downloading neo4j-4.3.4.tar.gz (75 kB)
[K     |████████████████████████████████| 75 kB 1.6 MB/s eta 0:00:01
[?25hCollecting flask
  Downloading Flask-2.0.1-py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.5 MB/s  eta 0:00:01
[?25hCollecting Jinja2>=3.0
  Downloading Jinja2-3.0.1-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 2.5 MB/s eta 0:00:01
[?25hCollecting itsdangerous>=2.0
  Downloading itsdangerous-2.0.1-py3-none-any.whl (18 kB)
Collecting Werkzeug>=2.0
  Downloading Werkzeug-2.0.1-py3-none-any.whl (288 kB)
[K     |████████████████████████████████| 288 kB 6.5 MB/s eta 0:00:01
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (30 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl 

In [2]:
import mygene
mg = mygene.MyGeneInfo()

import configparser
import csv
import filecmp
import functools
import glob
import os
import shutil
import sys
import time
from io import StringIO
from xml.etree import ElementTree

import pandas as pd
import requests


In [3]:
## request something followed by a delay (pubmed allows 3 requests per second)
def request_with_delay(url, api_delay = 0.0, my_timeout = 8.0):
    """Send a GET request and pause afterwards (simple API rate limiting).

    Parameters
    ----------
    url
        Address handed to ``requests.get``.
    api_delay
        Seconds to sleep after a successful request (default ``0.0``).
    my_timeout
        Timeout in seconds handed to ``requests.get`` (default ``8.0``).

    Returns
    -------
    The response returned by ``requests.get``, or ``None`` when the request
    timed out or the connection failed.
    """
    try:
        reply = requests.get(url, timeout=my_timeout)
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
        # Best effort: the caller gets None instead of an exception.
        return None
    # The pause only follows a request that actually went through.
    time.sleep(api_delay)
    return reply

In [4]:
from graphapi.neo4j_api_helper import Neo4j_Manager

## location of the project configuration, relative to the working directory
config_path = './input/config.ini'

## general config
config = configparser.ConfigParser()
## ConfigParser.read() silently skips files it cannot open and returns the list
## of files it parsed; fail here with a clear message instead of an opaque
## KeyError on the first section lookup below
if not config.read(config_path):
    raise FileNotFoundError("could not read config file: "+config_path)

project_name = config['GENERAL-settings']['project_name']
## NOTE(review): db_hostname is read but not used in the cells visible here
db_hostname = config['NEO4J-settings']['neo4j_hostname']
neo4j_bolt = config['NEO4J-settings']['neo4j_bolt']
neo4j_user = config['NEO4J-settings']['neo4j_user']
neo4j_password = config['NEO4J-settings']['neo4j_password']

## the first argument is the bolt setting with "-<project_name>" appended
neo4j_manager = Neo4j_Manager(neo4j_bolt+"-"+project_name, neo4j_user, neo4j_password)

In [5]:
## Fetch the "name" of the gene nodes selected by where_exists_field(..., negate=True)
## (presumably the genes that do not carry an "entrezgene" field yet -- confirm
## against Neo4j_Manager).
## Alternative kept for reference -- fetch every gene node instead:
##   neo4j_manager.get_all_nodes_for_label(concept_label="gene", return_field="name")
gene_list = neo4j_manager.where_exists_field(
    "gene",
    "entrezgene",
    "name",
    negate=True,
)
print(len(gene_list))

514


In [7]:
## Split the gene names into well-formed ones ("Gene:" followed by digits only)
## and malformed ones; only the well-formed names are kept for the lookup.
gene_prefix = "Gene:"
sum_normalized = 0
sum_malformated = 0
correct_gene_names = []
for gene_name in gene_list:
    ## Validate everything after the leading prefix. Slicing (instead of
    ## split("Gene:")[1]) also rejects names such as "Gene:12Gene:x", where the
    ## split would only look at "12"; isascii()+isdigit() rejects non-ASCII
    ## numerals such as "½" that isnumeric() would accept.
    gene_number = gene_name[len(gene_prefix):]
    if gene_name.startswith(gene_prefix) and gene_number.isascii() and gene_number.isdigit():
        sum_normalized += 1
        correct_gene_names.append(gene_name)
    else:
        sum_malformated += 1
print("There are "+str(sum_normalized)+" normalized genes and "+ str(sum_malformated)+" malformated gene names")

## Sanity check: look for special cases where the "label" field of a gene node
## has some content (the regex ".*" is matched against that field).
label_query = {
    "concept_label": "gene",
    "concept_field": "label",
    "search_operator": "=~",
    "term": ".*",
    "return_field": "name",
}
gene_list_checkup = neo4j_manager.search_term_in_label(**label_query)
print(len(gene_list_checkup))
## Original remark: 2 genes seemed to have something in their label field
## despite not being correctly normalized.
## NOTE(review): the count printed by this cell covers every match of the query
## above, not only the malformed names -- re-check that remark.

There are 72 normalized genes and 442 malformated gene names
8655


In [8]:
## Turn the gene names into Entrez ids: keep the piece between the first and
## the second ":" of every name, then show the count and the first ten ids.
entrez_gene_ids = [
    prefixed_name.split(":")[1]
    for prefixed_name in correct_gene_names
]
print(len(entrez_gene_ids))
print(entrez_gene_ids[:10])

72
['8248', '13906541', '2716540', '9720867', '7011608', '13909623', '100508689', '103694877', '100131131', '609020']


In [9]:
## start the query (per the original note, batches of 1000 are processed)
## fields requested for every Entrez id; fields="all" is the catch-all alternative
mygene_fields = ",".join([
    "symbol", "name", "alias", "entrezgene", "refseq.rna", "ensembl.gene", "taxid",
    "pathway.kegg", "pathway.reactome", "pathway.biocarta", "pathway.netpath",
    "pathway.wikipathways", "pathway.pid", "go", "type_of_gene", "summary",
])
ginfo = mg.querymany(entrez_gene_ids, scopes='entrezgene', fields=mygene_fields)

csv_header = "query;_id;alias;_score;ensembl;entrezgene;name;refseq;symbol;taxid;pathway;go;type_of_gene;summary"
csv_columns = csv_header.split(";")
gene_list_file_name = "genelist.csv"

## Write header and rows through one csv.writer on a single file handle:
##  * values that contain ";" , quotes or line breaks are quoted
##    instead of silently shifting or splitting columns,
##  * rows no longer end with a trailing ";" (one field more than the header),
##  * the file is opened once instead of once per row.
i = 0  ## number of rows written
with open(gene_list_file_name, "w", newline="") as outfile:
    writer = csv.writer(outfile, delimiter=";", lineterminator="\n")
    writer.writerow(csv_columns)
    for g in ginfo:
        ## columns missing from a hit are exported as "NA"
        writer.writerow([str(g[column]) if column in g else "NA" for column in csv_columns])
        i += 1

querying 1-72...done.
Finished.
72 input query terms found no hit:
	['8248', '13906541', '2716540', '9720867', '7011608', '13909623', '100508689', '103694877', '1001311
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [10]:
## get all information for one gene by using the mygene-package
## we are adding more entities: pathways, go-terms

def get_ensembl_genelist(entry):
    """Collect the values stored under the ``'gene'`` key of an ensembl entry.

    Parameters
    ----------
    entry : dict or list of dict
        A single mapping or a list of mappings, each expected to contain a
        ``'gene'`` key. A single dict is treated as a one-element list.

    Returns
    -------
    list
        The ``'gene'`` values in input order (empty for an empty list).

    Raises
    ------
    TypeError
        If ``entry`` is neither a dict nor a list, or if a list element is not
        a dict.
    ValueError
        If an element has no ``'gene'`` key.
    """
    ## if dict create list of dict -> [dict]
    if isinstance(entry, dict):
        entry_list = [entry]
    elif isinstance(entry, list):
        entry_list = entry
    else:
        raise TypeError("wrong data type: expected: list or dict, but got: "+str(type(entry)))

    ensembl_genelist = []
    for entry_item in entry_list:
        if not isinstance(entry_item, dict):
            ## the original message referenced an undefined name here, which
            ## turned this error into a NameError
            raise TypeError("wrong data type: expected: dict, but got: "+str(type(entry_item)))
        if 'gene' not in entry_item:
            raise ValueError("no gene key found in ensembl entry "+str(entry_item))
        ensembl_genelist.append(entry_item['gene'])

    return ensembl_genelist

def add_quotes(str_in):
    """Return ``str_in`` escaped by ``replace_quotes`` and wrapped in single quotes."""
    escaped = replace_quotes(str_in)
    return "'{0}'".format(escaped)

def replace_quotes(str_in):
    """Return ``str(str_in)`` with backslashes turned into "/" and every
    single quote prefixed with a backslash."""
    # Both substitutions act on distinct source characters and the first one
    # never produces a quote, so a single translate pass gives the same result
    # as replacing one after the other.
    substitutions = {ord("\\"): "/", ord("'"): "\\'"}
    return str(str_in).translate(substitutions)

def _as_str_list(value):
    """Return ``value`` as a list of strings; a scalar becomes a one-element list."""
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


## fields copied one-to-one from the hit onto the gene node
simple_fields = ["taxid", "symbol", "type_of_gene", "summary", "entrezgene"]
## categories accepted for GO terms and for pathways
go_categories = ["BP", "MF", "CC"]
pathway_categories = ["kegg", "reactome", "wikipathways", "netpath", "pid", "biocarta"]

index_ginfo = 0
for gene_query in ginfo:
    ## progress output every 50 hits
    if (index_ginfo % 50 == 0):
        print("index = "+str(index_ginfo))
    index_ginfo = index_ginfo + 1

    ## MyGene marks ids without a hit with 'notfound': True; such entries have
    ## none of the annotation fields handled below, so skip the database lookup
    if gene_query.get('notfound'):
        continue

    entity_name = "Gene:"+str(gene_query['query'])
    ## NOTE(review): takes the first match and assumes the node exists
    gene_id = neo4j_manager.search_id_in_label("gene", "name", "=", entity_name)[0]

    if 'ensembl' in gene_query:
        ensembl_str = ",".join(get_ensembl_genelist(gene_query['ensembl']))
        neo4j_manager.set_tag_node_attribute(gene_id, "ensembl_ids", add_quotes(ensembl_str))

    if 'alias' in gene_query:
        ## a single alias may arrive as a plain string; joining it directly
        ## would put a comma between every character
        alias_str = ",".join(_as_str_list(gene_query['alias']))
        neo4j_manager.set_tag_node_attribute(gene_id, "alias", add_quotes(alias_str))

    if 'refseq' in gene_query:
        for refseq_attribute, refseq_entries in gene_query['refseq'].items():
            refseq_attribute_name = 'refseq_'+refseq_attribute
            ## decide by type, not by len(): len() > 1 is also true for any
            ## string, and a one-element list was written as its repr
            refseq_attribute_value = add_quotes(",".join(_as_str_list(refseq_entries)))
            neo4j_manager.set_tag_node_attribute(gene_id, refseq_attribute_name, refseq_attribute_value)

    if 'go' in gene_query:
        for go_category, go_category_list in gene_query['go'].items():
            ## a single term arrives as a dict instead of a list of dicts
            if (isinstance(go_category_list, dict)):
                go_category_list = [go_category_list]

            for go_term in go_category_list:
                if go_category not in go_categories:
                    raise Exception("wrong go category "+go_category)
                object_label = "GO_"+go_category
                go_rel = "-[:"+object_label+"_contains_gene]->"
                ## NOTE(review): a term without 'evidence', 'qualifier' or
                ## 'term' raises KeyError here
                object_attributes = ['evidence','gocategory','qualifier','term']
                object_values = [replace_quotes(go_term['evidence']),go_category,replace_quotes(go_term['qualifier']),replace_quotes(go_term['term'])]
                neo4j_manager.create_object_for_entity(entity_label="gene", entity_name=entity_name, object_label=object_label, object_name=go_term['id'], object_attributes=object_attributes, object_values=object_values, relationship_str = go_rel)

    if 'pathway' in gene_query:
        for pathway_category, pathway_category_list in gene_query['pathway'].items():
            ## a single pathway arrives as a dict instead of a list of dicts
            if (isinstance(pathway_category_list, dict)):
                pathway_category_list = [pathway_category_list]

            for pathway_term in pathway_category_list:
                if pathway_category not in pathway_categories:
                    raise Exception("wrong pathway category "+pathway_category)
                object_label = "pathway_"+str(pathway_category)
                pathway_rel = "-[:"+pathway_category+"_contains_gene]->"
                object_attributes = ['id','label']
                object_values = [replace_quotes(pathway_term['id']),replace_quotes(pathway_term['name'])]
                neo4j_manager.create_object_for_entity(entity_label="gene", entity_name=entity_name, object_label=object_label, object_name=pathway_term['id'], object_attributes=object_attributes, object_values=object_values, relationship_str = pathway_rel)

    for simple_field in simple_fields:
        if simple_field in gene_query:
            neo4j_manager.set_tag_node_attribute(gene_id, simple_field, add_quotes(gene_query[simple_field]))

index = 0
index = 50
