In [1]:
import re
import sys
from py2neo import Graph,Node,Relationship,NodeMatcher
import numpy as np
from scipy import spatial
import gensim
import jieba
import numpy as np
from scipy.linalg import norm
from fuzzywuzzy import process
import pandas as pd

In [2]:
def get_Answer(string):
    """Run a Cypher query against the Neo4j graph and return the result table.

    Args:
        string: Cypher query text, passed unchanged to ``Graph.run``.

    Returns:
        Whatever ``to_data_frame()`` yields for the query result.

    Connection settings are read from the environment variables
    ``NEO4J_HOST``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``. When a variable is
    not set, the value that used to be hard-coded here is used, so existing
    setups keep working.
    """
    import os  # function-scope import: this is the only place the file needs os

    # SECURITY(review): the fallback password below is still stored in the
    # notebook. Export NEO4J_PASSWORD and remove the literal once every
    # environment provides it.
    host = os.environ.get("NEO4J_HOST", "10.88.3.81")
    user = os.environ.get("NEO4J_USER", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "liubaichuan")

    graph = Graph(host=host, auth=(user, password))
    return graph.run(string).to_data_frame()

In [4]:
# Word2vec model and vocabulary used by the sentence-similarity helpers below.
# NOTE(review): absolute, machine-specific path -- loading fails on any other
# machine; consider moving it to a configuration cell.
model_file = '/Users/dong/Desktop/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)
# gensim >= 4.0 renamed ``index2word`` to ``index_to_key``; prefer the new
# attribute and fall back to the old one so both major versions work.
index2word_set = set(getattr(model, 'index_to_key', None) or model.index2word)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Average the vectors of the known words of a sentence.

    Args:
        sentence: text that is split on whitespace into words.
        model: mapping-like object; ``model[word]`` must give the word vector.
        num_features: length of the returned vector.
        index2word_set: container of the words that may be looked up in
            ``model``; every other word is ignored.

    Returns:
        A vector of length ``num_features`` holding the mean of the vectors of
        all known words, or all zeros when the sentence has no known word.
    """
    total = np.zeros((num_features, ), dtype='float32')
    known_words = [token for token in sentence.split() if token in index2word_set]
    for token in known_words:
        total = np.add(total, model[token])
    if known_words:
        total = np.divide(total, len(known_words))
    return total

In [5]:
def qPattern(a):
    """Load every question template from a text file.

    Args:
        a: path of the template file, e.g. "question_pattern.txt".

    Returns:
        A list with one entry per line of the file, each with its newline
        characters stripped from both ends.
    """
    with open(a, 'r') as handle:
        return [row.strip('\n') for row in handle]

In [6]:
def realQ(Question):
    """Replace identifier tokens in a question with generic placeholders.

    Identifiers that the similarity model cannot recognise are swapped for the
    name of their entity type, which makes the question type easier to detect.
    Entity formats seen in the graph:

        Protein:  Q03073
        Gene:     AET4Gv20696400
        GO:       1902494
        Sequence: UPI00013ED01D

    Each whitespace-separated token is inspected after surrounding quote and
    bracket characters are removed. Purely alphabetic tokens and tokens that
    are not alphanumeric are kept as they are; the others become 'GO id' (all
    digits), 'Gene id' (known gene prefix), 'Sequence id' ('UPI' prefix) or
    'Protein id' (anything else).

    Args:
        Question: the raw question text.

    Returns:
        The rewritten question, with braces and parentheses removed from both
        ends of the whole string.
    """
    gene_prefixes = ('AET','AMTR','g','Al','fgenesh1','fgenesh2','scaffold','AT','BVRB','BRADI','GSBRNA2T','Bo','Bra','CHLRE','CHC','CCACVL','Csa','CM','DCAR','Dr','Gasu','GLYMA','B456','HannXRQ','HORVU','LPERR','TanjilG','MANES','MTR','GSMUA','A4A49','OBART','OB','ORGLA','OGLUM','KN','AMD','OMERI','ONIVA','OPUNC','ORUFI','BGIOSGA','Os','OSTLU','PHAVU','Pp','POPTR','PRUPE','SELMODRAFT','SETIT','Solyc','PGS','SORBI')
    rewritten = []
    for token in Question.split():
        # Classification looks at the token without its decoration.
        core = str(token).lstrip('[\'').rstrip('\']').strip('{').strip('}').strip('(').strip(')')
        if core.isalpha() or not core.isalnum():
            rewritten.append(token)
        elif core.isdigit():
            rewritten.append('GO id')
        elif core.startswith(gene_prefixes):
            rewritten.append('Gene id')
        elif core.startswith('UPI'):
            rewritten.append('Sequence id')
        else:
            rewritten.append('Protein id')
    return ' '.join(rewritten).strip('{').strip('}').strip('(').strip(')')

In [7]:
def max_sim(Question,qpattern_dir):
    """Return the label of the question template most similar to a question.

    Every line of the template file is expected to look like
    ``[label] template text?``. The text between the first ``]`` and the next
    ``?`` is compared with ``Question`` using the cosine similarity of their
    averaged word vectors; the ``[label]`` of the best line is returned.

    Example questions:
        {AET4Gv20696400} comes from how many (Species)?
        what (genes) do {Aegilops tauschii} has?
        Gene Os01g0740400 is transcribed into what sequence?
        sequence UPI00004C2817 is transcribed from which gene?

    Typical use:
        max_sim(realQ(Q), "question_pattern.txt")

    Args:
        Question: the (preprocessed) question text.
        qpattern_dir: path of the template file read by ``qPattern``.

    Returns:
        The label of the best matching template, or '' when that line carries
        no ``[label]``.

    Raises:
        ValueError: if the template file contains no lines.
    """
    templates = qPattern(qpattern_dir)
    if not templates:
        raise ValueError('no question patterns found in ' + str(qpattern_dir))

    # Compiled once instead of once per template line.
    body_re = re.compile(r'[\]](.*?)[?]')
    label_re = re.compile(r'[\[](.*?)[\]]')

    question_vec = avg_feature_vector(Question, model=model, num_features=300, index2word_set=index2word_set)
    scores = []
    for line in templates:
        # Use the captured text directly; going through str(list) would add
        # quote characters and escapes to the template text.
        found = body_re.search(line)
        template_text = found.group(1) if found else ''
        template_vec = avg_feature_vector(template_text, model=model, num_features=300, index2word_set=index2word_set)
        if np.any(question_vec) and np.any(template_vec):
            score = 1 - spatial.distance.cosine(question_vec, template_vec)
        else:
            # The cosine of an all-zero vector is undefined (0/0); rank such
            # templates last instead of letting an undefined value reach max().
            score = float('-inf')
        scores.append(score)

    best_line = templates[scores.index(max(scores))]
    label = label_re.search(best_line)
    return label.group(1) if label else ''

In [8]:
# Fuzzy lookup, meant for inputs in which some letters are missing.
def fuzzyfinder1(user_input, collection):
    """Find the first entry (in sorted order) that contains the input letters.

    An entry matches when all characters of ``user_input`` occur in it in the
    same order, with any number of other characters in between. Matching is
    case-sensitive.

    Args:
        user_input: the possibly incomplete text typed by the user.
        collection: iterable of candidate strings.

    Returns:
        The smallest matching candidate in sort order, or None when nothing
        matches, so the caller can fall back to another matcher.
    """
    # Escape every character so that input such as '(' or '+' is matched
    # literally instead of being interpreted as regex syntax.
    pattern = '.*?'.join(re.escape(char) for char in user_input)
    regex = re.compile(pattern)
    return min((item for item in collection if regex.search(item)), default=None)
    
#要是上面那个返回的是空就用下面这个，适用于拼写字母顺序个别出错
from fuzzywuzzy import process
def fuzzyfinder2(user_input, collection):
    """Return the candidate that fuzzywuzzy rates closest to the input.

    Args:
        user_input: the text typed by the user.
        collection: the candidates handed to ``process.extractOne``.

    Returns:
        The first element of the result of ``process.extractOne``.
    """
    return process.extractOne(user_input, collection)[0]

In [9]:

class entity_extract():
    """Extract the entities (ids and species names) mentioned in a question.

    Recognition is deliberately loose. Entity formats seen in the graph:

        Protein:  Q03073
        Species:  Aegilops tauschii
        Gene:     AET4Gv20696400
        GO:       1902494   (stored in the graph as 'GO:1902494')
        Sequence: UPI00013ED01D

    Example question:
        {UPI4Gv20696400} {comes from} how many (Species)?
    """

    # An alphanumeric token starting with one of these prefixes is a gene id.
    _GENE_PREFIXES = ('AET','AMTR','g','Al','fgenesh1','fgenesh2','scaffold','AT','BVRB','BRADI','GSBRNA2T','Bo','Bra','CHLRE','CHC','CCACVL','Csa','CM','DCAR','Dr','Gasu','GLYMA','B456','HannXRQ','HORVU','LPERR','TanjilG','MANES','MTR','GSMUA','A4A49','OBART','OB','ORGLA','OGLUM','KN','AMD','OMERI','ONIVA','OPUNC','ORUFI','BGIOSGA','Os','OSTLU','PHAVU','Pp','POPTR','PRUPE','SELMODRAFT','SETIT','Solyc','PGS','SORBI')

    # Species names that can be recognised, in lookup order.
    _SPECIES = ["Aegilops tauschii","Arabidopsis halleri","Arabidopsis lyrata","Amborella trichopoda","Beta vulgaris","Brassica rapa","Chondrus crispus","Corchorus capsularis","Cyanidioschyzon","Daucus carota","Dioscorea rotundata","Galdieria sulphuraria","Gossypium raimondii","Hordeum vulgare","Oryza brachyantha","Oryza glaberrima","Oryza glumipatula","Arabidopsis thaliana","Brachypodium distachyon","Brassica napus","Brassica oleracea","Chlamydomonas reinhardtii","Cucumis sativus","Glycine max","Helianthus annuus","Leersia perrieri","Lupinus angustifolius","Manihot esculenta","Medicago truncatula","Musa acuminata","Nicotiana attenuata","Oryza barthii","Oryza longistaminata","Oryza rufipogon","Oryza sativa Indica Group","Phaseolus vulgaris","Populus trichocarpa","Oryza meridionalis","Oryza nivara","Oryza punctata","Oryza sativa Japonica Group","Ostreococcus lucimarinus","Physcomitrella patens","Selaginella moellendorffii","Setaria italica","Solanum tuberosum","Theobroma cacao","Vigna angularis","Vigna radiata","Zea mays","Prunus persica","Solanum lycopersicum","Sorghum bicolor","Trifolium pratense","Triticum aestivum","Triticum dicoccoides","Triticum urartu","Vitis vinifera"]

    def __init__(self,question):
        self.question=question

    @staticmethod
    def _clean(token):
        """Remove the quote/bracket/question-mark decoration around a token."""
        # '\\?' is the same character set (backslash and '?') that the former
        # invalid escape '\?' produced, written without the warning.
        return str(token).lstrip('[\'').rstrip('\']').strip('{').strip('}').strip('(').strip(')').rstrip('\\?')

    @classmethod
    def _classify(cls, core):
        """Return the entity kind of a cleaned token, or None if it is no id."""
        if core.isalpha() or not core.isalnum():
            return None
        if core.isdigit():
            return 'GO id'
        if core.startswith(cls._GENE_PREFIXES):
            return 'Gene id'
        if core.startswith('UPI'):
            return 'Sequence id'
        return 'Protein id'

    def get_id(self):
        """Collect the ids mentioned in the question.

        Returns:
            A dict with any of the keys 'GO id', 'Gene id', 'Sequence id' and
            'Protein id'. Values are the cleaned tokens (no braces, brackets
            or trailing '?'), so they can be used directly as graph ids. When
            a kind occurs more than once, the last occurrence wins.
        """
        found = dict()
        for token in self.question.split():
            core = self._clean(token)
            kind = self._classify(core)
            if kind is not None:
                found[kind] = core
        return found

    def get_species(self):
        """Return the species named in the question.

        Every known species name is tried as an exact substring first; only
        when none is present is the closest name chosen by ``fuzzyfinder2``.
        """
        for name in self._SPECIES:
            if name in self.question:
                return name
        return fuzzyfinder2(self.question, self._SPECIES)

    def get_goids(self):
        """Return all GO ids (all-digit tokens, cleaned) in question order."""
        goids = []
        for token in self.question.split():
            core = self._clean(token)
            if self._classify(core) == 'GO id':
                goids.append(core)
        return goids
#m=entity_extract('what genes do molecular function includes in Vitis vinifera ?')
#print(m.get_species())
#print('Vitis vinifera' in 'what genes do molecular function includes in Vitis vinifera ?')

In [10]:
# Key under which entity_extract.get_id() reports each kind of id.
_ID_KEYS = {'gene': 'Gene id', 'protein': 'Protein id', 'sequence': 'Sequence id', 'go': 'GO id'}


def _answer_spec(needs, cypher, found, empty=None, yes_no=False):
    """Describe one question type: entities, query and reply texts."""
    return {
        'needs': needs,
        'cypher': cypher,
        'found': found,
        'empty': found if empty is None else empty,
        'yes_no': yes_no,
    }


# One entry per question type. 'cypher', 'found' and 'empty' are str.format
# templates; available fields are {species}, {gene}, {protein}, {sequence},
# {go}, {go1}, {go2}, {num} and (reply texts only) {count}.
_ANSWER_SPECS = {
    # ---- Species / Gene ---------------------------------------------------
    # [have1] What genes does the species have?
    'have1': _answer_spec(
        ('species',),
        'match(na:Species{{name:"{species}"}})-[:have]->(nb:Gene) return nb.id limit {num}',
        'The species {species} have genes as follows:'),
    # [have2] What species do gene id exist in?
    'have2': _answer_spec(
        ('gene',),
        'match(na:Species)-[:have]->(nb:Gene{{id:"{gene}"}}) return na.name limit {num}',
        'The gene id: {gene} exist in these species:'),
    # Does this gene exist in this species ?
    'have3': _answer_spec(
        ('gene', 'species'),
        'match (na:Species{{name:"{species}"}}),(nb:Gene{{id:"{gene}"}}),p=(na)-[:have]-(nb) return count(p)',
        'Yes,the gene {gene} exist in the species {species}',
        'NO', yes_no=True),
    # How many genes exist in this species ?
    'have4': _answer_spec(
        ('species',),
        'MATCH (na:Species{{name:"{species}"}})-[:have]->(nb:Gene) return nb.id ',
        'It has {count} genes existing in the species {species} ,as follows:',
        'NONE'),

    # ---- Gene / Sequence / Protein ----------------------------------------
    # The relationship stays untyped in the next five queries, as before:
    # the relationship type name is not confirmed anywhere in this file.
    # [be_transcribed_into1] Gene id is transcribed into what sequence?
    'be_transcribed_into1': _answer_spec(
        ('gene',),
        'match(na:Gene{{id:"{gene}"}})-[be_transcribed_into]->(nb:Sequence) return nb.id limit {num}',
        'Gene id: {gene} is transcribed into the sequence: ',
        'Gene id: {gene} is transcribed into none sequence.'),
    # [be_transcribed_into2] Gene id is transcribed into what protein?
    'be_transcribed_into2': _answer_spec(
        ('gene',),
        'match(na:Gene{{id:"{gene}"}})-[be_transcribed_into]->(nb:Protein) return nb.id,nb.name limit {num}',
        'Gene id: {gene} is transcribed into the protein: ',
        'Gene id: {gene} is transcribed into none protein.'),
    # [be_transcribed_into3] protein id is transcribed from which gene?
    'be_transcribed_into32': _answer_spec(
        ('protein',),
        'match(na:Gene)-[be_transcribed_into]->(nb:Protein{{id:"{protein}"}}) return na.id limit {num}',
        'The protein: {protein} is transcribed from the genes as follows: ',
        'There are no genes transcribing into the protein: {protein}'),
    # [be_transcribed_into3] sequence id is transcribed from which gene?
    'be_transcribed_into31': _answer_spec(
        ('sequence',),
        'match(na:Gene)-[be_transcribed_into]->(nb:Sequence{{id:"{sequence}"}}) return na.id limit {num}',
        'The sequence: {sequence} is transcribed from the genes as follows: ',
        'There are no genes transcribing into the sequence: {sequence}'),
    # [belong_to] Sequence id belongs to which protein?
    'belong_to': _answer_spec(
        ('sequence',),
        'match(na:Sequence{{id:"{sequence}"}})-[belong_to]->(nb:Protein) return nb.id limit {num}',
        'Sequence id {sequence} belongs to the protein: '),

    # ---- GO -> GO, by relationship ----------------------------------------
    # [is_a] what is the GO that has relationship "is a " with GO id?
    'is_a': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:is_a]->(nb:GO) return nb.id limit {num}',
        'The GO which has relationship "is a " with GO id: {go} is: '),
    # [negatively_regulates] ... "negatively regulates " with GO id?
    'negatively_regulates': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:negatively_regulates]->(nb:GO) return nb.id limit {num}',
        'The GO which has relationship "negatively regulates " with GO id: {go} is: '),
    # [positively_regulates] ... "positively regulates " with GO id?
    'positively_regulates': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:positively_regulates]->(nb:GO) return nb.id limit {num}',
        'The GO which has relationship "positively regulates " with GO id: {go} is: '),
    # [regulates] ... "regulates " with GO id?
    'regulates': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:regulates]->(nb:GO) return nb.id limit {num}',
        'The GO which has relationship " regulates " with GO id: {go} is: '),
    # [part_of] ... "part of" with GO id?
    'part_of': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:part_of]->(nb:GO) return nb.id limit {num}',
        'The GO which has relationship " part of " with GO id: {go} is: '),

    # ---- Genes of a species, by annotation kind ---------------------------
    # [molecular_function] what genes do molecular function includes in Species?
    'molecular_function': _answer_spec(
        ('species',),
        'match(na:Gene{{species:"{species}"}})-[:molecular_function]->(nb:GO) return na.id,na.species limit {num}',
        'The genes which do molecular function includes in Species {species} are as follows: '),
    # [biological_process] what genes do biological process includes?
    'biological_process': _answer_spec(
        ('species',),
        'match(na:Gene{{species:"{species}"}})-[:biological_process]->(nb:GO) return na.id,na.species limit {num}',
        'The genes which do biological process includes in Species {species} are as follows: '),
    # [cellular_component] what genes do cellular component includes?
    'cellular_component': _answer_spec(
        ('species',),
        'match(na:Gene{{species:"{species}"}})-[:cellular_component]->(nb:GO) return na.id,na.species limit {num}',
        'The genes which do cellular component includes in Species {species} are as follows: '),
    # [eco] what genes is an evidence ontology in Species?
    'eco': _answer_spec(
        ('species',),
        'match(na:Gene{{species:"{species}"}})-[:eco]->(nb:GO) return na.id,na.species limit {num}',
        'The genes which is an evidence ontology in Species {species} are as follows: '),

    # ---- Gene -> GO: evidence and conclusion ontology ---------------------
    # [eco1] What genes are involved in the evidence and conclusion ontology of the GO?
    'eco1': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:eco]->(nb:GO{{id:"GO:{go}"}}) return na.id limit {num}',
        'The genes involved in the evidence and conclusion ontology of the GO: {go} are as follows:',
        'There are no genes involved in the evidence and conclusion ontology of the GO: {go}'),
    # [eco2] How many genes are involved in the evidence and conclusion ontology of the GO?
    'eco2': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:eco]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'It has {count} genes which are involved in the evidence and conclusion ontology of gene ontology GO: {go} ,as follows:',
        'NONE'),
    # [eco3] Does this gene participate in the evidence and conclusion ontology of the GO?
    'eco3': _answer_spec(
        ('gene', 'go'),
        'MATCH (na:Gene{{id:"{gene}"}}),(nb:GO{{id:"GO:{go}"}}),p=(na)-[:eco]-(nb) return count(p)',
        'Yes,it  participate in the evidence and conclusion ontology of gene ontology GO: {go}',
        'NO', yes_no=True),

    # ---- Gene -> GO: biological process -----------------------------------
    # What genes are involved in this biological process of the GO?
    'biological_process1': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:biological_process]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'The genes involved in this biological process of the gene ontology GO: {go} are as follows:',
        'There are no genes involved in this biological process of the gene ontology GO: {go}'),
    # How many genes are involved in this biological process of gene ontology?
    'biological_process2': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:biological_process]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'It has {count} genes which are involved in the biological process of gene ontology GO: {go} ,as follows:',
        'NONE'),
    # Does this gene participate in this biological process of the GO?
    'biological_process3': _answer_spec(
        ('gene', 'go'),
        'MATCH (na:Gene{{id:"{gene}"}}),(nb:GO{{id:"GO:{go}"}}),p=(na)-[:biological_process]-(nb) return count(p)',
        'Yes,it participate in the biological process of gene ontology GO: {go}',
        'NO', yes_no=True),

    # ---- Gene -> GO: cellular component -----------------------------------
    # What genes are contained in the cellular components of the GO?
    'cellular_component1': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:cellular_component]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'The genes contained in the cellular components of the GO: {go} are as follows:',
        'There are no genes contained in the cellular components of the GO: {go}'),
    # How many genes are contained in the cellular components of the GO?
    'cellular_component2': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:cellular_component]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'It has {count} genes which are contained in the cellular components of gene ontology GO: {go} ,as follows:',
        'NONE'),
    # Does the cell component of the GO contain this gene?
    'cellular_component3': _answer_spec(
        ('gene', 'go'),
        'MATCH (na:Gene{{id:"{gene}"}}),(nb:GO{{id:"GO:{go}"}}),p=(na)-[:cellular_component]-(nb) return count(p)',
        'Yes,the cell component of gene ontology GO {go} contain this gene {gene}',
        'NO', yes_no=True),

    # ---- Gene -> GO: molecular function -----------------------------------
    # Which genes perform molecular functions on this GO?
    'molecular_function1': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:molecular_function]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'The genes which perform molecular functions on the GO: {go} are as follows:',
        'The GO: {go} has no genes which perform molecular functions.'),
    # How many genes perform molecular functions on this GO?
    'molecular_function2': _answer_spec(
        ('go',),
        'MATCH (na:Gene)-[:molecular_function]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'It has {count} genes performing molecular functions on the gene ontology GO: {go},as follows:',
        'NONE'),
    # Does this gene perform molecular functions on this GO?
    'molecular_function3': _answer_spec(
        ('gene', 'go'),
        'MATCH (na:Gene{{id:"{gene}"}}),(nb:GO{{id:"GO:{go}"}}),p=(na)-[:molecular_function]-(nb) return count(p)',
        'Yes,the gene {gene} perform molecular functions on the gene ontology GO: {go}',
        'NO', yes_no=True),

    # ---- GO hierarchy: is_a -----------------------------------------------
    # What are the subtype of this GO?
    'is_a1': _answer_spec(
        ('go',),
        'match(na:GO)-[:is_a]->(nb:GO{{id:"GO:{go}"}}) return na.id limit {num}',
        'The subtype of this GO: {go} is as follows:',
        'The GO: {go} has no subtype.'),
    # How many subcategories are there in this GO?
    'is_a2': _answer_spec(
        ('go',),
        'MATCH (na:GO)-[:is_a]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'It has {count} subcategories in the gene ontology GO：{go},as follows:',
        'NONE'),
    # Is this GO (first id) a subcategory of that GO (second id)?
    'is_a3': _answer_spec(
        ('go_pair',),
        'MATCH (na:GO{{id:"GO:{go1}"}}),(nb:GO{{id:"GO:{go2}"}}),p=(na)-[:is_a]-(nb) return count(p)',
        'Yes,the GO: {go1} is a subcategory of the GO: {go2}',
        'NO', yes_no=True),

    # ---- GO hierarchy: part_of --------------------------------------------
    # What part of this GO?
    'part_of1': _answer_spec(
        ('go',),
        'match(na:GO)-[:part_of]->(nb:GO{{id:"GO:{go}"}}) return na.id limit {num}',
        'The part of this GO: {go} is as follows: ',
        'The GO: {go} has no part.'),
    # How many parts of this GO are there?
    'part_of2': _answer_spec(
        ('go',),
        'MATCH (na:GO)-[:part_of]->(nb:GO{{id:"GO:{go}"}}) return na.id ',
        'There are {count} parts of GO: {go} ,as follows:',
        'NONE'),
    # This GO is part of which GO?  (returns the parents, i.e. nb)
    'part_of3': _answer_spec(
        ('go',),
        'MATCH (na:GO{{id:"GO:{go}"}})-[:part_of]->(nb:GO) return nb.id ',
        'The GO: {go} is part of these gene ontologys,as follows:',
        'NONE'),
    # Is this GO (first id) part of that GO (second id)?
    'part_of4': _answer_spec(
        ('go_pair',),
        'MATCH (na:GO{{id:"GO:{go1}"}}),(nb:GO{{id:"GO:{go2}"}}),p=(na)-[:part_of]-(nb) return count(p)',
        'Yes,the GO: {go1} is part of the GO: {go2}',
        'NO', yes_no=True),

    # ---- GO regulation ----------------------------------------------------
    # Which GO does the GO regulate?
    'regulates1': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:regulates]->(nb:GO) return nb.id limit {num}',
        'The GO: {go} regulates the gene ontologies,as follows:'),
    # Does the GO (first id) regulate the GO (second id)?
    'regulates2': _answer_spec(
        ('go_pair',),
        'MATCH (na:GO{{id:"GO:{go1}"}}),(nb:GO{{id:"GO:{go2}"}}),p=(na)-[:regulates]-(nb) return count(p)',
        'Yes,the GO: {go1} regulate the gene ontology GO: {go2}',
        'NO', yes_no=True),
    # How many GO does this GO regulate?
    'regulates3': _answer_spec(
        ('go',),
        'MATCH (na:GO{{id:"GO:{go}"}})-[:regulates]->(nb:GO) return nb.id ',
        'GO: {go} regulates {count} gene ontologies,as follows:',
        'NONE'),
    # Which GO has a regulatory effect?
    'regulates4': _answer_spec(
        (),
        'MATCH (na:GO)-[r:regulates]->(nb:GO) RETURN na.id',
        'GO have a regulatory effect:'),

    # Which GO does the GO positively regulate?
    'positively_regulates1': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:positively_regulates]->(nb:GO) return nb.id limit {num}',
        'The GO: {go} positively regulates the gene ontologies,as follows:'),
    # Does the GO (first id) positively regulate the GO (second id)?
    'positively_regulates2': _answer_spec(
        ('go_pair',),
        'MATCH (na:GO{{id:"GO:{go1}"}}),(nb:GO{{id:"GO:{go2}"}}),p=(na)-[:positively_regulates]-(nb) return count(p)',
        'Yes,the GO: {go1} positively regulate the gene ontology GO: {go2}',
        'NO', yes_no=True),
    # How many GO does this GO positively regulate?
    'positively_regulates3': _answer_spec(
        ('go',),
        'MATCH (na:GO{{id:"GO:{go}"}})-[:positively_regulates]->(nb:GO) return nb.id ',
        'GO: {go} positively regulates {count} gene ontologies,as follows:',
        'NONE'),
    # Which GO has a positive regulatory effect?
    'positively_regulates4': _answer_spec(
        (),
        'MATCH (na:GO)-[r:positively_regulates]->(nb:GO) RETURN na.id',
        'GO have a positive regulatory effect:'),

    # Which GO does the GO negatively regulate?
    'negatively_regulates1': _answer_spec(
        ('go',),
        'match(na:GO{{id:"GO:{go}"}})-[:negatively_regulates]->(nb:GO) return nb.id limit {num}',
        'The GO: {go} negatively regulates the gene ontologies,as follows:'),
    # Does the GO (first id) negatively regulate the GO (second id)?
    'negatively_regulates2': _answer_spec(
        ('go_pair',),
        'MATCH (na:GO{{id:"GO:{go1}"}}),(nb:GO{{id:"GO:{go2}"}}),p=(na)-[:negatively_regulates]-(nb) return count(p)',
        'Yes,the GO: {go1} negatively regulate the gene ontology GO: {go2}',
        'NO', yes_no=True),
    # How many GO does the GO negatively regulate?
    'negatively_regulates3': _answer_spec(
        ('go',),
        'MATCH (na:GO{{id:"GO:{go}"}})-[:negatively_regulates]->(nb:GO) return nb.id ',
        'GO: {go} negatively regulates {count} gene ontologies,as follows:',
        'NONE'),
    # Which GO has a negative regulatory effect?
    'negatively_regulates4': _answer_spec(
        (),
        'MATCH (na:GO)-[r:negatively_regulates]->(nb:GO) RETURN na.id',
        'GO have a negative regulatory effect:'),

    # ---- Whole-graph listings ---------------------------------------------
    # What species are in the biological gene graph?
    'species_graph': _answer_spec(
        (),
        'MATCH (na:Species) return na.name limit {num}',
        'The species in the biological gene graph are as follows:'),
    # What genes are in the biological gene graph?
    'genes_graph': _answer_spec(
        (),
        'MATCH (na:Gene) return na.id limit {num}',
        'The genes in the biological gene graph are as follows:'),
    # What GO are in the biological gene graph?
    'go_graph': _answer_spec(
        (),
        'MATCH (na:GO) return na.id limit {num}',
        'The gene ontologies in the biological gene graph are as follows:'),
    # What sequences are in the biological gene graph?
    'sequences_graph': _answer_spec(
        (),
        'MATCH (na:Sequence) return na.id limit {num}',
        'The sequences in the biological gene graph are as follows:'),
    # What proteins are in the biological gene graph?
    'proteins_graph': _answer_spec(
        (),
        'MATCH (na:Protein) return na.id limit {num}',
        'The proteins in the biological gene graph are as follows:'),
}


def _extract_entities(needs, Question):
    """Pull the entity values listed in ``needs`` out of ``Question``."""
    body = entity_extract(Question)
    values = {}
    for need in needs:
        if need == 'species':
            values['species'] = str(body.get_species())
        elif need == 'go_pair':
            # Two GO ids, in the order they appear in the question.
            go_ids = body.get_goids()
            values['go1'] = go_ids[0]
            values['go2'] = go_ids[1]
        else:
            values[need] = body.get_id()[_ID_KEYS[need]]
    return values


def AnswerQ(string,Question,num=5000):
    """Answer a question of a known type by querying the graph.

    Args:
        string: question-type label as returned by ``max_sim`` (e.g. 'have1').
        Question: the original question text; its entities are extracted with
            ``entity_extract``.
        num: row limit used by the query templates that contain ``limit``.

    Returns:
        A tuple ``(text, table)``. ``text`` is the sentence to show the user.
        ``table`` is the query result, or an empty DataFrame for yes/no
        questions and for unknown question types.

    Raises:
        KeyError: the question lacks an id the question type requires.
        IndexError: a two-GO question contains fewer than two GO ids.

    Differences from the former if/elif chain:
        * relationship patterns such as ``-[have]->`` only bound a variable
          and matched any relationship type; they are now typed (``[:have]``)
          wherever the type name is confirmed by another query in this table;
        * 'part_of3' returns the parent GO ids instead of the queried id;
        * 'molecular_function3' no longer answers with the cellular-component
          sentence;
        * missing spaces after the species / sequence id in replies are added;
        * an unknown ``string`` yields a message instead of ``None``.
    """
    spec = _ANSWER_SPECS.get(string)
    if spec is None:
        return ('Sorry, this type of question is not supported: ' + str(string), pd.DataFrame())

    fields = _extract_entities(spec['needs'], Question)
    fields['num'] = str(num)
    result = get_Answer(spec['cypher'].format(**fields))

    if spec['yes_no']:
        # These queries return a single row holding count(p).
        text = spec['found'] if result['count(p)'][0] else spec['empty']
        return (text.format(**fields), pd.DataFrame())

    fields['count'] = str(len(result))
    text = spec['empty'] if result.empty else spec['found']
    return (text.format(**fields), result)

In [11]:
def run():
    """Main entry point: read a question, answer it and print the result.

    The question is read from standard input, normalised with ``realQ``,
    matched against the templates in "question_pattern.txt" and answered by
    ``AnswerQ``. The answer sentence is always printed; the result table only
    when it is not empty.

    Example question: '{Aegilops tauschii} has how many genes?'
    (noted by the author as the only sentence that is fully worked out).
    """
    Q = input('Enter your Question:')
    label = max_sim(realQ(Q), "question_pattern.txt")
    txt, table = AnswerQ(label, Q)  # optional third argument: num (default 5000)
    print(txt)
    if not table.empty:
        print(table)
In [13]:
'''
Example questions that can be typed at the prompt:

{Aegilops tauschii} {have} how many (Genes)?
What species do gene AET4Gv20696400 exist in?

Gene Os01g0740400 is transcribed into what sequence?
Gene id BGIOSGA035786 is transcribed into what protein?
sequence UPI00004C2817 is transcribed from which gene?

what is the GO that has relationship "is a " with GO 0060255 ?
what is the GO that has relationship "negatively regulates " with GO 0000086 ?
what genes do molecular function includes in Vitis vinifera?
what genes is an evidence ontology in Sorghum bicolor?
Sequence UPI0000000444 belongs to which protein?

What genes are involved in the evidence and conclusion ontology of the gene ontology GO 0043234 ?
How many genes are involved in the evidence and conclusion ontology of gene ontology GO 0043234 ?
Does the gene Bra023371 participate in the evidence and conclusion ontology of gene ontology GO 0043234 ?

What genes are involved in this biological process of the gene ontology GO 0006281 ?
How many genes are involved in this biological process of gene ontology GO 0006281 ?
Does this gene AET5Gv20554800 participate in this biological process of gene ontology GO 0055114 ?

What genes are contained in the cellular components of the gene ontology GO 0009941 ?
How many genes are contained in the cellular components of gene ontology GO 0009941 ?
Does the cell component of gene ontology GO 0009941 contain this gene AET4Gv20696400 ?

Which genes perform molecular functions on this gene ontology GO 0019706 ?
How many genes perform molecular functions on this gene ontology GO 0005515 ?
Does this gene AET3Gv21167000 perform molecular functions on this gene ontology GO 0005515 ?

Does this gene AET4Gv20649600 exist in this species Aegilops tauschii ?
How many genes exist in this species Brassica napus ?

What are the subtype of this gene ontology GO 0051052 ?
How many subcategories are there in this gene ontology GO 0051052 ?
Is this gene ontology GO 0060255  a subcategory of the gene ontology GO 0051052 ?

What part of this gene ontology GO 0006886 ?
How many parts of this gene ontology GO 0051318 are there?
This gene ontology GO 0042254 is part of which gene ontology GO ?
Is this gene ontology GO 0034613 part of that gene ontology GO 0006886 ?

Which gene ontology GO does the gene ontology GO 0006351 regulate?
Does the gene ontology GO 0006351 regulate that gene ontology GO 0006355 ?
How many gene ontologies GO does this gene ontology GO 0006366 regulate?
Which gene ontology GO has a regulatory effect ?

Which gene ontology GO does the gene ontology GO 0023052 positive regulate ?
Does the gene ontology GO 0044700 positive regulate the gene ontology GO 0023056 ?
How many gene ontologies GO does this gene ontology GO 0044700 positive regulate ?
Which gene ontology has a positive regulatory effect ?

Which gene ontology GO does the gene ontology GO 0006351 negative regulate ?
Does the gene ontology GO 0006351 negative regulate the gene ontology GO 0045892 ?
How many gene ontologies GO does the gene ontology GO 0006351 negative regulate ?
Which gene ontology has a negative regulatory effect ?

What species are in the biological gene graph ?
What genes are in the biological gene graph ?
What gene ontologies GO are in the biological gene graph ?
What sequences are in the biological gene graph ?
What proteins are in the biological gene graph ?

The statement below runs the question-answering prompt.
'''
# Prompt for a question only when this code is executed directly (a notebook
# kernel runs cells with __name__ set to "__main__"), not when it is imported.
if __name__ == "__main__":
    run()

Enter your Question:Does the gene ontology GO 0006351 regulate that gene ontology GO 0006355 ? 
Yes,the GO: 0006351 regulate the gene ontology GO: 0006355


In [192]:
# Scratch cell: hand-written Cypher queries for checking the graph directly.
# Only `ar` is executed here; `a` and `txt` are defined but not run in this cell.

# Template with {geneid}/{num} placeholders (the str.format style used elsewhere in this notebook).
a="MATCH (Species)-[:have]->(Gene) where Gene.id=\"{geneid}\" RETURN Species LIMIT {num}"
# Names of the species that have gene AET4Gv20696400.
txt='match(na:Species)-[:have]->(nb:Gene{id:"AET4Gv20696400"}) return na.name limit 10'
# Sequence ids that gene Os01g0740400 is transcribed into.
# The leading ':' is required: '[:be_transcribed_into]' filters on the relationship
# TYPE, whereas '[be_transcribed_into]' only names a variable and matches a
# relationship of any type between the two nodes.
ar='match(na:Gene{id:"Os01g0740400"})-[:be_transcribed_into]->(nb:Sequence) return nb.id limit 10'
get_Answer(ar)

Unnamed: 0,nb.id
0,UPI00004C2817


In [None]:
pip install python-Levenshtein

In [82]:
# Scratch check of GO-id extraction on a question that mentions two GO ids.
# entity_extract is defined elsewhere in this notebook.
b='Is this gene ontology GO 0060255  a subcategory of the gene ontology GO 0051052 ?'
c=entity_extract(b)
subd=c.get_goids()
# The recorded output of this cell is ['0060255', '0051052'].
print(subd)
# Last expression of the cell, so the notebook displays the first extracted id.
subd[0]

['0060255', '0051052']


'0060255'

In [23]:
# Scratch query: names of up to 10 Species nodes that have a `have` relationship to a Gene.
txt='match(na:Species)-[:have]->(nb:Gene) return na.name limit 10'
# get_Answer runs the Cypher string against the graph and returns the result as a DataFrame.
c=get_Answer(txt)

In [33]:
# Confirms what get_Answer returned; the recorded output is pandas.core.frame.DataFrame.
type(c)

pandas.core.frame.DataFrame