In [1]:
gene_file_path = 'input/input_genes.txt'
phenotype_file_path = 'input/input_phenotype.txt'

In [2]:
import map_phenotype_to_gene
import collectVariantInfo
import pubmed
import ACMG
import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter


# input_phenotype = 'data/sample_patient_phenotype.txt'
# input_genes = 'data/sample_genes.txt'

def format_hgvs(chrom, pos, ref, alt):
    '''get a valid hgvs name from VCF-style "chrom, pos, ref, alt" data.

    Example:

        >>> myvariant.format_hgvs("1", 35366, "C", "T")
        >>> myvariant.format_hgvs("2", 17142, "G", "GA")
        >>> myvariant.format_hgvs("MT", 8270, "CACCCCCTCT", "C")
        >>> myvariant.format_hgvs("X", 107930849, "GGA", "C")

    '''
    chrom = str(chrom)
    if chrom.lower().startswith('chr'):
        # trim off leading "chr" if any
        chrom = chrom[3:]
    if len(ref) == len(alt) == 1:
        # this is a SNP
        hgvs = 'chr{0}:g.{1}{2}>{3}'.format(chrom, pos, ref, alt)
    elif len(ref) > 1 and len(alt) == 1:
        # this is a deletion:
        if ref[0] == alt:
            start = int(pos) + 1
            end = int(pos) + len(ref) - 1
            hgvs = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
        else:
            end = int(pos) + len(ref) - 1
            hgvs = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, pos, end, alt)
    elif len(ref) == 1 and len(alt) > 1:
        # this is a insertion
        if alt[0] == ref:
            hgvs = 'chr{0}:g.{1}_{2}ins'.format(chrom, pos, int(pos) + 1)
            ins_seq = alt[1:]
            hgvs += ins_seq
        else:
            hgvs = 'chr{0}:g.{1}delins{2}'.format(chrom, pos, alt)
    elif len(ref) > 1 and len(alt) > 1:
        end = int(pos) + len(alt) - 1
        hgvs = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, pos, end, alt)
    else:
        raise ValueError("Cannot convert {} into HGVS id.".format((chrom, pos, ref, alt)))
    return hgvs

def read_input_pheno_file(input_phenotype):
    if not input_phenotype:
        return '', ''
    text = StringIO(unicode(input_phenotype), newline=None)
    lines = text.readlines()
    lines = [line.strip() for line in lines]
    phenos = []
    for line in lines:
        if not line:
            continue
        phenos_each_line = re.split(r'  +|\t+|,|;|\.|\|', line.strip())
        phenos_each_line = [re.sub(r'^\W+|\W+$', '', s) for s in phenos_each_line]
        phenos_each_line = [s.lower() for s in phenos_each_line if s]
        phenos += phenos_each_line

    corner_cases = dict()
    for pheno in phenos:
        if re.search('development', pheno) and re.search('delay', pheno) and not re.search('growth', pheno):
            phenos.append('growth delay')
            corner_cases['growth delay'] = pheno.strip()
    for pheno in phenos:
        if re.search('growth', pheno) and re.search('delay', pheno) and not re.search('development', pheno):
            phenos.append('developmental delay')
            corner_cases['developmental delay'] = pheno.strip()
    return phenos, corner_cases

def read_input_gene_file(input_gene):
	candidate_vars = []
	input_gene = input_gene.split('\n')
	header = input_gene[0]
	sniffer = csv.Sniffer()
	dialect = sniffer.sniff(header)
	delimiter =  dialect.delimiter
	field_names = header.split(delimiter)
	chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx = None, None, None, None, 0 

	for idx in xrange(len(field_names)):
		field = field_names[idx]
		if re.match(r'chrom', field, re.I): chrom_idx = idx
		if re.match(r'pos|start', field, re.I): pos_idx = idx
		if re.match(r'ref', field, re.I): ref_idx = idx
		if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
		if re.match(r'gene (gene)|gene', field, re.I): gene_idx = idx

	input_gene_list = []
	CANDIDATE_GENES = []
	for line in input_gene[1:]:
		if not line:
			continue
		line = line.rstrip()
		parts = re.split(r'%s' % delimiter, line)
		input_gene_list.append(parts)
		gene = parts[gene_idx]
		CANDIDATE_GENES.append(gene)
		transcript, variant, variant_id = '', '', ''
		for part in parts:
			if re.search(r'_.*:c\.', part):
				transcript, variant = part.split(':')
			else:
				if re.search(r'c\.', part):
					variant = part
				if re.search(r'NM_', part, re.I):
					transcript = part.split(':')[0]
			if re.search(r'_.*:g\.', part):
				variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
			if re.search(r'chr.*:g\.', part, re.I):
				variant_id = part
		if not variant_id and (chrom_idx and pos_idx and ref_idx and alt_idx):
			chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
			variant_id = format_hgvs(chrome, pos, ref, alt)
		candidate_vars.append((gene, variant, transcript, variant_id))

        # remove lines in the input file which has wrong number of fields
        field_nums = []
        for line in input_gene_list:
			field_nums.append(len(line))
        count = Counter(field_nums)
        correct_field_num = count.most_common()[0][0]
        correct_input_gene_list = []
        for line in input_gene_list:
			if len(line) == correct_field_num:
				correct_input_gene_list.append(line)
        df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)
        return candidate_vars, CANDIDATE_GENES, df_genes, field_names 

def map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars):
	ranking_genes, ranking_disease = map_phenotype_to_gene.generate_score(phenos, CANDIDATE_GENES, corner_cases)
	# collect variant info
	hpo_filtered_genes = np.unique([i[0] for i in ranking_genes]).tolist()

	tmp_candidate_vars = []
	for var in candidate_vars:
		if var[0] in hpo_filtered_genes:
			tmp_candidate_vars.append(var)
	candidate_vars = tmp_candidate_vars
	return ranking_genes, candidate_vars

# 	status_step = "generating candidate variants ..." 
# 	raw_input = Raw_input_table.objects.get(id=raw_input_id)
# 	input_gene = raw_input.raw_input_gene
# 	input_phenotype = raw_input.raw_input_phenotype
input_gene = open("/Users/Tianqi/Desktop/xiaonantest1_g.txt",'rU').read()
input_phenotype = open("/Users/Tianqi/Desktop/xiaonantest1_p.txt",'rU').read()

# Read input pheno file and generate phenos and corner_cases 
phenos, corner_cases = read_input_pheno_file(input_phenotype)


# Read input gene file and generate candidate_vars. candidate_vars are (gene, variant, transcript, variant_id); CANDIDATE_GENES is a list of gene symbols; df_genes is a dataframe that keeps all the data that user uploaded; field_names are header of the input gene file 
candidate_vars, CANDIDATE_GENES, df_genes, field_names = read_input_gene_file(input_gene)

# map phenotype to gene; the candidate_vars was filtered: if it is a gene associated with phenos, then keep it.

if phenos:
#     raw_input.status = "Maping phenotypes to genes"
#     raw_input.save()
    ranking_genes, candidate_vars = map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars)
else:
    ranking_genes = []
    for gene in CANDIDATE_GENES:
        ranking_genes.append((gene, 1.0, 1))

# collect variant info
# raw_input.status = "Annotating variants using genomic databases"
# raw_input.save()
mv = myvariant.MyVariantInfo()
final_res, variants = collectVariantInfo.get_variants(candidate_vars)

# pubmed
# raw_input.status = "Searching biomedical literatures"
# raw_input.save()
df_pubmed = pubmed.queryPubmedDB(final_res)

# ACMG
# raw_input.status = "Checking ACMG standard"
# raw_input.save()
df_hpo_ranking_genes = pd.DataFrame(ranking_genes, columns=['gene', 'score', 'hits'])
df_hpo_ranking_genes = df_hpo_ranking_genes[['gene', 'score']]
ACMG_result, variant_ACMG_interpretation, variant_ACMG_interpret_chinese = ACMG.Get_ACMG_result(df_hpo_ranking_genes, variants, df_pubmed)

# filter variant on phenotype

# if phenos:
# #     raw_input.status = "Filtering variants based on phenotypes"
# #     raw_input.save()
#     df_final_res, variant_ACMG_interpretation, variant_ACMG_interpret_chinese = filterVariantOnPhenotype.generateOutput(variants, ACMG_result, phenos, variant_ACMG_interpretation, variant_ACMG_interpret_chinese)
# else:
#     df_final_res = ACMG_result

ValueError: too many values to unpack

In [4]:
["<a href='https://www.ncbi.nlm.nih.gov/pubmed/%s'> %s </a>" %(i,i) for i in ['a','b']]


TypeError: not enough arguments for format string

In [11]:
ACMG_result

Unnamed: 0,gene,variant,id,final_score,pathogenicity_score,pathogenicity,hit_criteria,hpo_hit_score
0,BBS10,c.145C>T,chr12:g.76741994G>A,8.24,3.875000,Likely pathogenic,PS1|PM1|PP3|PP5,5.666667
1,HBB,c.208G>A,chr11:g.5247914C>T,8.11,2.625000,Uncertain significance,PM5|PM1|PP3|PP5,19.213095
2,DLX3,c.335A>G,chr17:g.48070945T>C,6.15,3.961200,Uncertain significance,PS1|PM1|PM2|PP3|BP1,2.000000
3,PROP1,c.652A>C,chr5:g.177419739T>G,5.23,2.250000,Uncertain significance,PM1|PM2|PP3,7.500000
4,SOX3,c.818C>T,chrX:g.139586408G>A,5.19,2.250000,Uncertain significance,PM1|PM2|PP3,7.333333
5,CEP152,c.3530A>G,chr15:g.49040744T>C,4.97,2.250000,Uncertain significance,PM1|PM2|PP3,6.404762
6,PRKDC,c.3278A>T,chr8:g.48813019T>A,4.45,2.250000,Uncertain significance,PM1|PM2|PP3,4.500000
7,PCDH15,c.4793G>A,chr10:g.55582714C>T,3.99,2.250000,Uncertain significance,PM1|PM2|PP3,3.166667
8,PLEC,c.1712G>A,chr8:g.145007482C>T,3.92,1.750000,Uncertain significance,PM1|PM2|PP3|BP1,6.666667
9,FOXRED1,c.286G>T,chr11:g.126141532G>T,3.47,1.500000,Uncertain significance,PM1|PP3,7.400000


In [29]:
gene_list = list(ACMG_result['gene'])
variant_list = list(ACMG_result['variant'])
gene_variant_list = [(gene_list[i], variant_list[i]) for i in range(len(gene_list))]

df_variant_ACMG_interpret = pd.DataFrame()
df_variant_ACMG_interpret_chinese = pd.DataFrame() 
for key in gene_variant_list:
    tmp_df = pd.DataFrame(variant_ACMG_interpretation[key], columns = ['criteria', 'interpretation'])
    tmp_df['gene'] = key[0]
    tmp_df['variant'] = key[1]
    tmp_df = tmp_df[['gene', 'variant', 'criteria', 'interpretation']] 
    df_variant_ACMG_interpret = pd.concat([df_variant_ACMG_interpret, tmp_df])

    tmp_df = pd.DataFrame(variant_ACMG_interpret_chinese[key], columns = ['criteria', 'interpretation'])
    tmp_df['gene'] = key[0]
    tmp_df['variant'] = key[1]
    tmp_df = tmp_df[['gene', 'variant', 'criteria', 'interpretation']] 
    df_variant_ACMG_interpret_chinese = pd.concat([df_variant_ACMG_interpret_chinese, tmp_df])

In [32]:
df_variant_ACMG_interpret_chinese

Unnamed: 0,gene,variant,criteria,interpretation
0,BBS10,c.145C>T,变异注释,"突变类型: missense_variant.<br/>蛋白功能区: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>外显子: 1.<br/>ExAC 最小等位基因频率(MAF): 5.052e-05.<br/>DANN致病性分数: 0.999117068756.<br/>FATHMM致病性分数: 0.79186.<br/>MetaSVM致病性分数: 0.84564.<br/>GERP++序列保守性预测分数: 4.27.<br/>Clinvar数据库ID: 225010.<br/>Clinvar数据库记录的变异致病性: Pathogenic|Pathogenic.<br/>Clinvar数据库记录审核状态: criteria provided, single submitter.<br/>Clinvar数据库记录的Pubmed相关生物医学文献: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,基因变异类型不是无效变异(null variant). 变异位点所在基因的功能丢失(loss of function)是已知的致病机制. 此变异不具有害的剪接效应(splicing effect). 不符合PVS1标准.
2,BBS10,c.145C>T,PS1和PM5,变异为错义突变. 之前报道导致相同氨基酸改变的基因变异被证明是致病的 (Clinvar数据库参考: 225010). 未发现导致相同氨基酸残基错义突变（不同氨基酸改变）导致的致病基因变异. 此变异不具有害的剪接效应(splicing effect). 符合PS1标准. 不符合PM5标准.
3,BBS10,c.145C>T,PS3和BS3,"[未发现针对此基因变异的完善的体内或体外功能性研究., 不符合PS3标准., 不符合BS3标准.]"
4,BBS10,c.145C>T,PS4,具有此基因变异的人群患病率并未显著升高，相对风险(Relative Risk)小于5.0.. 不符合PS4标准.
5,BBS10,c.145C>T,PM2,此基因变异的最小等位基因频率(MAF)极低(< 0.5%)，但并不引发隐性遗传病. 不符合PM2标准.
6,BBS10,c.145C>T,BA1和BS1,等位基因频率 <= 1%. 不符合BS1标准.
7,BBS10,c.145C>T,BS2,此基因变异在健康人中既不以隐性(纯合子)也不以显性(杂合子)状态存在. 不符合BS2标准.
8,BBS10,c.145C>T,PM1,此基因变异位于突变热点和/或关键的功能域(例如酶的活性部位)，且在这些区域不存在良性变异. 符合PM1标准.
9,BBS10,c.145C>T,PM4和BP3,变异位点在重复区域(repeat region). 不符合PM4标准. 不符合BP3标准.


In [31]:
df_variant_ACMG_interpret_chinese

Unnamed: 0,gene,variant,criteria,interpretation
0,BBS10,c.145C>T,变异注释,"突变类型: missense_variant.<br/>蛋白功能区: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>外显子: 1.<br/>ExAC 最小等位基因频率(MAF): 5.052e-05.<br/>DANN致病性分数: 0.999117068756.<br/>FATHMM致病性分数: 0.79186.<br/>MetaSVM致病性分数: 0.84564.<br/>GERP++序列保守性预测分数: 4.27.<br/>Clinvar数据库ID: 225010.<br/>Clinvar数据库记录的变异致病性: Pathogenic|Pathogenic.<br/>Clinvar数据库记录审核状态: criteria provided, single submitter.<br/>Clinvar数据库记录的Pubmed相关生物医学文献: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,基因变异类型不是无效变异(null variant). 变异位点所在基因的功能丢失(loss of function)是已知的致病机制. 此变异不具有害的剪接效应(splicing effect). 不符合PVS1标准.
2,BBS10,c.145C>T,PS1和PM5,变异为错义突变. 之前报道导致相同氨基酸改变的基因变异被证明是致病的 (Clinvar数据库参考: 225010). 未发现导致相同氨基酸残基错义突变（不同氨基酸改变）导致的致病基因变异. 此变异不具有害的剪接效应(splicing effect). 符合PS1标准. 不符合PM5标准.
3,BBS10,c.145C>T,PS3和BS3,"[未发现针对此基因变异的完善的体内或体外功能性研究., 不符合PS3标准., 不符合BS3标准.]"
4,BBS10,c.145C>T,PS4,具有此基因变异的人群患病率并未显著升高，相对风险(Relative Risk)小于5.0.. 不符合PS4标准.
5,BBS10,c.145C>T,PM2,此基因变异的最小等位基因频率(MAF)极低(< 0.5%)，但并不引发隐性遗传病. 不符合PM2标准.
6,BBS10,c.145C>T,BA1和BS1,等位基因频率 <= 1%. 不符合BS1标准.
7,BBS10,c.145C>T,BS2,此基因变异在健康人中既不以隐性(纯合子)也不以显性(杂合子)状态存在. 不符合BS2标准.
8,BBS10,c.145C>T,PM1,此基因变异位于突变热点和/或关键的功能域(例如酶的活性部位)，且在这些区域不存在良性变异. 符合PM1标准.
9,BBS10,c.145C>T,PM4和BP3,变异位点在重复区域(repeat region). 不符合PM4标准. 不符合BP3标准.


In [7]:
variant_ACMG_interpretation[('BBS10', 'c.145C>T')]

[('variant_annotations',
  u"Effect: missense_variant.<br/>Protein domain: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>exon: 1.<br/>ExAC MAF: 5.052e-05.<br/>DANN pathogenicity score: 0.999117068756.<br/>FATHMM pathogenicity score: 0.79186.<br/>MetaSVM pathogenicity score: 0.84564.<br/>GERP++ conservation score: 4.27.<br/>Clinvar variation ids: 225010.<br/>Pathogenicity reported by Clinvar: Pathogenic|Pathogenic.<br/>Clinvar review status: criteria provided, single submitter.<br/>Pubmed references from Clinvar: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."),
 ('PVS1',
  'Variant effect NOT in null variant type. Allele in a gene where loss of function (LOF) is a known mechanism of disease. The variant does NOT have damaging splicing effect. PVS1 is NOT met.'),
 ('PS1 and PM5',
  'Variant effect is missense. Same amino acid change as a previously est

In [10]:
df_final_res.head()

Unnamed: 0,gene,variant,protein,id,final_score,pathogenicity,hit_criteria,pathogenicity_score,hpo_hit_score,pheno_match_score
76,BBS10,c.145C>T,p.Arg49Trp,chr12:g.76741994G>A,9.51101,Likely pathogenic,PS1|PM1|PP3|PP5,3.875,5.666667,1.154249
103,HBB,c.208G>A,p.Gly70Ser,chr11:g.5247914C>T,8.713391,Uncertain significance,PM5|PM1|PP3|PP5,2.625,19.213095,1.074401
66,DLX3,c.335A>G,p.Lys112Arg,chr17:g.48070945T>C,6.15,Uncertain significance,PS1|PM1|PM2|PP3|BP1,3.9612,2.0,1.0
53,PROP1,c.652A>C,p.Ser218Arg,chr5:g.177419739T>G,5.23,Uncertain significance,PM1|PM2|PP3,2.25,7.5,1.0
107,SOX3,c.818C>T,p.Ser273Leu,chrX:g.139586408G>A,5.19,Uncertain significance,PM1|PM2|PP3,2.25,7.333333,1.0


In [11]:
df_genes.head()

Unnamed: 0,Chromosome,Start,Stop,Reference,Allele 1,Allele 2,Allele In Scope,Transcript,Gene Profile Report,Gene (gene),cDNA (cNomen),Protein (pNomen),HGVS cDNA-level nomenclature (fullCNomen)
0,1,1635536,1635536,.,.,A,A,NM_024011.2,,CDK11A,c.1735dupT,p.Y579Lfs*66,NM_024011.2:c.1735dupT
1,1,23713843,23713843,T,C,T,C,NM_003196.2,,TCEA3,c.889A>G,p.M297V,NM_003196.2:c.889A>G
2,1,24882663,24882663,A,A,G,G,NM_001010980.4,,NCMAP,c.-8+3A>G,,NM_001010980.4:c.-8+3A>G
3,1,26663842,26663842,G,G,A,A,NM_001039775.3,,AIM1L,c.3673C>T,p.R1225W,NM_001039775.3:c.3673C>T
4,1,39896387,39896387,A,C,A,C,NM_012090.5,,MACF1,c.10958A>C,p.E3653A,NM_012090.5:c.10958A>C


In [8]:
phenos

[u'bilateral post-axial polydactyly of hands and feet',
 u'macrocephaly',
 u'tall stature',
 u'post-natal',
 u'central hypotonia',
 u'global developmental delay',
 u'obstructive sleep apnea',
 u'delayed cns central nervous myelination',
 u'white matter',
 u'corpus callosum',
 u'mildly ectopic neurohypophys posterior pituitary upslanted palpebral fissures',
 u'epicanthus',
 u'arched eyebrows',
 u'craniofacial asymmetry',
 u'bronchial stenosis',
 u'macrosomia',
 u'recurrent infection',
 u'gastroesophageal reflux',
 u'supratentorial perivascular space',
 u'mucopolysaccharidosis plasma urine creatine',
 u'purine',
 u'pyrimidine metabolism',
 u'gross motor',
 u'language',
 u'cognitive',
 u'ventriculomegaly',
 u'metatarsal bones',
 u'proximal phalanges',
 u'obesity',
 u'wheeze',
 u'brachycephaly frontal prominence',
 u'bitemporal narrowing',
 u'clinodactyly',
 u'brachydactyly',
 u'bardet-biedl',
 'growth delay',
 'developmental delay']

In [9]:
field_names

['Chromosome',
 'Start',
 'Stop',
 'Reference',
 'Allele 1',
 'Allele 2',
 'Allele In Scope',
 'Transcript',
 'Gene Profile Report',
 'Gene (gene)',
 'cDNA (cNomen)',
 'Protein (pNomen)',
 'HGVS cDNA-level nomenclature (fullCNomen)']

In [17]:
variant_ACMG_interpretation.head()

Unnamed: 0,gene,variant,criteria,interpretation
0,BBS10,c.145C>T,variant_annotations,"Effect: missense_variant.<br/>Protein domain: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>exon: 1.<br/>ExAC MAF: 5.052e-05.<br/>DANN pathogenicity score: 0.999117068756.<br/>FATHMM pathogenicity score: 0.79186.<br/>MetaSVM pathogenicity score: 0.84564.<br/>GERP++ conservation score: 4.27.<br/>Clinvar variation ids: 225010.<br/>Pathogenicity reported by Clinvar: Pathogenic|Pathogenic.<br/>Clinvar review status: criteria provided, single submitter.<br/>Pubmed references from Clinvar: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,Variant effect NOT in null variant type. Allele in a gene where loss of function (LOF) is a known mechanism of disease. The variant does NOT have damaging splicing effect. PVS1 is NOT met.
2,BBS10,c.145C>T,PS1 and PM5,Variant effect is missense. Same amino acid change as a previously established pathogenic variant regardless of nucleotide change (Clinvar references: 225010). Not find missense change at an amino acid residue where a different missense change determined to be pathogenic has been seen before. The variant does NOT have damaging splicing effect. PS1 is met. PM5 is NOT met.
3,BBS10,c.145C>T,PS3 and BS3,Not find well-established functional studies on this variant. PS3 is NOT met. BS3 is NOT met.
4,BBS10,c.145C>T,PS4,Relative risk is smaller than 5. The prevalence of the variant in affected individuals is NOT significantly increased compared with the prevalence in controls. PS4 is NOT met.


In [18]:
variant_ACMG_interpret_chinese.head()

Unnamed: 0,基因,变异,标准,解读
0,BBS10,c.145C>T,变异注释,"突变类型: missense_variant.<br/>蛋白功能区: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>外显子: 1.<br/>ExAC 最小等位基因频率(MAF): 5.052e-05.<br/>DANN致病性分数: 0.999117068756.<br/>FATHMM致病性分数: 0.79186.<br/>MetaSVM致病性分数: 0.84564.<br/>GERP++序列保守性预测分数: 4.27.<br/>Clinvar数据库ID: 225010.<br/>Clinvar数据库记录的变异致病性: 致病|致病.<br/>Clinvar数据库记录审核状态: 提供标准，单个提交者.<br/>Clinvar数据库记录的Pubmed相关生物医学文献: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,基因变异类型不是无效变异(null variant). 变异位点所在基因的功能丢失(loss of function)是已知的致病机制. 此变异不具有害的剪接效应(splicing effect). 不符合PVS1标准.
2,BBS10,c.145C>T,PS1和PM5,变异为错义突变. 之前报道导致相同氨基酸改变的基因变异被证明是致病的 (Clinvar数据库参考: 225010). 未发现导致相同氨基酸残基错义突变（不同氨基酸改变）导致的致病基因变异. 此变异不具有害的剪接效应(splicing effect). 符合PS1标准. 不符合PM5标准.
3,BBS10,c.145C>T,PS3和BS3,未发现针对此基因变异的完善的体内或体外功能性研究. 不符合PS3标准. 不符合BS3标准.
4,BBS10,c.145C>T,PS4,具有此基因变异的人群患病率并未显著升高，相对风险(Relative Risk)小于5.0.. 不符合PS4标准.


In [None]:
df_final_res, df_genes, phenos, field_names, variant_ACMG_interpretation, variant_ACMG_interpret_chinese

In [14]:
nput_gene = df_genes.to_json(orient='records')
input_phenotype = ', '.join(phenos)
result_table = ACMG_result.to_json(orient='records')
interpretation = variant_ACMG_interpretation.to_json(orient='records')
interpretation_chinese = variant_ACMG_interpret_chinese.to_json(orient='records')
# logger.info("Finish processing data, start writing data to DB in background main task")


# sample = Main_table(
#     task_id=raw_input_id,
#     input_gene=input_gene,
#     input_phenotype=input_phenotype,
#     result=result_table,
#     interpretation=interpretation,
#     interpretation_chinese=interpretation_chinese,
#     pub_date=timezone.now(),
#     user_name=raw_input.user_name,
#     task_name=raw_input.task_name,
# )
# sample.save()
# logger.info("Finish writing data to DB in background main task")
# raw_input.status = "succeed"
# raw_input.save()

In [16]:
interpretation[:1000]

'[{"gene":"BBS10","variant":"c.145C>T","criteria":"variant_annotations","interpretation":"Effect: missense_variant.<br\\/>Protein domain: GroEL-like equatorial domain.<br\\/>HGVS ID: chr12:g.76741994G>A.<br\\/>RefSeq ID: rs768933093.<br\\/>exon: 1.<br\\/>ExAC MAF: 5.052e-05.<br\\/>DANN pathogenicity score: 0.999117068756.<br\\/>FATHMM pathogenicity score: 0.79186.<br\\/>MetaSVM pathogenicity score: 0.84564.<br\\/>GERP++ conservation score: 4.27.<br\\/>Clinvar variation ids: 225010.<br\\/>Pathogenicity reported by Clinvar: Pathogenic|Pathogenic.<br\\/>Clinvar review status: criteria provided, single submitter.<br\\/>Pubmed references from Clinvar: [\'25356970\', \'20120035\', \'16582908\', \'21044901\', \'20876674\', \'21642631\', \'20498079\', \'21517826\', \'20177705\', \'24746959\', \'25982971\']."},{"gene":"BBS10","variant":"c.145C>T","criteria":"PVS1","interpretation":"Variant effect NOT in null variant type. Allele in a gene where loss of function (LOF) is a known mechanism of dis

In [21]:
interpretation_chinese[:1000]

'[{"\\u57fa\\u56e0":"BBS10","\\u53d8\\u5f02":"c.145C>T","\\u6807\\u51c6":"\\u53d8\\u5f02\\u6ce8\\u91ca","\\u89e3\\u8bfb":"\\u7a81\\u53d8\\u7c7b\\u578b: missense_variant.<br\\/>\\u86cb\\u767d\\u529f\\u80fd\\u533a: GroEL-like equatorial domain.<br\\/>HGVS ID: chr12:g.76741994G>A.<br\\/>RefSeq ID: rs768933093.<br\\/>\\u5916\\u663e\\u5b50: 1.<br\\/>ExAC \\u6700\\u5c0f\\u7b49\\u4f4d\\u57fa\\u56e0\\u9891\\u7387(MAF): 5.052e-05.<br\\/>DANN\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.999117068756.<br\\/>FATHMM\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.79186.<br\\/>MetaSVM\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.84564.<br\\/>GERP++\\u5e8f\\u5217\\u4fdd\\u5b88\\u6027\\u9884\\u6d4b\\u5206\\u6570: 4.27.<br\\/>Clinvar\\u6570\\u636e\\u5e93ID: 225010.<br\\/>Clinvar\\u6570\\u636e\\u5e93\\u8bb0\\u5f55\\u7684\\u53d8\\u5f02\\u81f4\\u75c5\\u6027: \\u81f4\\u75c5|\\u81f4\\u75c5.<br\\/>Clinvar\\u6570\\u636e\\u5e93\\u8bb0\\u5f55\\u5ba1\\u6838\\u72b6\\u6001: \\u63d0\\u4f9b\\u6807\\u51c6\\uff0c\\u5355\\u4e2a\\u63d0\\u4ea4\

In [None]:
interpretation_chinese = variant_ACMG_interpret_chinese.to_json(orient='records')

In [35]:
round(2.7)

3.0

In [36]:
8/3

2

In [8]:
aa = ''

In [9]:
["<a href='https://www.ncbi.nlm.nih.gov/clinvar/variation/%s/'> %s </a>" %(i,i) for i in aa.split('|')]

["<a href='https://www.ncbi.nlm.nih.gov/clinvar/variation//'>  </a>"]

In [19]:
if bb:
    b=1+1

In [20]:
b

NameError: name 'b' is not defined

In [17]:
bb = []

In [18]:
["<a href='https://www.ncbi.nlm.nih.gov/pubmed/%s'> %s </a>" %(i,i) for i in bb]

[]

In [10]:
import csv
import pandas as pd
import numpy as np
import re
import myvariant
from io import StringIO
from collections import Counter

In [2]:
input_gene = open("/Users/Tianqi/Desktop/vcfsample.txt",'rU').read()

In [3]:
candidate_vars = []
input_gene = input_gene.split('\n')

for line in input_gene:
    if line and line[:2] != "##":
        header = line
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(header)
        delimiter =  dialect.delimiter
        field_names = header.split(delimiter)
        break
        
chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx = None, None, None, None, None

for idx in xrange(len(field_names)):
    field = field_names[idx]
    if re.match(r'chrom|#chrom', field, re.I): chrom_idx = idx
    if re.match(r'pos|start', field, re.I): pos_idx = idx
    if re.match(r'ref', field, re.I): ref_idx = idx
    if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
    if re.match(r'gene (gene)|gene', field, re.I): gene_idx = idx

input_gene_list = []
CANDIDATE_GENES = []
for line in input_gene[1:]:
    if not line:
        continue
    line = line.rstrip()
    parts = re.split(r'%s' % delimiter, line)
    if line.startswith('#'):
        continue
    input_gene_list.append(parts)
    gene, transcript, variant, variant_id = '', '', '', ''
    if gene_idx:
        gene = parts[gene_idx]
        CANDIDATE_GENES.append(gene)
    for part in parts:
        if re.search(r'_.*:c\.', part):
            transcript, variant = part.split(':')
        else:
            if re.search(r'c\.', part):
                variant = part
            if re.search(r'NM_', part, re.I):
                transcript = part.split(':')[0]
        if re.search(r'_.*:g\.', part):
            variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
        if re.search(r'chr.*:g\.', part, re.I):
            variant_id = part
    if not variant_id and (chrom_idx is not None and pos_idx is not None and ref_idx is not None and alt_idx is not None):
        chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
        variant_id = format_hgvs(chrome, pos, ref, alt)
    candidate_vars.append((gene, variant, transcript, variant_id))

# remove lines in the input file which has wrong number of fields
field_nums = []
for line in input_gene_list:
    field_nums.append(len(line))
count = Counter(field_nums)
correct_field_num = count.most_common()[0][0]
correct_input_gene_list = []
for line in input_gene_list:
    if len(line) == correct_field_num:
        correct_input_gene_list.append(line)
df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)

In [4]:
df_genes

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001,NA00002,NA00003
0,20,14370,rs6054257,G,A,29,PASS,NS=3;DP=14;AF=0.5;DB;H2,GT:GQ:DP:HQ,"0|0:48:1:51,51","1|0:48:8:51,51","1/1:43:5:.,."


In [5]:
candidate_vars

[('', '', '', 'chr20:g.14370G>A')]

In [1]:
import map_phenotype_to_gene
import collectVariantInfo
import pubmed
import ACMG
import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter


# input_phenotype = 'data/sample_patient_phenotype.txt'
# input_genes = 'data/sample_genes.txt'

def format_hgvs(chrom, pos, ref, alt):
    '''get a valid hgvs name from VCF-style "chrom, pos, ref, alt" data.

    Example:

        >>> myvariant.format_hgvs("1", 35366, "C", "T")
        >>> myvariant.format_hgvs("2", 17142, "G", "GA")
        >>> myvariant.format_hgvs("MT", 8270, "CACCCCCTCT", "C")
        >>> myvariant.format_hgvs("X", 107930849, "GGA", "C")

    '''
    chrom = str(chrom)
    if chrom.lower().startswith('chr'):
        # trim off leading "chr" if any
        chrom = chrom[3:]
    if len(ref) == len(alt) == 1:
        # this is a SNP
        hgvs = 'chr{0}:g.{1}{2}>{3}'.format(chrom, pos, ref, alt)
    elif len(ref) > 1 and len(alt) == 1:
        # this is a deletion:
        if ref[0] == alt:
            start = int(pos) + 1
            end = int(pos) + len(ref) - 1
            hgvs = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
        else:
            end = int(pos) + len(ref) - 1
            hgvs = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, pos, end, alt)
    elif len(ref) == 1 and len(alt) > 1:
        # this is a insertion
        if alt[0] == ref:
            hgvs = 'chr{0}:g.{1}_{2}ins'.format(chrom, pos, int(pos) + 1)
            ins_seq = alt[1:]
            hgvs += ins_seq
        else:
            hgvs = 'chr{0}:g.{1}delins{2}'.format(chrom, pos, alt)
    elif len(ref) > 1 and len(alt) > 1:
        end = int(pos) + len(alt) - 1
        hgvs = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, pos, end, alt)
    else:
        raise ValueError("Cannot convert {} into HGVS id.".format((chrom, pos, ref, alt)))
    return hgvs

def read_input_pheno_file(input_phenotype):
    if not input_phenotype:
        return '', ''
    text = StringIO(unicode(input_phenotype), newline=None)
    lines = text.readlines()
    lines = [line.strip() for line in lines]
    phenos = []
    for line in lines:
        if not line:
            continue
        phenos_each_line = re.split(r'  +|\t+|,|;|\.|\|', line.strip())
        phenos_each_line = [re.sub(r'^\W+|\W+$', '', s) for s in phenos_each_line]
        phenos_each_line = [s.lower() for s in phenos_each_line if s]
        phenos += phenos_each_line

    corner_cases = dict()
    for pheno in phenos:
        if re.search('development', pheno) and re.search('delay', pheno) and not re.search('growth', pheno):
            phenos.append('growth delay')
            corner_cases['growth delay'] = pheno.strip()
    for pheno in phenos:
        if re.search('growth', pheno) and re.search('delay', pheno) and not re.search('development', pheno):
            phenos.append('developmental delay')
            corner_cases['developmental delay'] = pheno.strip()
    return phenos, corner_cases

def read_input_gene_file(input_gene):
    candidate_vars = []
    input_gene = input_gene.split('\n')

    for line in input_gene:
        if line and line[:2] != "##":
            header = line
            sniffer = csv.Sniffer()
            dialect = sniffer.sniff(header)
            delimiter =  dialect.delimiter
            field_names = header.split(delimiter)
            break

    chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx = None, None, None, None, None

    for idx in xrange(len(field_names)):
        field = field_names[idx]
        if re.match(r'chrom|#chrom', field, re.I): chrom_idx = idx
        if re.match(r'pos|start', field, re.I): pos_idx = idx
        if re.match(r'ref', field, re.I): ref_idx = idx
        if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
        if re.match(r'gene (gene)|gene', field, re.I): gene_idx = idx

    input_gene_list = []
    CANDIDATE_GENES = []
    for line in input_gene[1:]:
        if not line:
            continue
        line = line.rstrip()
        parts = re.split(r'%s' % delimiter, line)
        if line.startswith('#'):
            continue
        input_gene_list.append(parts)
        gene, transcript, variant, variant_id = '', '', '', ''
        if gene_idx:
            gene = parts[gene_idx]
            CANDIDATE_GENES.append(gene)
        for part in parts:
            if re.search(r'_.*:c\.', part):
                transcript, variant = part.split(':')
            else:
                if re.search(r'c\.', part):
                    variant = part
                if re.search(r'NM_', part, re.I):
                    transcript = part.split(':')[0]
            if re.search(r'_.*:g\.', part):
                variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
            if re.search(r'chr.*:g\.', part, re.I):
                variant_id = part
        if not variant_id and (chrom_idx is not None and pos_idx is not None and ref_idx is not None and alt_idx is not None):
            chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
            variant_id = format_hgvs(chrome, pos, ref, alt)
        candidate_vars.append((gene, variant, transcript, variant_id))

    # remove lines in the input file which has wrong number of fields
    field_nums = []
    for line in input_gene_list:
        field_nums.append(len(line))
    count = Counter(field_nums)
    correct_field_num = count.most_common()[0][0]
    correct_input_gene_list = []
    for line in input_gene_list:
        if len(line) == correct_field_num:
            correct_input_gene_list.append(line)
    df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)
    return candidate_vars, CANDIDATE_GENES, df_genes, field_names 

def map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars):
	ranking_genes, ranking_disease = map_phenotype_to_gene.generate_score(phenos, CANDIDATE_GENES, corner_cases)
	# collect variant info
	hpo_filtered_genes = np.unique([i[0] for i in ranking_genes]).tolist()

	tmp_candidate_vars = []
	for var in candidate_vars:
		if var[0] in hpo_filtered_genes:
			tmp_candidate_vars.append(var)
	candidate_vars = tmp_candidate_vars
	return ranking_genes, candidate_vars

# def master_function(raw_input_id):
# 	status_step = "generating candidate variants ..." 
# 	raw_input = Raw_input_table.objects.get(id=raw_input_id)
# 	input_gene = raw_input.raw_input_gene
# 	input_phenotype = raw_input.raw_input_phenotype
input_gene = open("/Users/Tianqi/Desktop/vcfsample.txt",'rU').read()
input_phenotype = ''
# Read input pheno file and generate phenos and corner_cases 
phenos, corner_cases = read_input_pheno_file(input_phenotype)


# Read input gene file and generate candidate_vars. candidate_vars are (gene, variant, transcript, variant_id); CANDIDATE_GENES is a list of gene symbols; df_genes is a dataframe that keeps all the data that user uploaded; field_names are header of the input gene file 
candidate_vars, CANDIDATE_GENES, df_genes, field_names = read_input_gene_file(input_gene)

if not CANDIDATE_GENES:
    # collect variant info
#     raw_input.status = "Annotating variants using genomic databases"
#     raw_input.save()
    final_res, variants = collectVariantInfo.get_variants_from_vcf(candidate_vars)
    CANDIDATE_GENES = [_[0] for _ in final_res]
    # map phenotype to gene; the candidate_vars was filtered: if it is a gene associated with phenos, then keep it.
    if phenos:
#         raw_input.status = "Maping phenotypes to genes"
#         raw_input.save()
        ranking_genes, candidate_vars = map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars)
    else:
        ranking_genes = []
        for gene in CANDIDATE_GENES:
            ranking_genes.append((gene, 1.0, 1))
else:
    # map phenotype to gene; the candidate_vars was filtered: if it is a gene associated with phenos, then keep it.
    if phenos:
#         raw_input.status = "Maping phenotypes to genes"
#         raw_input.save()
        ranking_genes, candidate_vars = map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars)
    else:
        ranking_genes = []
        for gene in CANDIDATE_GENES:
            ranking_genes.append((gene, 1.0, 1))
    # collect variant info
#     raw_input.status = "Annotating variants using genomic databases"
#     raw_input.save()
    final_res, variants = collectVariantInfo.get_variants(candidate_vars)

if final_res == [] and variants == defaultdict(dict):
#     return None, df_genes, phenos, field_names, None, None
    1
else:
    # pubmed
    # raw_input.status = "Searching biomedical literatures"
    # raw_input.save()
    df_pubmed = pubmed.queryPubmedDB(final_res)

    # ACMG
    # raw_input.status = "Checking ACMG standard"
    # raw_input.save()
    df_hpo_ranking_genes = pd.DataFrame(ranking_genes, columns=['gene', 'score', 'hits'])
    df_hpo_ranking_genes = df_hpo_ranking_genes[['gene', 'score']]
    ACMG_result, variant_ACMG_interpretation, variant_ACMG_interpret_chinese, df_variant_ACMG_interpret, df_variant_ACMG_interpret_chinese = ACMG.Get_ACMG_result(df_hpo_ranking_genes, variants, df_pubmed)

    # filter variant on phenotype

    if phenos:
    #     raw_input.status = "Filtering variants based on phenotypes"
    #     raw_input.save()
        df_final_res, variant_ACMG_interpretation, variant_ACMG_interpret_chinese = filterVariantOnPhenotype.generateOutput(variants, ACMG_result, phenos, variant_ACMG_interpretation, variant_ACMG_interpret_chinese)
    #     return df_final_res, df_genes, phenos, field_names, variant_ACMG_interpretation, variant_ACMG_interpret_chinese

    # else:
    #     return ACMG_result, df_genes, phenos, field_names, df_variant_ACMG_interpret, df_variant_ACMG_interpret_chinese

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
final_res, variants = collectVariantInfo.get_variants_from_vcf(candidate_vars)

In [23]:
candidate_vars

[('', '', '', 'chr20:g.14370G>A')]

In [2]:
final_res

[(u'PTPN11', u'c.923A>G', u'p.Asn308Ser')]

In [21]:
from collections import defaultdict

In [4]:
variants

defaultdict(dict,
            {(u'PTPN11', u'c.923A>G'): {'alt': u'G',
              'clinvar_pathogenicity': u'Pathogenic|Pathogenic|Pathogenic|Pathogenic',
              'clinvar_pmids': [],
              'clinvar_review_status': '',
              'clinvar_variation_ids': '',
              'dann': '0.998400682813',
              'dbscSNV_ada_score': '',
              'dbscSNV_rf_score': '',
              'effect': u'missense_variant',
              'exon': u'8',
              'fathmm': '0.99101',
              'gene': u'PTPN11',
              'gerp++': '5.64',
              'id': u'chr12:g.112915524A>G',
              'interpro_domain': [u'PTP type protein phosphatase|Protein-tyrosine phosphatase-like'],
              'maf_1000g': '',
              'maf_esp6500': '',
              'maf_exac': '',
              'metasvm': '0.99878',
              'protein': u'p.Asn308Ser',
              'ref': u'A',
              'rsid': u'rs121918455',
              'transcript': u'NM_002834.3',
    

In [11]:
pubmed.queryPubmedDB(final_res)

KeyError: ('CD59', 'c.146delA')

In [7]:
df_genes

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001,NA00002,NA00003
0,12,112915524,rs6054257,A,G,29,PASS,NS=3;DP=14;AF=0.5;DB;H2,GT:GQ:DP:HQ,"0|0:48:1:51,51","1|0:48:8:51,51","1/1:43:5:.,."


In [4]:
a = '[{"#CHROM":"16","POS":"50745926","ID":"rs2066844","REF":"C","ALT":"T","QUAL":"80","FILTER":"PASS","INFO":"NS=3;DP=14;AF=0.5;DB;H2","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:48:1:51,51","NA00002":"1|0:48:8:51,51","NA00003":"1\/1:43:5:.,."},{"#CHROM":"20","POS":"14370","ID":"rs6054257","REF":"G","ALT":"A","QUAL":"29","FILTER":"PASS","INFO":"NS=3;DP=14;AF=0.5;DB;H2","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:48:1:51,51","NA00002":"1|0:48:8:51,51","NA00003":"1\/1:43:5:.,."},{"#CHROM":"20","POS":"17330","ID":".","REF":"T","ALT":"A","QUAL":"3","FILTER":"q10","INFO":"NS=3;DP=11;AF=0.017","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:49:3:58,50","NA00002":"0|1:3:5:65,3","NA00003":"0\/0:41:3"},{"#CHROM":"20","POS":"1110696","ID":"rs6040355","REF":"A","ALT":"G,T","QUAL":"67","FILTER":"PASS","INFO":"NS=2;DP=10;AF=0.333,0.667;AA=T;DB","FORMAT":"GT:GQ:DP:HQ","NA00001":"1|2:21:6:23,27","NA00002":"2|1:2:0:18,2","NA00003":"2\/2:35:4"},{"#CHROM":"20","POS":"1230237","ID":".","REF":"T","ALT":"G","QUAL":"47","FILTER":"PASS","INFO":"NS=3;DP=13;AA=T","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:54:7:56,60","NA00002":"0|0:48:4:51,51","NA00003":"0\/0:61:2"},{"#CHROM":"20","POS":"1230288","ID":".","REF":"T","ALT":".","QUAL":"50","FILTER":"PASS","INFO":"NS=3;DP=13;AA=T","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:54:7:56,60","NA00002":"0|0:48:4:51,51","NA00003":"0\/0:61:2"},{"#CHROM":"20","POS":"1234567","ID":"microsat1","REF":"GTCT","ALT":"G,GTACT","QUAL":"50","FILTER":"PASS","INFO":"NS=3;DP=9;AA=G","FORMAT":"GT:GQ:DP","NA00001":"0\/1:35:4","NA00002":"0\/2:17:2","NA00003":"1\/1:40:3"}]'

In [7]:
a.split("},{")

['[{"#CHROM":"16","POS":"50745926","ID":"rs2066844","REF":"C","ALT":"T","QUAL":"80","FILTER":"PASS","INFO":"NS=3;DP=14;AF=0.5;DB;H2","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:48:1:51,51","NA00002":"1|0:48:8:51,51","NA00003":"1\\/1:43:5:.,."',
 '"#CHROM":"20","POS":"14370","ID":"rs6054257","REF":"G","ALT":"A","QUAL":"29","FILTER":"PASS","INFO":"NS=3;DP=14;AF=0.5;DB;H2","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:48:1:51,51","NA00002":"1|0:48:8:51,51","NA00003":"1\\/1:43:5:.,."',
 '"#CHROM":"20","POS":"17330","ID":".","REF":"T","ALT":"A","QUAL":"3","FILTER":"q10","INFO":"NS=3;DP=11;AF=0.017","FORMAT":"GT:GQ:DP:HQ","NA00001":"0|0:49:3:58,50","NA00002":"0|1:3:5:65,3","NA00003":"0\\/0:41:3"',
 '"#CHROM":"20","POS":"1110696","ID":"rs6040355","REF":"A","ALT":"G,T","QUAL":"67","FILTER":"PASS","INFO":"NS=2;DP=10;AF=0.333,0.667;AA=T;DB","FORMAT":"GT:GQ:DP:HQ","NA00001":"1|2:21:6:23,27","NA00002":"2|1:2:0:18,2","NA00003":"2\\/2:35:4"',
 '"#CHROM":"20","POS":"1230237","ID":".","REF":"T","ALT":"G","QUAL":"47

In [50]:
b='[{"Gene":"AARS","Transcript":"NM_001605.2","cDNA (cNomen)":"c.1786-5T>C","Protein (pNomen)":"","Target Position":"chr16:70293094-70293094"},{"Gene":"AP4E1","Transcript":"NM_007347.4","cDNA (cNomen)":"c.2553A>T","Protein (pNomen)":"p.Glu851Asp","Target Position":"chr15:51289729-51289729"},{"Gene":"ARFGEF2","Transcript":"NM_006420.2","cDNA (cNomen)":"c.3120C>T","Protein (pNomen)":"p.Leu1040Leu","Target Position":"chr20:47611134-47611134"},{"Gene":"ARL2BP","Transcript":"NM_012106.3","cDNA (cNomen)":"c.207C>T","Protein (pNomen)":"p.Tyr69Tyr","Target Position":"chr16:57282555-57282555"},{"Gene":"ASPM","Transcript":"NM_018136.4","cDNA (cNomen)":"c.7812A>T","Protein (pNomen)":"p.Lys2604Asn","Target Position":"chr1:197070569-197070569"},{"Gene":"CD96","Transcript":"NM_198196.2","cDNA (cNomen)":"c.1295A>T","Protein (pNomen)":"p.Gln432Leu","Target Position":"chr3:111342667-111342667"},{"Gene":"CHAMP1","Transcript":"NM_001164144.1","cDNA (cNomen)":"c.449_451delCTC","Protein (pNomen)":"p.Pro150del","Target Position":"chr13:115089762-115089766"},{"Gene":"CREBBP","Transcript":"NM_004380.2","cDNA (cNomen)":"c.5354G>A","Protein (pNomen)":"p.Cys1785Tyr","Target Position":"chr16:3779694-3779694"},{"Gene":"DYNC1H1","Transcript":"NM_001376.4","cDNA (cNomen)":"c.634A>G","Protein (pNomen)":"p.Met212Val","Target Position":"chr14:102446171-102446171"},{"Gene":"DYNC2H1","Transcript":"NM_001080463.1","cDNA (cNomen)":"c.2479A>G","Protein (pNomen)":"p.Ile827Val","Target Position":"chr11:103006582-103006582"},{"Gene":"ERCC6L2","Transcript":"NM_020207.4","cDNA (cNomen)":"c.4123A>C","Protein (pNomen)":"p.Thr1375Pro","Target Position":"chr9:98774922-98774922"},{"Gene":"FRAS1","Transcript":"NM_025074.6","cDNA (cNomen)":"c.3065A>C","Protein (pNomen)":"p.Lys1022Thr","Target Position":"chr4:79295319-79295319"},{"Gene":"GBA2","Transcript":"NM_020944.2","cDNA (cNomen)":"c.379C>T","Protein (pNomen)":"p.Arg127Trp","Target Position":"chr9:35744684-35744684"},{"Gene":"KAT6B","Transcript":"NM_012330.3","cDNA (cNomen)":"c.4097_4105dupAAGAGGAAG","Protein (pNomen)":"p.Glu1366_Glu1368dup","Target Position":"chr10:76788659-76788660"},{"Gene":"KLLN","Transcript":"NM_001126049.1","cDNA (cNomen)":"c.17C>T","Protein (pNomen)":"p.Pro6Leu","Target Position":"chr10:89622228-89622228"},{"Gene":"LARP7","Transcript":"NM_016648.3","cDNA (cNomen)":"c.-2-4A>G","Protein (pNomen)":"","Target Position":"chr4:113565820-113565820"},{"Gene":"LIFR","Transcript":"NM_002310.5","cDNA (cNomen)":"c.46G>A","Protein (pNomen)":"p.Asp16Asn","Target Position":"chr5:38530704-38530704"},{"Gene":"MC1R","Transcript":"NM_002386.3","cDNA (cNomen)":"c.892T>C","Protein (pNomen)":"p.Tyr298His","Target Position":"chr16:89986558-89986558"},{"Gene":"MCM4","Transcript":"NM_005914.3","cDNA (cNomen)":"c.2102G>A","Protein (pNomen)":"p.Arg701Gln","Target Position":"chr8:48885590-48885590"},{"Gene":"NBAS","Transcript":"NM_015909.3","cDNA (cNomen)":"c.5110C>T","Protein (pNomen)":"p.His1704Tyr","Target Position":"chr2:15427225-15427225"},{"Gene":"NBAS","Transcript":"NM_015909.3","cDNA (cNomen)":"c.2290C>T","Protein (pNomen)":"p.Pro764Ser","Target Position":"chr2:15601374-15601374"},{"Gene":"NFKB1","Transcript":"NM_003998.3","cDNA (cNomen)":"c.1601G>A","Protein (pNomen)":"p.Arg534His","Target Position":"chr4:103518782-103518782"},{"Gene":"PAH","Transcript":"NM_000277.1","cDNA (cNomen)":"c.1208C>T","Protein (pNomen)":"p.Ala403Val","Target Position":"chr12:103234285-103234285"},{"Gene":"PDE6C","Transcript":"NM_006204.3","cDNA (cNomen)":"c.2087C>T","Protein (pNomen)":"p.Thr696Met","Target Position":"chr10:95418708-95418708"},{"Gene":"PIGN","Transcript":"NM_176787.4","cDNA (cNomen)":"c.1372G>A","Protein (pNomen)":"p.Ala458Thr","Target Position":"chr18:59780429-59780429"},{"Gene":"POC1B","Transcript":"NM_172240.2","cDNA (cNomen)":"c.179A>G","Protein (pNomen)":"p.Lys60Arg","Target Position":"chr12:89891041-89891041"},{"Gene":"PRF1","Transcript":"NM_001083116.1","cDNA (cNomen)":"c.1528T>C","Protein (pNomen)":"p.Cys510Arg","Target Position":"chr10:72357949-72357949"},{"Gene":"PROP1","Transcript":"NM_006261.4","cDNA (cNomen)":"c.652A>C","Protein (pNomen)":"p.Ser218Arg","Target Position":"chr5:177419739-177419739"},{"Gene":"RELN","Transcript":"NM_005045.3","cDNA (cNomen)":"c.8795C>A","Protein (pNomen)":"p.Ser2932Tyr","Target Position":"chr7:103138572-103138572"},{"Gene":"RYR1","Transcript":"NM_000540.2","cDNA (cNomen)":"c.8959G>C","Protein (pNomen)":"p.Glu2987Gln","Target Position":"chr19:39001164-39001164"},{"Gene":"SH3PXD2B","Transcript":"NM_001017995.2","cDNA (cNomen)":"c.921G>C","Protein (pNomen)":"p.Gln307His","Target Position":"chr5:171777458-171777458"},{"Gene":"SLC39A4","Transcript":"NM_130849.3","cDNA (cNomen)":"c.1126G>A","Protein (pNomen)":"p.Ala376Thr","Target Position":"chr8:145639669-145639669"},{"Gene":"SLC4A4","Transcript":"NM_003759.3","cDNA (cNomen)":"c.2542T>C","Protein (pNomen)":"p.Phe848Leu","Target Position":"chr4:72413417-72413417"},{"Gene":"SMPD1","Transcript":"NM_000543.4","cDNA (cNomen)":"c.108_109insGCG","Protein (pNomen)":"p.Val36_Leu37insAla","Target Position":"chr11:6411935-6411936"},{"Gene":"SRCAP","Transcript":"NM_006662.2","cDNA (cNomen)":"c.7853A>G","Protein (pNomen)":"p.Asn2618Ser","Target Position":"chr16:30749214-30749214"},{"Gene":"TCOF1","Transcript":"NM_001135243.1","cDNA (cNomen)":"c.925G>C","Protein (pNomen)":"p.Gly309Arg","Target Position":"chr5:149753791-149753791"},{"Gene":"TG","Transcript":"NM_003235.4","cDNA (cNomen)":"c.2222C>T","Protein (pNomen)":"p.Thr741Met","Target Position":"chr8:133900274-133900274"},{"Gene":"TMPRSS15","Transcript":"NM_002772.2","cDNA (cNomen)":"c.687T>G","Protein (pNomen)":"p.Phe229Leu","Target Position":"chr21:19737543-19737543"},{"Gene":"UBR1","Transcript":"NM_174916.2","cDNA (cNomen)":"c.3959T>C","Protein (pNomen)":"p.Leu1320Pro","Target Position":"chr15:43281055-43281055"},{"Gene":"ZNF513","Transcript":"NM_144631.5","cDNA (cNomen)":"c.901G>A","Protein (pNomen)":"p.Gly301Arg","Target Position":"chr2:27601137-27601137"}]'

In [51]:
[i.split(":")[0][1:-1] for i in b.split("},{")[0][2:].split(',')]

['Gene', 'Transcript', 'cDNA (cNomen)', 'Protein (pNomen)', 'Target Position']

In [52]:
[i.split('":"')[0][1:-1] for i in a.split("},{")[0][2:].split('","')]

['#CHRO',
 'O',
 '',
 'E',
 'L',
 'UA',
 'ILTE',
 'NF',
 'ORMA',
 'A0000',
 'A0000',
 'A0000']

In [38]:
input_gene_field

['#CHRO',
 'O',
 '',
 'E',
 'L',
 'UA',
 'ILTE',
 'NF',
 'ORMA',
 'A0000',
 'A0000',
 'A0000']

In [94]:
[i.split(":")[0][1:-1] for i in a.split("},{")[0][2:].split(',')]

['#CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'QUAL',
 'FILTER',
 'INFO',
 'FORMAT',
 'NA00001',
 '1',
 'NA00002',
 '1',
 'NA00003',
 '']

In [93]:
[i.split('":"')[0] for i in a.split("},{")[0][3:].split('","')]

['#CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'QUAL',
 'FILTER',
 'INFO',
 'FORMAT',
 'NA00001',
 'NA00002',
 'NA00003']

In [88]:
b.split("},{")[0][3:]

'Gene":"AARS","Transcript":"NM_001605.2","cDNA (cNomen)":"c.1786-5T>C","Protein (pNomen)":"","Target Position":"chr16:70293094-70293094"'

In [4]:
from langdetect import detect

In [7]:
detect(u"任务1")

'zh-cn'

In [20]:
from oauth2client.service_account import ServiceAccountCredentials

# scopes = ['https://www.googleapis.com/auth/translateservice.admin']

credentials = ServiceAccountCredentials.from_json_keyfile_name(
    '/Users/Tianqi/Keep_Learning/google_api/translate-7696ad4e40b3.json')

In [21]:
from httplib2 import Http

http_auth = credentials.authorize(Http())

In [22]:
from apiclient.discovery import build
service = build('translate', 'v2', http=http_auth)

Traceback (most recent call last):
  File "/Users/Tianqi/anaconda/lib/python2.7/site-packages/googleapiclient/discovery_cache/__init__.py", line 41, in autodetect
    from . import file_cache
  File "/Users/Tianqi/anaconda/lib/python2.7/site-packages/googleapiclient/discovery_cache/file_cache.py", line 41, in <module>
    'file_cache is unavailable when using oauth2client >= 4.0.0')
ImportError: file_cache is unavailable when using oauth2client >= 4.0.0


TypeError: cannot instantiate ctype 'EVP_MD_CTX' of unknown size

In [None]:
service = build('translate', 'v2',
        developerKey='e37b6a3f5406868430b2b58d7b65825e4cb21082')
service.translations().list(
  source='en',
  target='fr',
  q=['flower', 'car']
).execute()

In [64]:
# Imports the Google Cloud client library
from google.cloud import translate

# Instantiates a client
translate_client = translate.Client()

# The text to translate
text = u'Hello, world!'
# The target language
target = 'ru'

# Translates some text into Russian
translation = translate_client.translate(
    text,
    target_language=target)

print(u'Text: {}'.format(text))
print(u'Translation: {}'.format(translation['translatedText']))



Forbidden: 403 Daily Limit Exceeded (GET https://translation.googleapis.com/language/translate/v2?target=ru&q=Hello%2C+world%21)

In [13]:
from google.cloud import translate

translate_client = translate.Client()
text = u'Hello'
target = 'zh'

translation = translate_client.translate(text,target_language=target)
print(u'Translation: {}'.format(translation['translatedText']))



Translation: 你好


In [15]:
text = u'为什么时而好用时而不好用'
target = 'en'
translation = translate_client.translate(text,target_language=target)
translation['translatedText']

u'Why sometimes easy to use and sometimes easy to use'

In [24]:
sourceLang = 'zh-Hans'
targetLang = 'en'
sourceText = '你好啊'

url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=" + sourceLang + "&tl=" + targetLang + "&dt=t&q=" + sourceText

In [22]:
url

'https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q=\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x95\x8a'

In [35]:
url = 'response'

In [None]:
https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q=%E4%BD%A0%E5%A5%BD

In [31]:
import urllib

f = urllib.urlopen(url)
myfile = f.read()
print myfile

<!DOCTYPE html><html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 403 (Forbidden)!!1</title><style>*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/

In [140]:
import urllib2
site = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q="+iput_text
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'}
req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError, e:
    print e

content = page.read()
translate = json.loads(content)[0][0][0]
print translate

Non-developed, severe global growth retardation, short stature, high blood pressure, small head deformity, cortical visual impairment, phalanx smooth muscle, overlapping toes, ball nose, nasal inactivity, delayed toothbrush, frontal lobelia, scleral uveitis, 


In [141]:
json.loads(content)

[[[u'Non-developed, severe global growth retardation, short stature, high blood pressure, small head deformity, cortical visual impairment, phalanx smooth muscle, overlapping toes, ball nose, nasal inactivity, delayed toothbrush, frontal lobelia, scleral uveitis, ',
   u'\u4e0d\u53d1\u80b2\uff0c\u4e25\u91cd\u7684\u5168\u7403\u53d1\u80b2\u8fdf\u7f13\uff0c\u8eab\u6750\u77ee\u5c0f\uff0c\u9ad8\u8840\u538b\uff0c\u5c0f\u5934\u7578\u5f62\uff0c\u76ae\u5c42\u89c6\u89c9\u969c\u788d\uff0c\u8dbe\u9aa8\u5e73\u6ed1\u808c\uff0c\u91cd\u53e0\u8dbe\uff0c\u7403\u9f3b\uff0c\u9f3b\u8154\u4e0d\u53d1\u8fbe\uff0c\u5ef6\u8fdf\u7259\u55b7\uff0c\u989d\u53f6\u591a\u6bdb\u75c7\uff0c\u5de9\u819c\u8461\u8404\u819c\u708e\uff0c',
   None,
   None,
   3],
  [u'Vertical, low tension, temporal disc, loose esophageal sphincter, club foot, malformations, malnutrition, congenital CMV, denervated gyrus, lower hippocampus, corpuscles, body, hemangioma, birthmark, forehead hirsutism, eye, hearing',
   u'\u5782\u76f4\u6027\u4f4

In [147]:
import map_phenotype_to_gene
import collectVariantInfo
import pubmed
import ACMG
import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter
from collections import defaultdict
from langdetect import detect
import urllib2
import json


In [148]:
def read_input_pheno_file(input_phenotype):
	if not input_phenotype:
		return '', '', ''
	language = detect(unicode(input_phenotype))
	if language == "zh-cn":
		site = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q="+input_phenotype
		hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
				'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
				'Accept-Encoding': 'none',
				'Accept-Language': 'en-US,en;q=0.8',
				'Connection': 'keep-alive'}
		req = urllib2.Request(site, headers=hdr)
		try:
			page = urllib2.urlopen(req)
			content = page.read()
			input_phenotype = json.loads(content)[0][0][0]
		except urllib2.HTTPError, e:
			return '', '', ''
	text = StringIO(unicode(input_phenotype), newline=None)
	lines = text.readlines()
	lines = [line.strip() for line in lines]
	phenos = []
	for line in lines:
		if not line:
			continue
        phenos_each_line = re.split(r'  +|\t+|,|;|\.|\|', line.strip())
        phenos_each_line = [re.sub(r'^\W+|\W+$', '', s) for s in phenos_each_line]
        phenos_each_line = [s.lower() for s in phenos_each_line if s]
        phenos += phenos_each_line

	corner_cases = dict()
	original_phenos = [_.strip() for _ in phenos]
	for pheno in phenos:
		if re.search('development', pheno) and re.search('delay', pheno) and not re.search('growth', pheno):
			phenos.append('growth delay')
			orner_cases['growth delay'] = pheno.strip()
		for pheno in phenos:
			if re.search('growth', pheno) and re.search('delay', pheno) and not re.search('development', pheno):
				phenos.append('developmental delay')
				corner_cases['developmental delay'] = pheno.strip()
		phenos = [_.strip() for _ in phenos]
		phenos = list(set(phenos))
		return phenos, corner_cases, original_phenos

In [149]:
read_input_pheno_file('Lamdoid, Sagittal, Craniosynostosis, Metopic, macular atrophy, anetoderma, short stature, developmental delay, Ventriculomegaly, growth hormone deficiency')

([u'short stature',
  u'craniosynostosis',
  u'ventriculomegaly',
  u'metopic',
  u'growth hormone deficiency',
  u'anetoderma',
  u'lamdoid',
  u'macular atrophy',
  u'developmental delay',
  u'sagittal'],
 {},
 [u'lamdoid',
  u'sagittal',
  u'craniosynostosis',
  u'metopic',
  u'macular atrophy',
  u'anetoderma',
  u'short stature',
  u'developmental delay',
  u'ventriculomegaly',
  u'growth hormone deficiency'])

In [116]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

In [117]:
import urllib2
import json

In [118]:
from langdetect import detect

In [218]:
input_phenotype = '老年痴呆'

In [219]:
# language = detect(unicode(input_phenotype))
# if language == "zh-cn" or language == "ko":
site = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q="+input_phenotype
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'}
req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
    content = page.read()
    input_phenotype = json.loads(content)[0][0][0]
except urllib2.HTTPError, e:
    print e
        

# match_result = map2hpoWithPhenoSynonyms(input_phenotype)
# match_result = sorted(match_result, key = lambda x: x[2], reverse = True)
# if match_result==[]:
#     match_result = ''
# elif match_result[0][0] == 'Familial  hyperprolactinemia':
#     match_result = ''
# else:
#     if match_result[0][2] == 1.0:
#         match_result = match_result[:1]
#     match_id = [i[1] for i in match_result]
#     for indx,i in enumerate(match_id):
#         if i[-7:]=='synonym':
#             match_id[indx] = match_id[indx][:-8]
#     match_table = chpo[chpo['编号'].isin(match_id)].iloc[:,[3,2,1,0,5]].reset_index(drop=True)

In [220]:
input_phenotype

u"Alzheimer's disease"

In [122]:
input_pheno = input_phenotype

In [134]:
from map_phenotype_to_gene import map2hpoWithPhenoSynonyms

In [158]:
def smart_match(input_en, chpo):
    search_results = google.search(input_en, 1)
    wiki = list(set([i.name[:-12] for i in search_results if i.name[-9:]=='Wikipedia']))
    for i in wiki:
        try:
            wiki_match = chpo[chpo['表型英文名']==i]
            return wiki_match.to_json(orient='records')
        except:
            pass

    match_result = map2hpoWithPhenoSynonyms(input_en)
    match_result = sorted(match_result, key = lambda x: x[2], reverse = True)
    if match_result == []:
        match_result = []
    else:
        if match_result[0][2] == 1.0:
            match_result = match_result[:1]
        match_id = [i[1] for i in match_result]
        for indx,i in enumerate(match_id):
            if i[-7:]=='synonym':
                match_id[indx] = match_id[indx][:-8]
        match_table = chpo[chpo.iloc[:,2].isin(match_id)].iloc[:7,:].reset_index(drop=True)
        match_result = match_table.to_json(orient='records')
    return match_result

In [159]:
chpo = pd.read_excel("data/chpo.2016-10.xls")
chpo.columns = ['类别','HPO编号','表型英文名','表型中文名','英文释义','释义']
chpo = chpo.iloc[:,[3,2,1,0,5]]
match_result = smart_match(input_pheno, chpo)

In [160]:
input_pheno

u'Heartbeat'

In [161]:
match_result

'[]'

In [162]:
match_result = map2hpoWithPhenoSynonyms('Heartbeat')

In [163]:
match_result

[('Irregular heartbeat', 'HP:0011675-synonym', 0.5)]

In [164]:
match_id = [i[1] for i in match_result]

In [165]:
match_id

['HP:0011675-synonym']

In [166]:
for indx,i in enumerate(match_id):
    if i[-7:]=='synonym':
        match_id[indx] = match_id[indx][:-8]
match_table = chpo[chpo.iloc[:,2].isin(match_id)].iloc[:7,:].reset_index(drop=True)

In [167]:
match_table

Unnamed: 0,表型中文名,表型英文名,HPO编号,类别,释义
0,心律失常,Arrhythmia,HP:0011675,心血管系统的异常,非正常窦性心律的其他心律。这样的心律可能是窦或异位的起源，有的规律有的不规律。心律失常由心脏...


In [168]:
detect(unicode('心率不齐'))

'ko'

In [169]:
jieba.cut_for_search('心率不齐')

<generator object cut_for_search at 0x1262e7500>

In [129]:
import google

In [136]:
from google import google

In [18]:
num_page = 1
search_results = google.search("afraid of light", 1)

In [14]:
search_results[1].name

u'Photophobia - Wikipedia'

In [24]:
list(set([i.name[:-12] for i in search_results if i.name[-9:]=='Wikipedia']))

[u'Photophobia', u'Heliophobia']

In [61]:
import json
import requests


TAG_URL = 'http://api.bosonnlp.com/tag/analysis?oov_level=0'
# 如果某个选项采用默认设置，可以在TAG_URL中省略，完整的TAG_URL如下：
# 'http://api.bosonnlp.com/tag/analysis?space_mode=0&oov_level=3&t2s=0&special_char_conv=0'
# 修改space_mode选项为1
# TAG_URL = \
#   'http://api.bosonnlp.com/tag/analysis?space_mode=1'
# 修改oov_level选项为1
# TAG_URL = \
#    'http://api.bosonnlp.com/tag/analysis?oov_level=1'
# 修改t2s选项为1
# TAG_URL= \
#     'http://api.bosonnlp.com/tag/analysis?t2s=1'
# 修改special_char_conv选项为1
# TAG_URL= \
# 'http://api.bosonnlp.com/tag/analysis?special_char_conv=1'

s = ['胎儿宫内发育迟缓']
data = json.dumps(s)
headers = {'X-Token': '2PDrt-DP.16040.4IrgHXFf8JDy'}
resp = requests.post(TAG_URL, headers=headers, data=data.encode('utf-8'))


for i in [i for i in resp.json()[0]['word']]:
    print i

胎儿
宫内
发育
迟缓


In [6]:
resp.json()

[{u'tag': [u'n', u'vi', u'a'],
  u'word': [u'\u4e73\u5934', u'\u53d1\u80b2', u'\u4e0d\u826f']}]

In [12]:
for i in [i for i in resp.json()[0]['word']]:
    print i

乳头
发育
不良


In [3]:
# encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))

Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
Default Mode: 我/ 来到/ 北京/ 清华大学
他, 来到, 了, 网易, 杭研, 大厦
小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ，, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造


In [172]:
for i in list(jieba.cut("癫痫")):
    print i

In [81]:
'hello' in 'hello'

True

In [80]:
input_pheno = '愣神'

In [72]:
manual_dir = {'愣神': '癫痫'}
if input_pheno in manual_dir.keys():
    input_pheno = manual_dir[input_pheno]

In [78]:
print manual_dir.get('愣')

None


In [81]:
input_pheno in manual_dir.keys()

True

In [171]:
for i in list(jieba.cut_for_search('心率不齐')):
    print i

In [174]:
input_pheno = '心率不齐'

In [182]:
if not input_pheno:
    match_result = ''
else:
    chpo = pd.read_excel("data/chpo.2016-10.xls")
    chpo.columns = ['类别','HPO编号','表型英文名','表型中文名','英文释义','释义']
    chpo = chpo.iloc[:,[3,2,1,0,5]]
    word_cant_be_detected = ['癫痫']
    manual_dir = {u'愣神': u'癫痫'}
    if input_pheno in manual_dir.keys():
        input_pheno = manual_dir[input_pheno]
    if input_pheno in word_cant_be_detected or detect(unicode(input_pheno)) in ["zh-cn","ko"]:
        direct_match = chpo[chpo['表型中文名']==input_pheno]
        substring_match = [i for i,j in enumerate(list(chpo['表型中文名'])) if set(list(jieba.cut_for_search(input_pheno))).issubset(set(list(jieba.cut_for_search(j))))]
        if len(direct_match) > 0:
            match_result = direct_match.to_json(orient='records')
        elif len(substring_match) > 0:
            match_table = chpo.iloc[substring_match,:].reset_index(drop=True)
            match_result = match_table.to_json(orient='records')
        else:
            site = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-Hans&tl=en&dt=t&q="+input_phenotype
            hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Accept-Encoding': 'none',
                    'Accept-Language': 'en-US,en;q=0.8',
                    'Connection': 'keep-alive'}
            req = urllib2.Request(site, headers=hdr)
            page = urllib2.urlopen(req)
            content = page.read()
            input_phenotype = json.loads(content)[0][0][0]
            match_result = smart_match(input_phenotype, chpo)
#     else:
#         match_result = smart_match(input_pheno, chpo)

In [217]:
smart_match(input_phenotype, chpo)

'[{"\\u8868\\u578b\\u4e2d\\u6587\\u540d":"\\u5fc3\\u5f8b\\u5931\\u5e38","\\u8868\\u578b\\u82f1\\u6587\\u540d":"Arrhythmia","HPO\\u7f16\\u53f7":"HP:0011675","\\u7c7b\\u522b":"\\u5fc3\\u8840\\u7ba1\\u7cfb\\u7edf\\u7684\\u5f02\\u5e38","\\u91ca\\u4e49":"\\u975e\\u6b63\\u5e38\\u7aa6\\u6027\\u5fc3\\u5f8b\\u7684\\u5176\\u4ed6\\u5fc3\\u5f8b\\u3002\\u8fd9\\u6837\\u7684\\u5fc3\\u5f8b\\u53ef\\u80fd\\u662f\\u7aa6\\u6216\\u5f02\\u4f4d\\u7684\\u8d77\\u6e90\\uff0c\\u6709\\u7684\\u89c4\\u5f8b\\u6709\\u7684\\u4e0d\\u89c4\\u5f8b\\u3002\\u5fc3\\u5f8b\\u5931\\u5e38\\u7531\\u5fc3\\u810f\\u640f\\u52a8\\u7684\\u8d77\\u6e90\\u6216\\u4f20\\u5bfc\\u7d0a\\u4e71\\u5f15\\u8d77\\u3002"}]'

In [223]:
input_en = input_phenotype

In [226]:
# def smart_match(input_en, chpo):
search_results = google.search(input_en, 1)
wiki = list(set([i.name[:-12] for i in search_results if i.name[-9:]=='Wikipedia']))
for i in wiki:
    try:
        wiki_match = chpo[chpo['表型英文名']==i]
        if len(wiki_match) > 0:
            match_table = wiki_match
#             return wiki_match.to_json(orient='records')
    except:
        pass

match_result = map2hpoWithPhenoSynonyms(input_en)
match_result = sorted(match_result, key = lambda x: x[2], reverse = True)
if match_result == []:
    match_result = []
else:
    if match_result[0][2] == 1.0:
        match_result = match_result[:1]
    match_id = [i[1] for i in match_result]
    for indx,i in enumerate(match_id):
        if i[-7:]=='synonym':
            match_id[indx] = match_id[indx][:-8]
    match_table = chpo[chpo.iloc[:,2].isin(match_id)].iloc[:7,:].reset_index(drop=True)
    match_result = match_table.to_json(orient='records')
#     return match_result

In [228]:
wiki_match

Unnamed: 0,表型中文名,表型英文名,HPO编号,类别,释义


In [229]:
input_phenotype

u"Alzheimer's disease"

In [230]:
wiki

[u"Alzheimer's disease"]

In [7]:
# import map_phenotype_to_gene
# import collectVariantInfo
# import pubmed
# import ACMG
# import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter
from collections import defaultdict
from langdetect import detect
import urllib2
import json
import sys


# reload(sys)
# sys.setdefaultencoding('utf-8')

In [48]:
def read_input_gene_file(input_gene):
    candidate_vars = []
    input_gene = input_gene.split('\n')

    for line in input_gene:
        if line and line[:2] != "##":
            header = line
            sniffer = csv.Sniffer()
            dialect = sniffer.sniff(header)
            delimiter =  dialect.delimiter
            field_names = header.split(delimiter)
            break

    chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx, zygosity_idx = None, None, None, None, None, None
    mother_idx, father_idx = None, None

    for idx in xrange(len(field_names)):
        field = field_names[idx]
        if re.match(r'chrom|#chrom', field, re.I): chrom_idx = idx
        if re.match(r'pos|start', field, re.I): pos_idx = idx
        if re.match(r'ref', field, re.I): ref_idx = idx
        if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
        if re.match(r'gene \(gene\)|gene$', field, re.I): gene_idx = idx
        if re.match(r'zygo', field, re.I): zygosity_idx = idx
        if re.match(r'mot', field, re.I): mother_idx = idx
        if re.match(r'fat', field, re.I): father_idx = idx

    input_gene_list = []
    CANDIDATE_GENES = []
    candidate_vars_zygosity = []
    for line in input_gene[1:]:
        if not line:
            continue
        if line.startswith("#"):
            continue
        line = line.rstrip()
        # print line
        parts = re.split(r'%s' % delimiter, line)
        input_gene_list.append(parts)
        gene, transcript, variant, variant_id, zygosity = '', '', '', '', ''
        if gene_idx is not None:
            gene = parts[gene_idx]
            CANDIDATE_GENES.append(gene)
        for part in parts:
            # print part
            if re.search(r'_.*:c\.', part):
                transcript, variant = part.split(':')
            else:
                if re.search(r'c\.', part):
                    variant = part
                if re.search(r'NM_', part, re.I):
                    transcript = part.split(':')[0]
            if re.search(r'_.*:g\.', part):
                variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
            if re.search(r'chr.*:g\.', part, re.I):
                variant_id = part
            if re.match(r'het|hom|hem|de |comp', part, re.I):	
                zygosity = part

        if not variant_id and (chrom_idx is not None and pos_idx is not None and ref_idx is not None and alt_idx is not None):
            chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
            alts = alt.split(',')
            for alt in alts:
                try:
                    variant_id = format_hgvs(chrome, pos, ref, alt)
                except ValueError:
                    pass
                # print gene, variant, transcript, variant_id
                if not gene and not variant and not transcript and not variant_id:
                    continue
                if mother_idx is not None and father_idx is not None:
                    mother, father = parts[mother_idx], parts[father_idx]                   
                    candidate_vars_zygosity.append((gene, variant, transcript, variant_id, ref, alt, mother, father))
                candidate_vars.append((gene, variant, transcript, variant_id, zygosity))
        else:
            candidate_vars.append((gene, variant, transcript, variant_id, zygosity))
    print candidate_vars
    
    def getZygosityFromVCF():
        gene_zygosity = dict()
        for item in candidate_vars_zygosity:
            gene, variant, transcript, variant_id, ref, alt, mother, father = item
            if gene in gene_zygosity:
                gene_zygosity[gene].append((ref, alt, mother, father)) 
            else:
                gene_zygosity[gene] = [(ref, alt, mother, father)]

        comp_het_genes = [] 
        for gene in gene_zygosity.keys():
            if len(gene_zygosity[gene]) > 1:
                var_from_mother, var_from_father = False, False
                for item in gene_zygosity[gene]:
                    if mother == alt and father == ref:
                        var_from_mother = True
                    if mother == ref and father == alt:
                        var_from_father = True
                if var_from_mother and var_from_father:
                    comp_het_genes.append(gene)
        return comp_het_genes
    
#     print candidate_vars_zygosity
    if candidate_vars_zygosity:
        candidate_vars = []
        comp_het_genes = getZygosityFromVCF()
        for item in candidate_vars_zygosity:
            gene, variant, transcript, variant_id, ref, alt, mother, father = item
            if gene in comp_het_genes:
                zygosity = 'comp het'
            elif not mother or not father:
                zygosity = 'hem'
            else:
                if mother == alt and father == alt:
                    zygosity = 'hom'
                elif mother == ref and father == ref:
                    zygosity = 'de novo'
                else:
                    zygosity = 'het'
            candidate_vars.append((gene, variant, transcript, variant_id, zygosity))
    
    print candidate_vars
    
    tmp_candidate_vars = []
    gene_zygosity = dict()
    for var in candidate_vars:
        gene, variant, transcript, variant_id, zygosity = var
        tmp_candidate_vars.append((gene, variant, transcript, variant_id))
        if variant_id:
            gene_zygosity[variant_id] = zygosity
        if gene and variant:
            gene_zygosity[(gene, variant)] = zygosity
    candidate_vars = tmp_candidate_vars

    # remove lines in the input file which has wrong number of fields
    field_nums = []
    for line in input_gene_list:
        field_nums.append(len(line))
    count = Counter(field_nums)
    correct_field_num = count.most_common()[0][0]
    correct_input_gene_list = []
    for line in input_gene_list:
        if len(line) == correct_field_num:
            correct_input_gene_list.append(line)
    df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)
    return candidate_vars, CANDIDATE_GENES, df_genes, field_names, gene_zygosity 

In [49]:
input_gene = open("/Users/Tianqi/Keep_Learning/bio-nlp/test_data/xiaonan_gene_2.txt", "r").read()

In [50]:
candidate_vars, CANDIDATE_GENES, df_genes, field_names, gene_zygosity = read_input_gene_file(input_gene)

[('AARS', 'c.1786-5T>C', 'NM_001605.2', '', ''), ('AP4E1', 'c.2553A>T', 'NM_007347.4', '', ''), ('ARFGEF2', 'c.3120C>T', 'NM_006420.2', '', ''), ('ARL2BP', 'c.207C>T', 'NM_012106.3', '', ''), ('ASPM', 'c.7812A>T', 'NM_018136.4', '', ''), ('CD96', 'c.1295A>T', 'NM_198196.2', '', ''), ('CHAMP1', 'c.449_451delCTC', 'NM_001164144.1', '', ''), ('CREBBP', 'c.5354G>A', 'NM_004380.2', '', ''), ('DYNC1H1', 'c.634A>G', 'NM_001376.4', '', ''), ('DYNC2H1', 'c.2479A>G', 'NM_001080463.1', '', ''), ('ERCC6L2', 'c.4123A>C', 'NM_020207.4', '', ''), ('FRAS1', 'c.3065A>C', 'NM_025074.6', '', ''), ('GBA2', 'c.379C>T', 'NM_020944.2', '', ''), ('KAT6B', 'c.4097_4105dupAAGAGGAAG', 'NM_012330.3', '', ''), ('KLLN', 'c.17C>T', 'NM_001126049.1', '', ''), ('LARP7', 'c.-2-4A>G', 'NM_016648.3', '', ''), ('LIFR', 'c.46G>A', 'NM_002310.5', '', ''), ('MC1R', 'c.892T>C', 'NM_002386.3', '', ''), ('MCM4', 'c.2102G>A', 'NM_005914.3', '', ''), ('NBAS', 'c.5110C>T', 'NM_015909.3', '', ''), ('NBAS', 'c.2290C>T', 'NM_015909.3

In [51]:
candidate_vars

[('AARS', 'c.1786-5T>C', 'NM_001605.2', ''),
 ('AP4E1', 'c.2553A>T', 'NM_007347.4', ''),
 ('ARFGEF2', 'c.3120C>T', 'NM_006420.2', ''),
 ('ARL2BP', 'c.207C>T', 'NM_012106.3', ''),
 ('ASPM', 'c.7812A>T', 'NM_018136.4', ''),
 ('CD96', 'c.1295A>T', 'NM_198196.2', ''),
 ('CHAMP1', 'c.449_451delCTC', 'NM_001164144.1', ''),
 ('CREBBP', 'c.5354G>A', 'NM_004380.2', ''),
 ('DYNC1H1', 'c.634A>G', 'NM_001376.4', ''),
 ('DYNC2H1', 'c.2479A>G', 'NM_001080463.1', ''),
 ('ERCC6L2', 'c.4123A>C', 'NM_020207.4', ''),
 ('FRAS1', 'c.3065A>C', 'NM_025074.6', ''),
 ('GBA2', 'c.379C>T', 'NM_020944.2', ''),
 ('KAT6B', 'c.4097_4105dupAAGAGGAAG', 'NM_012330.3', ''),
 ('KLLN', 'c.17C>T', 'NM_001126049.1', ''),
 ('LARP7', 'c.-2-4A>G', 'NM_016648.3', ''),
 ('LIFR', 'c.46G>A', 'NM_002310.5', ''),
 ('MC1R', 'c.892T>C', 'NM_002386.3', ''),
 ('MCM4', 'c.2102G>A', 'NM_005914.3', ''),
 ('NBAS', 'c.5110C>T', 'NM_015909.3', ''),
 ('NBAS', 'c.2290C>T', 'NM_015909.3', ''),
 ('NFKB1', 'c.1601G>A', 'NM_003998.3', ''),
 ('PAH',

In [53]:
a = defaultdict(dict)

In [58]:
a.keys()

[0, 12]

In [59]:
a[12]

{}

In [1]:
from google import google

In [8]:
input_gene = 'AHDC1'

In [18]:
search_results = google.search(input_gene+' loss of function', 1)
# wiki = list(set([i.name[:-12] for i in search_results if i.name[-9:]=='Wikipedia']))

In [19]:
for i in search_results:
    for j in i.description.split("..."):
        j = j.replace("\n", "")
        if ('loss-of-function ' in j or ' loss of function 'in j or ' Loss of function 'in j) and (input_gene in j) and (('no' and 'not' and 'whether') not in j):
            print i
            print j
            print '1'
            break
    else:
        continue
    break

In [20]:
i.description.split("...")

[u'AHDC1. Approved Name: AT-hook DNA binding motif containing 1. \nChromosomal ',
 u' Tier 1 Gene with three or more de novo pathogenic loss-of-\nfunction variants.']

In [21]:
i.name

u'AHDC1 - (ADMI) | Geisinger Health System'

In [13]:
search_results

[GoogleResult(name=De novo truncating variants in the AHDC1 gene encoding ..
              description=The AHDC1 gene has only one coding exon, and the ..,
 GoogleResult(name=AHDC1 Gene - GeneCards | AHDC1 Protein | AHDC1 Antibody
              description=Complete information for AHDC1 gene (Protein Codi..,
 GoogleResult(name=New syndrome caused by mutations in AHDC1 -- ScienceDai..
              description=May 1, 2014 ... "Little is known about this gene ..,
 GoogleResult(name=AHDC1 - gene and developmental disorder implications ....
              description=AHDC1 in the Development Disorder Genotype - Phen..,
 GoogleResult(name=AHDC1 - OMIM
              description=Xia et al. (2014) reported that the deduced 1,603..,
 GoogleResult(name=Genic overlap for neurodevelopmental disorders. : Genet..
              description=Oct 27, 2015 ... To assess the significance for o..,
 GoogleResult(name=Molecular characteristics - AHDC1 - Gensites Sites
              description=All affected fe

In [7]:
i

GoogleResult(name=AHDC1 - (ADMI) | Geisinger Health System
             description=AHDC1. Approved Name: AT-hook DNA binding motif c..