In [2]:
# Relative locations of the gene/variant table and the phenotype text.
# NOTE(review): no visible cell reads these two variables; the pipeline cells
# below open hard-coded absolute paths instead -- confirm which is intended.
gene_file_path = 'input/input_genes.txt'
phenotype_file_path = 'input/input_phenotype.txt'

In [10]:
import map_phenotype_to_gene
import collectVariantInfo
import pubmed
import ACMG
import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter

# input_phenotype = 'data/sample_patient_phenotype.txt'
# input_genes = 'data/sample_genes.txt'

def format_hgvs(chrom, pos, ref, alt):
    """Build a genomic HGVS-style identifier from VCF-like fields.

    Args:
        chrom: chromosome name; converted with ``str`` and a leading "chr"
            (any letter case) is removed before "chr" is re-added.
        pos: position; used verbatim where only formatting is needed and
            converted with ``int`` where a range has to be computed.
        ref: reference allele (any sized sequence, normally a string).
        alt: alternate allele (any sized sequence, normally a string).

    Returns:
        A string such as ``chr1:g.35366C>T``, ``chr2:g.17142_17143insA``,
        ``chrMT:g.8271_8279del`` or ``chrX:g.107930849_107930851delinsC``.

    Raises:
        ValueError: if the allele lengths match none of the handled cases
            (for example an empty ``ref`` or ``alt``), or if ``pos`` cannot be
            converted with ``int`` in a branch that needs arithmetic.
    """
    contig = str(chrom)
    if contig.lower().startswith('chr'):
        # drop the existing prefix so it is not duplicated below
        contig = contig[3:]

    ref_len = len(ref)
    alt_len = len(alt)

    if ref_len == 1 and alt_len == 1:
        # single nucleotide substitution
        return 'chr{0}:g.{1}{2}>{3}'.format(contig, pos, ref, alt)

    if ref_len > 1 and alt_len == 1:
        last = int(pos) + ref_len - 1
        if ref[0] == alt:
            # pure deletion: the shared first base is kept, the rest is removed
            first = int(pos) + 1
            return 'chr{0}:g.{1}_{2}del'.format(contig, first, last)
        return 'chr{0}:g.{1}_{2}delins{3}'.format(contig, pos, last, alt)

    if ref_len == 1 and alt_len > 1:
        if alt[0] == ref:
            # pure insertion: everything after the shared first base is new
            prefix = 'chr{0}:g.{1}_{2}ins'.format(contig, pos, int(pos) + 1)
            return prefix + alt[1:]
        return 'chr{0}:g.{1}delins{2}'.format(contig, pos, alt)

    if ref_len > 1 and alt_len > 1:
        # NOTE(review): the end coordinate is derived from len(alt), not
        # len(ref); confirm this matches the IDs the variant lookup expects.
        last = int(pos) + alt_len - 1
        return 'chr{0}:g.{1}_{2}delins{3}'.format(contig, pos, last, alt)

    raise ValueError("Cannot convert {} into HGVS id.".format((contig, pos, ref, alt)))

def read_input_pheno_file(input_phenotype):
    """Split free-text phenotype input into lower-cased phenotype terms.

    Each non-blank line is split on runs of two or more spaces, tabs, commas,
    semicolons, periods and pipes; every piece has leading/trailing non-word
    characters removed and is lower-cased.

    Two synonyms are then injected:
      * a term mentioning "development" and "delay" (but not "growth") adds
        'growth delay';
      * a term mentioning "growth" and "delay" (but not "development") adds
        'developmental delay'.

    Args:
        input_phenotype: raw phenotype text; any falsy value means "no input".

    Returns:
        (phenos, corner_cases): ``phenos`` is the list of terms including the
        injected synonyms; ``corner_cases`` maps each injected synonym to the
        term that triggered it. Empty input returns ``([], {})`` so both paths
        yield the same container types (previously a pair of empty strings).
    """
    if not input_phenotype:
        return [], {}

    # `unicode` only exists on Python 2; fall back to `str` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # newline=None makes StringIO translate \r\n and \r line endings to \n.
    text = StringIO(text_type(input_phenotype), newline=None)
    phenos = []
    for line in text.readlines():
        line = line.strip()
        if not line:
            continue
        pieces = re.split(r'  +|\t+|,|;|\.|\|', line)
        pieces = [re.sub(r'^\W+|\W+$', '', piece) for piece in pieces]
        phenos += [piece.lower() for piece in pieces if piece]

    corner_cases = dict()
    # NOTE(review): both loops append to `phenos` while iterating over it, so
    # the second loop also sees the 'growth delay' entries added by the first
    # and adds 'developmental delay' for them (mapped to 'growth delay').
    # This is kept because downstream scoring may rely on it -- confirm intent.
    for pheno in phenos:
        if re.search('development', pheno) and re.search('delay', pheno) and not re.search('growth', pheno):
            phenos.append('growth delay')
            corner_cases['growth delay'] = pheno.strip()
    for pheno in phenos:
        if re.search('growth', pheno) and re.search('delay', pheno) and not re.search('development', pheno):
            phenos.append('developmental delay')
            corner_cases['developmental delay'] = pheno.strip()
    return phenos, corner_cases

def read_input_gene_file(input_gene):
    """Parse the delimited gene/variant table uploaded by the user.

    The first line is the header; its delimiter is guessed with
    ``csv.Sniffer``. Columns are located by case-insensitive prefix match on
    the header names (when several columns match, the last one wins).

    Args:
        input_gene: whole file content as one string, lines separated by '\\n'.

    Returns:
        (candidate_vars, CANDIDATE_GENES, df_genes, field_names) where
        ``candidate_vars`` is a list of ``(gene, variant, transcript,
        variant_id)`` tuples (one per non-empty data line),
        ``CANDIDATE_GENES`` is the list of gene column values,
        ``df_genes`` is a DataFrame of the rows whose field count equals the
        most common field count, and ``field_names`` is the header split on
        the delimiter.

    Raises:
        csv.Error: if the delimiter cannot be determined from the header.
        IndexError: if a data line has fewer fields than the gene column index.
    """
    lines = input_gene.split('\n')
    header = lines[0]
    delimiter = csv.Sniffer().sniff(header).delimiter
    field_names = header.split(delimiter)
    chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx = None, None, None, None, 0

    for idx, field in enumerate(field_names):
        if re.match(r'chrom', field, re.I): chrom_idx = idx
        if re.match(r'pos|start', field, re.I): pos_idx = idx
        if re.match(r'ref', field, re.I): ref_idx = idx
        if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
        # The unescaped parentheses form a group, so this matches any header
        # that starts with "gene".
        if re.match(r'gene (gene)|gene', field, re.I): gene_idx = idx

    # Column index 0 is a valid position, so test against None explicitly
    # (a plain truthiness test skipped files whose first column is the chromosome).
    coord_idxs = (chrom_idx, pos_idx, ref_idx, alt_idx)
    has_coordinates = all(i is not None for i in coord_idxs)
    max_coord_idx = max(coord_idxs) if has_coordinates else -1
    allele_pattern = re.compile(r'^[ACGTN]+$', re.I)

    candidate_vars = []
    input_gene_list = []
    CANDIDATE_GENES = []
    for line in lines[1:]:
        if not line:
            continue
        line = line.rstrip()
        # Literal split, consistent with the header; the delimiter is not a regex.
        parts = line.split(delimiter)
        input_gene_list.append(parts)
        gene = parts[gene_idx]
        CANDIDATE_GENES.append(gene)
        transcript, variant, variant_id = '', '', ''
        for part in parts:
            if re.search(r'_.*:c\.', part):
                transcript, variant = part.split(':')
            if re.search(r'_.*:g\.', part):
                # NOTE(review): this takes the text after the last '.' of the
                # accession as the chromosome and drops the ':' -- confirm the
                # input format this branch is meant for.
                variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
            if re.search(r'chr.*:g\.', part, re.I):
                variant_id = part
        if not variant_id and has_coordinates and len(parts) > max_coord_idx:
            chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
            # Placeholders such as '.' or identical alleles do not describe a
            # variant, so no id is built for them.
            if ref != alt and allele_pattern.match(ref) and allele_pattern.match(alt):
                variant_id = format_hgvs(chrome, pos, ref, alt)
        candidate_vars.append((gene, variant, transcript, variant_id))

    # Keep only the rows with the most common number of fields; with no data
    # rows there is nothing to count, so produce an empty table.
    if input_gene_list:
        correct_field_num = Counter(len(row) for row in input_gene_list).most_common(1)[0][0]
        correct_input_gene_list = [row for row in input_gene_list if len(row) == correct_field_num]
    else:
        correct_input_gene_list = []
    df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)
    return candidate_vars, CANDIDATE_GENES, df_genes, field_names

def map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars):
    """Score genes against the phenotypes and keep only variants of scored genes.

    Args:
        CANDIDATE_GENES: gene symbols passed to the scorer.
        phenos: phenotype terms passed to the scorer.
        corner_cases: synonym mapping passed to the scorer.
        candidate_vars: sequence of tuples whose first element is a gene symbol.

    Returns:
        (ranking_genes, candidate_vars): the first value returned by
        ``map_phenotype_to_gene.generate_score`` unchanged, and the entries of
        ``candidate_vars`` whose gene is the first element of some
        ``ranking_genes`` entry (original order preserved).
    """
    # generate_score yields a pair; the second element is not used here.
    ranking_genes, _unused_ranking_disease = map_phenotype_to_gene.generate_score(
        phenos, CANDIDATE_GENES, corner_cases)

    scored_genes = np.unique([entry[0] for entry in ranking_genes]).tolist()
    kept_vars = [var for var in candidate_vars if var[0] in scored_genes]
    return ranking_genes, kept_vars

# Load both raw inputs; the context managers close the file handles again.
# NOTE(review): absolute user-specific paths; 'rU' is the Python 2
# universal-newline mode and is rejected by Python 3.11+.
with open("/Users/Tianqi/Desktop/xiaonantest1_g.txt",'rU') as gene_handle:
    input_gene = gene_handle.read()
with open("/Users/Tianqi/Desktop/xiaonantest1_p.txt",'rU') as pheno_handle:
    input_phenotype = pheno_handle.read()

# Read input pheno file and generate phenos and corner_cases
phenos, corner_cases = read_input_pheno_file(input_phenotype)

# Read input gene file and generate candidate_vars. candidate_vars are (gene, variant, transcript, variant_id); CANDIDATE_GENES is a list of gene symbols; df_genes is a dataframe that keeps all the data that user uploaded; field_names are header of the input gene file
candidate_vars, CANDIDATE_GENES, df_genes, field_names = read_input_gene_file(input_gene)

# Map phenotypes to genes; candidate_vars is filtered to genes associated with the phenotypes.
if phenos:
    ranking_genes, candidate_vars = map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars)
else:
    # Without phenotypes every candidate gene gets the same (gene, score, hits) entry.
    ranking_genes = []
    for gene in CANDIDATE_GENES:
        ranking_genes.append((gene, 1.0, 1))

# collect variant info
mv = myvariant.MyVariantInfo()  # not referenced again in this cell
final_res, variants = collectVariantInfo.get_variants(candidate_vars)

# pubmed
df_pubmed = pubmed.queryPubmedDB(final_res)

# ACMG
df_hpo_ranking_genes = pd.DataFrame(ranking_genes, columns=['gene', 'score', 'hits'])
df_hpo_ranking_genes = df_hpo_ranking_genes[['gene', 'score']]
# NOTE(review): this cell takes a single return value here and passes three
# arguments to generateOutput, while the later copy of this cell unpacks three
# values and passes five -- confirm which API version is current.
ACMG_result = ACMG.Get_ACMG_result(df_hpo_ranking_genes, variants, df_pubmed)

# filter variant on phenotype
if phenos:
    df_final_res = filterVariantOnPhenotype.generateOutput(variants, ACMG_result, phenos)
else:
    df_final_res = ACMG_result



In [5]:
import map_phenotype_to_gene
import collectVariantInfo
import pubmed
import ACMG
import filterVariantOnPhenotype
import csv

import pandas as pd
import numpy as np
import re
import myvariant
# from deepb.models import Main_table, Raw_input_table
from io import StringIO
from collections import Counter


# input_phenotype = 'data/sample_patient_phenotype.txt'
# input_genes = 'data/sample_genes.txt'

def format_hgvs(chrom, pos, ref, alt):
    '''Return an HGVS-style genomic id for VCF-style "chrom, pos, ref, alt" data.

    The chromosome is converted with ``str`` and normalised to a single
    leading "chr". ``pos`` is formatted as given, or converted with ``int``
    where a range end has to be computed.

    Example:

        >>> format_hgvs("1", 35366, "C", "T")
        'chr1:g.35366C>T'
        >>> format_hgvs("2", 17142, "G", "GA")
        'chr2:g.17142_17143insA'
        >>> format_hgvs("MT", 8270, "CACCCCCTCT", "C")
        'chrMT:g.8271_8279del'
        >>> format_hgvs("X", 107930849, "GGA", "C")
        'chrX:g.107930849_107930851delinsC'

    Raises ValueError when the allele lengths fit none of the handled cases
    (e.g. an empty allele).
    '''
    name = str(chrom)
    if name.lower().startswith('chr'):
        # strip the prefix; it is added back by every format string below
        name = name[3:]

    n_ref, n_alt = len(ref), len(alt)

    if n_ref == 1 and n_alt == 1:
        # substitution of one base
        return 'chr{0}:g.{1}{2}>{3}'.format(name, pos, ref, alt)

    if n_ref > 1 and n_alt == 1:
        stop = int(pos) + n_ref - 1
        if ref[0] != alt:
            return 'chr{0}:g.{1}_{2}delins{3}'.format(name, pos, stop, alt)
        # deletion: range starts right after the shared anchor base
        return 'chr{0}:g.{1}_{2}del'.format(name, int(pos) + 1, stop)

    if n_ref == 1 and n_alt > 1:
        if alt[0] != ref:
            return 'chr{0}:g.{1}delins{2}'.format(name, pos, alt)
        # insertion between pos and pos + 1 of everything after the anchor base
        head = 'chr{0}:g.{1}_{2}ins'.format(name, pos, int(pos) + 1)
        return head + alt[1:]

    if n_ref > 1 and n_alt > 1:
        # NOTE(review): range end is based on len(alt) rather than len(ref);
        # verify against the ids expected by the variant lookup.
        stop = int(pos) + n_alt - 1
        return 'chr{0}:g.{1}_{2}delins{3}'.format(name, pos, stop, alt)

    raise ValueError("Cannot convert {} into HGVS id.".format((name, pos, ref, alt)))

def read_input_pheno_file(input_phenotype):
    """Turn raw phenotype text into a list of lower-cased phenotype terms.

    Non-blank lines are split on two-or-more spaces, tabs, ',', ';', '.' and
    '|'. Each piece is stripped of leading/trailing non-word characters and
    lower-cased; empty pieces are dropped.

    Afterwards 'growth delay' is appended for every term containing both
    "development" and "delay" but not "growth", and 'developmental delay' for
    every term containing both "growth" and "delay" but not "development".

    Args:
        input_phenotype: raw text; a falsy value is treated as no input.

    Returns:
        (phenos, corner_cases): list of terms (including appended synonyms) and
        a dict mapping each appended synonym to the term that triggered it.
        For empty input this is ``([], {})`` -- the same container types as the
        normal path, where a pair of empty strings was returned before.
    """
    if not input_phenotype:
        return [], {}

    # Python 2 needs `unicode` for io.StringIO; Python 3 only has `str`.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    # newline=None: universal newlines, so \r\n and \r both end a line.
    stream = StringIO(to_text(input_phenotype), newline=None)
    phenos = []
    for raw_line in stream.readlines():
        line = raw_line.strip()
        if not line:
            continue
        tokens = re.split(r'  +|\t+|,|;|\.|\|', line)
        tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
        phenos += [token.lower() for token in tokens if token]

    corner_cases = dict()
    # NOTE(review): `phenos` grows while it is being iterated, so a
    # 'growth delay' appended by the first loop is examined by the second loop
    # and triggers 'developmental delay'. Kept as-is; confirm this is wanted.
    for pheno in phenos:
        if re.search('development', pheno) and re.search('delay', pheno) and not re.search('growth', pheno):
            phenos.append('growth delay')
            corner_cases['growth delay'] = pheno.strip()
    for pheno in phenos:
        if re.search('growth', pheno) and re.search('delay', pheno) and not re.search('development', pheno):
            phenos.append('developmental delay')
            corner_cases['developmental delay'] = pheno.strip()
    return phenos, corner_cases

def read_input_gene_file(input_gene):
    """Parse the delimited gene/variant table uploaded by the user.

    The first line is the header; its delimiter is guessed with
    ``csv.Sniffer``. Columns are located by case-insensitive prefix match on
    the header names (when several columns match, the last one wins).

    For every non-empty data line the fields are scanned for a combined
    ``<transcript>:c.<change>`` value, a separate ``c.`` value, a field
    containing ``NM_``, or a genomic ``g.`` id; if no genomic id is found and
    chromosome/position/ref/alt columns exist, the id is built with
    ``format_hgvs``.

    Args:
        input_gene: whole file content as one string, lines separated by '\\n'.

    Returns:
        (candidate_vars, CANDIDATE_GENES, df_genes, field_names) where
        ``candidate_vars`` is a list of ``(gene, variant, transcript,
        variant_id)`` tuples, ``CANDIDATE_GENES`` the gene column values,
        ``df_genes`` a DataFrame of the rows whose field count equals the most
        common field count, and ``field_names`` the split header.

    Raises:
        csv.Error: if the delimiter cannot be determined from the header.
        IndexError: if a data line has fewer fields than the gene column index.
    """
    # Indentation is spaces only: the previous mix of tabs and spaces is a
    # TabError on Python 3.
    lines = input_gene.split('\n')
    header = lines[0]
    delimiter = csv.Sniffer().sniff(header).delimiter
    field_names = header.split(delimiter)
    chrom_idx, pos_idx, ref_idx, alt_idx, gene_idx = None, None, None, None, 0

    for idx, field in enumerate(field_names):
        if re.match(r'chrom', field, re.I): chrom_idx = idx
        if re.match(r'pos|start', field, re.I): pos_idx = idx
        if re.match(r'ref', field, re.I): ref_idx = idx
        if re.match(r'alt|allele 1', field, re.I): alt_idx = idx
        # The unescaped parentheses form a group, so this matches any header
        # that starts with "gene".
        if re.match(r'gene (gene)|gene', field, re.I): gene_idx = idx

    # Index 0 is a valid column, so compare with None instead of relying on
    # truthiness (which skipped files whose first column is the chromosome).
    coord_idxs = (chrom_idx, pos_idx, ref_idx, alt_idx)
    has_coordinates = all(i is not None for i in coord_idxs)
    max_coord_idx = max(coord_idxs) if has_coordinates else -1
    allele_pattern = re.compile(r'^[ACGTN]+$', re.I)

    candidate_vars = []
    input_gene_list = []
    CANDIDATE_GENES = []
    for line in lines[1:]:
        if not line:
            continue
        line = line.rstrip()
        # Literal split, consistent with the header; the delimiter is not a regex.
        parts = line.split(delimiter)
        input_gene_list.append(parts)
        gene = parts[gene_idx]
        CANDIDATE_GENES.append(gene)
        transcript, variant, variant_id = '', '', ''
        for part in parts:
            if re.search(r'_.*:c\.', part):
                transcript, variant = part.split(':')
            else:
                if re.search(r'c\.', part):
                    variant = part
                if re.search(r'NM_', part, re.I):
                    transcript = part.split(':')[0]
            if re.search(r'_.*:g\.', part):
                # NOTE(review): this takes the text after the last '.' of the
                # accession as the chromosome and drops the ':' -- confirm the
                # input format this branch is meant for.
                variant_id = 'chr' + part.split(':')[0].split('.')[-1] + part.split(':')[-1]
            if re.search(r'chr.*:g\.', part, re.I):
                variant_id = part
        if not variant_id and has_coordinates and len(parts) > max_coord_idx:
            chrome, pos, ref, alt = parts[chrom_idx], parts[pos_idx], parts[ref_idx], parts[alt_idx]
            # Placeholders such as '.' or identical alleles do not describe a
            # variant, so no id is built for them.
            if ref != alt and allele_pattern.match(ref) and allele_pattern.match(alt):
                variant_id = format_hgvs(chrome, pos, ref, alt)
        candidate_vars.append((gene, variant, transcript, variant_id))

    # Keep only the rows with the most common number of fields; with no data
    # rows there is nothing to count, so produce an empty table.
    if input_gene_list:
        correct_field_num = Counter(len(row) for row in input_gene_list).most_common(1)[0][0]
        correct_input_gene_list = [row for row in input_gene_list if len(row) == correct_field_num]
    else:
        correct_input_gene_list = []
    df_genes = pd.DataFrame(correct_input_gene_list, columns = field_names)
    return candidate_vars, CANDIDATE_GENES, df_genes, field_names

def map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars):
    """Rank genes by phenotype and drop variants whose gene was not ranked.

    Calls ``map_phenotype_to_gene.generate_score(phenos, CANDIDATE_GENES,
    corner_cases)`` and uses the first element of each returned gene entry as
    the gene symbol.

    Returns:
        (ranking_genes, candidate_vars): the gene ranking exactly as returned
        by the scorer, and the subset of ``candidate_vars`` (order kept) whose
        first element is one of the ranked gene symbols.
    """
    scores = map_phenotype_to_gene.generate_score(phenos, CANDIDATE_GENES, corner_cases)
    # The scorer yields two values; only the gene ranking is used here.
    ranking_genes, _ranking_disease = scores

    ranked_symbols = np.unique([row[0] for row in ranking_genes]).tolist()

    retained = []
    for candidate in candidate_vars:
        if candidate[0] not in ranked_symbols:
            continue
        retained.append(candidate)
    return ranking_genes, retained

# 	status_step = "generating candidate variants ..." 
# 	raw_input = Raw_input_table.objects.get(id=raw_input_id)
# 	input_gene = raw_input.raw_input_gene
# 	input_phenotype = raw_input.raw_input_phenotype
# Load both raw inputs; the context managers close the file handles again.
# NOTE(review): absolute user-specific paths; 'rU' is the Python 2
# universal-newline mode and is rejected by Python 3.11+.
with open("/Users/Tianqi/Desktop/xiaonantest1_g.txt",'rU') as gene_handle:
    input_gene = gene_handle.read()
with open("/Users/Tianqi/Desktop/xiaonantest1_p.txt",'rU') as pheno_handle:
    input_phenotype = pheno_handle.read()

# Read input pheno file and generate phenos and corner_cases
phenos, corner_cases = read_input_pheno_file(input_phenotype)


# Read input gene file and generate candidate_vars. candidate_vars are (gene, variant, transcript, variant_id); CANDIDATE_GENES is a list of gene symbols; df_genes is a dataframe that keeps all the data that user uploaded; field_names are header of the input gene file
candidate_vars, CANDIDATE_GENES, df_genes, field_names = read_input_gene_file(input_gene)

# Map phenotypes to genes; candidate_vars is filtered to genes associated with the phenotypes.

if phenos:
#     raw_input.status = "Maping phenotypes to genes"
#     raw_input.save()
    ranking_genes, candidate_vars = map_phenotype2gene(CANDIDATE_GENES, phenos, corner_cases, candidate_vars)
else:
    # Without phenotypes every candidate gene gets the same (gene, score, hits) entry.
    ranking_genes = []
    for gene in CANDIDATE_GENES:
        ranking_genes.append((gene, 1.0, 1))

# collect variant info
# raw_input.status = "Annotating variants using genomic databases"
# raw_input.save()
mv = myvariant.MyVariantInfo()  # not referenced again in this cell
final_res, variants = collectVariantInfo.get_variants(candidate_vars)

# pubmed
# raw_input.status = "Searching biomedical literatures"
# raw_input.save()
df_pubmed = pubmed.queryPubmedDB(final_res)

# ACMG
# raw_input.status = "Checking ACMG standard"
# raw_input.save()
df_hpo_ranking_genes = pd.DataFrame(ranking_genes, columns=['gene', 'score', 'hits'])
df_hpo_ranking_genes = df_hpo_ranking_genes[['gene', 'score']]
ACMG_result, variant_ACMG_interpretation, variant_ACMG_interpret_chinese = ACMG.Get_ACMG_result(df_hpo_ranking_genes, variants, df_pubmed)

# filter variant on phenotype

if phenos:
#     raw_input.status = "Filtering variants based on phenotypes"
#     raw_input.save()
    df_final_res, variant_ACMG_interpretation, variant_ACMG_interpret_chinese = filterVariantOnPhenotype.generateOutput(variants, ACMG_result, phenos, variant_ACMG_interpretation, variant_ACMG_interpret_chinese)
else:
    # No phenotypes: the ACMG table is the final result unchanged.
    df_final_res = ACMG_result

In [10]:
# Preview the first rows of the final result table.
df_final_res.head()

Unnamed: 0,gene,variant,protein,id,final_score,pathogenicity,hit_criteria,pathogenicity_score,hpo_hit_score,pheno_match_score
76,BBS10,c.145C>T,p.Arg49Trp,chr12:g.76741994G>A,9.51101,Likely pathogenic,PS1|PM1|PP3|PP5,3.875,5.666667,1.154249
103,HBB,c.208G>A,p.Gly70Ser,chr11:g.5247914C>T,8.713391,Uncertain significance,PM5|PM1|PP3|PP5,2.625,19.213095,1.074401
66,DLX3,c.335A>G,p.Lys112Arg,chr17:g.48070945T>C,6.15,Uncertain significance,PS1|PM1|PM2|PP3|BP1,3.9612,2.0,1.0
53,PROP1,c.652A>C,p.Ser218Arg,chr5:g.177419739T>G,5.23,Uncertain significance,PM1|PM2|PP3,2.25,7.5,1.0
107,SOX3,c.818C>T,p.Ser273Leu,chrX:g.139586408G>A,5.19,Uncertain significance,PM1|PM2|PP3,2.25,7.333333,1.0


In [11]:
# Preview the first rows of the uploaded gene/variant table as parsed by read_input_gene_file.
df_genes.head()

Unnamed: 0,Chromosome,Start,Stop,Reference,Allele 1,Allele 2,Allele In Scope,Transcript,Gene Profile Report,Gene (gene),cDNA (cNomen),Protein (pNomen),HGVS cDNA-level nomenclature (fullCNomen)
0,1,1635536,1635536,.,.,A,A,NM_024011.2,,CDK11A,c.1735dupT,p.Y579Lfs*66,NM_024011.2:c.1735dupT
1,1,23713843,23713843,T,C,T,C,NM_003196.2,,TCEA3,c.889A>G,p.M297V,NM_003196.2:c.889A>G
2,1,24882663,24882663,A,A,G,G,NM_001010980.4,,NCMAP,c.-8+3A>G,,NM_001010980.4:c.-8+3A>G
3,1,26663842,26663842,G,G,A,A,NM_001039775.3,,AIM1L,c.3673C>T,p.R1225W,NM_001039775.3:c.3673C>T
4,1,39896387,39896387,A,C,A,C,NM_012090.5,,MACF1,c.10958A>C,p.E3653A,NM_012090.5:c.10958A>C


In [8]:
# Show the parsed phenotype terms, including any synonyms appended by read_input_pheno_file.
phenos

[u'bilateral post-axial polydactyly of hands and feet',
 u'macrocephaly',
 u'tall stature',
 u'post-natal',
 u'central hypotonia',
 u'global developmental delay',
 u'obstructive sleep apnea',
 u'delayed cns central nervous myelination',
 u'white matter',
 u'corpus callosum',
 u'mildly ectopic neurohypophys posterior pituitary upslanted palpebral fissures',
 u'epicanthus',
 u'arched eyebrows',
 u'craniofacial asymmetry',
 u'bronchial stenosis',
 u'macrosomia',
 u'recurrent infection',
 u'gastroesophageal reflux',
 u'supratentorial perivascular space',
 u'mucopolysaccharidosis plasma urine creatine',
 u'purine',
 u'pyrimidine metabolism',
 u'gross motor',
 u'language',
 u'cognitive',
 u'ventriculomegaly',
 u'metatarsal bones',
 u'proximal phalanges',
 u'obesity',
 u'wheeze',
 u'brachycephaly frontal prominence',
 u'bitemporal narrowing',
 u'clinodactyly',
 u'brachydactyly',
 u'bardet-biedl',
 'growth delay',
 'developmental delay']

In [9]:
# Show the header fields of the uploaded gene file.
field_names

['Chromosome',
 'Start',
 'Stop',
 'Reference',
 'Allele 1',
 'Allele 2',
 'Allele In Scope',
 'Transcript',
 'Gene Profile Report',
 'Gene (gene)',
 'cDNA (cNomen)',
 'Protein (pNomen)',
 'HGVS cDNA-level nomenclature (fullCNomen)']

In [17]:
# Preview the per-criterion ACMG interpretation table (English).
variant_ACMG_interpretation.head()

Unnamed: 0,gene,variant,criteria,interpretation
0,BBS10,c.145C>T,variant_annotations,"Effect: missense_variant.<br/>Protein domain: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>exon: 1.<br/>ExAC MAF: 5.052e-05.<br/>DANN pathogenicity score: 0.999117068756.<br/>FATHMM pathogenicity score: 0.79186.<br/>MetaSVM pathogenicity score: 0.84564.<br/>GERP++ conservation score: 4.27.<br/>Clinvar variation ids: 225010.<br/>Pathogenicity reported by Clinvar: Pathogenic|Pathogenic.<br/>Clinvar review status: criteria provided, single submitter.<br/>Pubmed references from Clinvar: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,Variant effect NOT in null variant type. Allele in a gene where loss of function (LOF) is a known mechanism of disease. The variant does NOT have damaging splicing effect. PVS1 is NOT met.
2,BBS10,c.145C>T,PS1 and PM5,Variant effect is missense. Same amino acid change as a previously established pathogenic variant regardless of nucleotide change (Clinvar references: 225010). Not find missense change at an amino acid residue where a different missense change determined to be pathogenic has been seen before. The variant does NOT have damaging splicing effect. PS1 is met. PM5 is NOT met.
3,BBS10,c.145C>T,PS3 and BS3,Not find well-established functional studies on this variant. PS3 is NOT met. BS3 is NOT met.
4,BBS10,c.145C>T,PS4,Relative risk is smaller than 5. The prevalence of the variant in affected individuals is NOT significantly increased compared with the prevalence in controls. PS4 is NOT met.


In [18]:
# Preview the per-criterion ACMG interpretation table (Chinese).
variant_ACMG_interpret_chinese.head()

Unnamed: 0,基因,变异,标准,解读
0,BBS10,c.145C>T,变异注释,"突变类型: missense_variant.<br/>蛋白功能区: GroEL-like equatorial domain.<br/>HGVS ID: chr12:g.76741994G>A.<br/>RefSeq ID: rs768933093.<br/>外显子: 1.<br/>ExAC 最小等位基因频率(MAF): 5.052e-05.<br/>DANN致病性分数: 0.999117068756.<br/>FATHMM致病性分数: 0.79186.<br/>MetaSVM致病性分数: 0.84564.<br/>GERP++序列保守性预测分数: 4.27.<br/>Clinvar数据库ID: 225010.<br/>Clinvar数据库记录的变异致病性: 致病|致病.<br/>Clinvar数据库记录审核状态: 提供标准，单个提交者.<br/>Clinvar数据库记录的Pubmed相关生物医学文献: ['25356970', '20120035', '16582908', '21044901', '20876674', '21642631', '20498079', '21517826', '20177705', '24746959', '25982971']."
1,BBS10,c.145C>T,PVS1,基因变异类型不是无效变异(null variant). 变异位点所在基因的功能丢失(loss of function)是已知的致病机制. 此变异不具有害的剪接效应(splicing effect). 不符合PVS1标准.
2,BBS10,c.145C>T,PS1和PM5,变异为错义突变. 之前报道导致相同氨基酸改变的基因变异被证明是致病的 (Clinvar数据库参考: 225010). 未发现导致相同氨基酸残基错义突变（不同氨基酸改变）导致的致病基因变异. 此变异不具有害的剪接效应(splicing effect). 符合PS1标准. 不符合PM5标准.
3,BBS10,c.145C>T,PS3和BS3,未发现针对此基因变异的完善的体内或体外功能性研究. 不符合PS3标准. 不符合BS3标准.
4,BBS10,c.145C>T,PS4,具有此基因变异的人群患病率并未显著升高，相对风险(Relative Risk)小于5.0.. 不符合PS4标准.


In [None]:
# Bare tuple expression: evaluating this cell displays all six pipeline outputs at once.
df_final_res, df_genes, phenos, field_names, variant_ACMG_interpretation, variant_ACMG_interpret_chinese

In [14]:
# Serialise the pipeline outputs into the strings stored with the task record
# (see the commented-out Main_table(...) call below).
# `input_gene` was misspelled `nput_gene`, so the JSON never reached the name
# that the Main_table call reads.
input_gene = df_genes.to_json(orient='records')
input_phenotype = ', '.join(phenos)
# NOTE(review): this serialises ACMG_result, not df_final_res (the
# phenotype-filtered table) -- confirm which one should be stored.
result_table = ACMG_result.to_json(orient='records')
interpretation = variant_ACMG_interpretation.to_json(orient='records')
interpretation_chinese = variant_ACMG_interpret_chinese.to_json(orient='records')
# logger.info("Finish processing data, start writing data to DB in background main task")


# sample = Main_table(
#     task_id=raw_input_id,
#     input_gene=input_gene,
#     input_phenotype=input_phenotype,
#     result=result_table,
#     interpretation=interpretation,
#     interpretation_chinese=interpretation_chinese,
#     pub_date=timezone.now(),
#     user_name=raw_input.user_name,
#     task_name=raw_input.task_name,
# )
# sample.save()
# logger.info("Finish writing data to DB in background main task")
# raw_input.status = "succeed"
# raw_input.save()

In [16]:
# Show the first 1000 characters of the English interpretation JSON string.
interpretation[:1000]

'[{"gene":"BBS10","variant":"c.145C>T","criteria":"variant_annotations","interpretation":"Effect: missense_variant.<br\\/>Protein domain: GroEL-like equatorial domain.<br\\/>HGVS ID: chr12:g.76741994G>A.<br\\/>RefSeq ID: rs768933093.<br\\/>exon: 1.<br\\/>ExAC MAF: 5.052e-05.<br\\/>DANN pathogenicity score: 0.999117068756.<br\\/>FATHMM pathogenicity score: 0.79186.<br\\/>MetaSVM pathogenicity score: 0.84564.<br\\/>GERP++ conservation score: 4.27.<br\\/>Clinvar variation ids: 225010.<br\\/>Pathogenicity reported by Clinvar: Pathogenic|Pathogenic.<br\\/>Clinvar review status: criteria provided, single submitter.<br\\/>Pubmed references from Clinvar: [\'25356970\', \'20120035\', \'16582908\', \'21044901\', \'20876674\', \'21642631\', \'20498079\', \'21517826\', \'20177705\', \'24746959\', \'25982971\']."},{"gene":"BBS10","variant":"c.145C>T","criteria":"PVS1","interpretation":"Variant effect NOT in null variant type. Allele in a gene where loss of function (LOF) is a known mechanism of dis

In [21]:
# Show the first 1000 characters of the Chinese interpretation JSON string.
interpretation_chinese[:1000]

'[{"\\u57fa\\u56e0":"BBS10","\\u53d8\\u5f02":"c.145C>T","\\u6807\\u51c6":"\\u53d8\\u5f02\\u6ce8\\u91ca","\\u89e3\\u8bfb":"\\u7a81\\u53d8\\u7c7b\\u578b: missense_variant.<br\\/>\\u86cb\\u767d\\u529f\\u80fd\\u533a: GroEL-like equatorial domain.<br\\/>HGVS ID: chr12:g.76741994G>A.<br\\/>RefSeq ID: rs768933093.<br\\/>\\u5916\\u663e\\u5b50: 1.<br\\/>ExAC \\u6700\\u5c0f\\u7b49\\u4f4d\\u57fa\\u56e0\\u9891\\u7387(MAF): 5.052e-05.<br\\/>DANN\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.999117068756.<br\\/>FATHMM\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.79186.<br\\/>MetaSVM\\u81f4\\u75c5\\u6027\\u5206\\u6570: 0.84564.<br\\/>GERP++\\u5e8f\\u5217\\u4fdd\\u5b88\\u6027\\u9884\\u6d4b\\u5206\\u6570: 4.27.<br\\/>Clinvar\\u6570\\u636e\\u5e93ID: 225010.<br\\/>Clinvar\\u6570\\u636e\\u5e93\\u8bb0\\u5f55\\u7684\\u53d8\\u5f02\\u81f4\\u75c5\\u6027: \\u81f4\\u75c5|\\u81f4\\u75c5.<br\\/>Clinvar\\u6570\\u636e\\u5e93\\u8bb0\\u5f55\\u5ba1\\u6838\\u72b6\\u6001: \\u63d0\\u4f9b\\u6807\\u51c6\\uff0c\\u5355\\u4e2a\\u63d0\\u4ea4\

In [None]:
# Serialise the Chinese interpretation table to a JSON string of records.
# NOTE(review): the same assignment already appears in the earlier serialisation cell.
interpretation_chinese = variant_ACMG_interpret_chinese.to_json(orient='records')