In [2]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union,Dict,Set
from hail.plot import show, output_notebook
from bokeh.palettes import d3  # pylint: disable=no-name-in-module
from bokeh.models import Plot, Row, Span, NumeralTickFormatter, LabelSet
from gnomad.utils.plotting import *
from typing import Set, Tuple

tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", "8YY584J59H7Q6AVKHSU8")
hadoop_config.set("fs.s3a.secret.key", "P8vePa7JUvxKXX2me9ti1cGujgYWMoimAwx4mMlM")
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.5
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.41-b8144dba46e6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20201123-1914-0.2.41-b8144dba46e6.log


In [27]:
af_gnomad="s3a://DDD-ELGH-UKBB-exomes/gnomad-AF/gnomad_3.0_sites_AF.tsv"

In [28]:
ht_af=hl.import_table(af_gnomad, types={'f0':'str','f1':'int32', 'f2':'str','f3':'str','f4':'str'},no_header=True)

2020-11-23 12:04:31 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)
  Loading column 'f2' as type 'str' (user-specified)
  Loading column 'f3' as type 'str' (user-specified)
  Loading column 'f4' as type 'str' (user-specified)



In [30]:
ht_af=ht_af.annotate(chrom=ht_af.f0)
ht_af=ht_af.annotate(position=ht_af.f1)
ht_af=ht_af.annotate(ref=ht_af.f2)
ht_af=ht_af.annotate(alt=ht_af.f3)
ht_af=ht_af.annotate(smaf=ht_af.f4)
ht_af=ht_af.select(ht_af.chrom, ht_af.position, ht_af.ref, ht_af.alt, ht_af.smaf)

In [31]:
ht_af = ht_af.key_by(
    locus=hl.locus(ht_af.chrom, ht_af.position, reference_genome='GRCh38'), 
    alleles=[ht_af.ref, ht_af.alt])

In [26]:
ht_af.count()

707950943

In [33]:
ht_af=ht_af.filter(ht_af.smaf !='.')

In [34]:
ht_af.count()

707818395

In [35]:
ht = ht_af.annotate(maf = hl.float64(ht_af.smaf))

In [36]:
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'chrom': str 
    'position': int32 
    'ref': str 
    'alt': str 
    'smaf': str 
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'maf': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


In [38]:
ht=ht.drop(ht.smaf)

In [39]:
ht.write(f'{tmp_dir}/gnomad_v3-0_AF.ht', overwrite=True)

2020-11-23 12:06:52 Hail: INFO: Coerced sorted dataset
2020-11-23 12:07:50 Hail: INFO: wrote table with 707818395 rows in 655 partitions to hdfs://spark-master:9820//gnomad_v3-0_AF.ht


In [2]:
mt=hl.read_matrix_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.mt')

In [3]:
fam = f"{temp_dir}/ddd-elgh-ukbb/variant_qc/DDD_trios.fam"
pedigree = hl.Pedigree.read(fam)

In [5]:
priors = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/gnomad_v3-0_AF.ht')

In [10]:
mt=mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)

In [12]:
de_novo_table = hl.de_novo(
        mt, pedigree, mt.gnomad_maf)

2020-11-23 15:50:45 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'


In [13]:
de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
de_novo_table.write(
        f'{tmp_dir}/Sanger_cohort_denovo_table.ht', overwrite=True)

KeyboardInterrupt: 

In [2]:
gnomad_sites=hl.read_table('s3a://intervalwgs-qc/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht')

In [3]:
gnomad_sites.describe()



----------------------------------------
Global fields:
    'rf': struct {
        variants_by_type: dict<str, int32>, 
        feature_medians: dict<str, struct {
            variant_type: str, 
            n_alt_alleles: int32, 
            qd: float64, 
            pab_max: float64, 
            info_MQRankSum: float64, 
            info_SOR: float64, 
            info_InbreedingCoeff: float64, 
            info_ReadPosRankSum: float64, 
            info_FS: float64, 
            info_QD: float64, 
            info_MQ: float64, 
            info_DP: int32
        }>, 
        test_intervals: array<interval<locus<GRCh37>>>, 
        test_results: array<struct {
            rf_prediction: str, 
            rf_label: str, 
            n: int32
        }>, 
        features_importance: dict<str, float64>, 
        features: array<str>, 
        vqsr_training: bool, 
        no_transmitted_singletons: bool, 
        adj: bool, 
        rf_hash: str, 
        rf_snv_cutoff: struct {
     

In [5]:
gnomad_sites.freq.AF.show()

locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,array<float64>
chr1:10067,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCT...","[3.80e-05,9.55e-05,0.00e+00,3.75e-04,..."
chr1:10108,"[""CAACCCT"",""C""]","[1.01e-03,6.49e-05,0.00e+00,2.27e-02,..."
chr1:10109,"[""AACCCT"",""A""]","[6.42e-02,1.01e-03,5.13e-02,5.88e-02,..."
chr1:10114,"[""T"",""C""]","[0.00e+00,2.56e-04,0.00e+00,0.00e+00,..."
chr1:10114,"[""TAACCCTAACCCTAACCCTAACCCTAACCCTAACC...","[1.14e-04,3.21e-05,0.00e+00,0.00e+00,..."
chr1:10119,"[""CT"",""C""]","[0.00e+00,1.28e-04,0.00e+00,0.00e+00,..."
chr1:10120,"[""T"",""C""]","[0.00e+00,2.89e-04,0.00e+00,0.00e+00,..."
chr1:10128,"[""ACCCTAACCCTAACCCTAAC"",""A""]","[3.95e-05,9.66e-05,8.18e-05,0.00e+00,..."
chr1:10131,"[""CT"",""C""]","[3.66e-05,2.25e-04,0.00e+00,3.38e-04,..."
chr1:10132,"[""TAACCC"",""T""]","[0.00e+00,6.44e-05,0.00e+00,0.00e+00,..."


In [None]:
 mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.mt')

In [2]:
run_hash="91b132aa"
ht_RF = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/rf_result_sanger_cohorts_new.ht')

In [3]:
ht=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohort_denovo_table.ht')

In [4]:
ht.show()

locus,alleles,de_novo_data
locus<GRCh38>,array<str>,"array<struct{id: str, prior: float64, proband: struct{s: str}, father: struct{s: str}, mother: struct{s: str}, proband_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, father_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, mother_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, is_female: bool, p_de_novo: float64, confidence: str}>"
chr1:16959,"[""G"",""C""]","[(""EGAN00001315776"",3.36e-05,(""EGAN00..."
chr1:17375,"[""A"",""G""]","[(""EGAN00001343225"",1.29e-02,(""EGAN00..."
chr1:17407,"[""G"",""A""]","[(""EGAN00001343225"",1.79e-02,(""EGAN00..."
chr1:17452,"[""C"",""T""]","[(""EGAN00001313169"",3.99e-03,(""EGAN00..."
chr1:17487,"[""C"",""A""]","[(""EGAN00001315786"",3.33e-06,(""EGAN00..."
chr1:17512,"[""C"",""G""]","[(""EGAN00001315333"",3.84e-04,(""EGAN00..."
chr1:17519,"[""G"",""T""]","[(""EGAN00001324367"",2.08e-02,(""EGAN00..."
chr1:63628,"[""G"",""A""]","[(""EGAN00001342824"",3.33e-06,(""EGAN00..."
chr1:133383,"[""A"",""ACT""]","[(""EGAN00001342875"",3.33e-06,(""EGAN00..."
chr1:135166,"[""T"",""TGAGGCC""]","[(""EGAN00001319138"",1.19e-04,(""EGAN00..."


In [5]:

#annotate with de novo table
ht=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohort_denovo_table.ht')
ht_RF=ht_RF.annotate(de_novo_data=ht[ht_RF.key].de_novo_data)

In [6]:
#annotate with family stats
ht=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.ht')
ht_RF=ht_RF.annotate(family_stats=ht[ht_RF.key].family_stats)

In [7]:
ht_RF.write(f"{tmp_dir}/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats.ht", overwrite=True)

2020-11-23 18:37:18 Hail: INFO: wrote table with 13669739 rows in 500 partitions to hdfs://spark-master:9820//variant_qc/models/91b132aa/91b132aa_rf_result_sanger_cohorts_DENOVO_family_stats.ht


In [2]:
run_hash="91b132aa"

In [8]:
run_hash="91b132aa"
ht=hl.read_table(f'{temp_dir}/ddd-elgh-ukbb//variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats.ht')

In [9]:
ht.describe()

----------------------------------------
Global fields:
    'feature_medians': dict<tuple (
        str
    ), struct {
        a_index: int32, 
        n_alt_alleles: int32, 
        QD: float64, 
        MQRankSum: float64, 
        SOR: float64, 
        ReadPosRankSum: float64, 
        FS: float64, 
        DP: int32
    }> 
    'variants_by_strata': dict<tuple (
        str
    ), int64> 
    'features_importance': dict<str, float64> 
    'features': array<str> 
    'test_results': array<struct {
        rf_prediction: str, 
        rf_label: str, 
        n: int32
    }> 
    'rf_hash': str 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'QD': float64 
    'MQRankSum': float64 
    'SOR': float64 
    'ReadPosRankSum': 

In [10]:
ht.count()

13669739

In [13]:
ht_validated=ht.filter(ht.de_novo_data[0].p_de_novo >0.99, keep=True)

In [14]:
ht_validated.count()

37533

In [18]:
ht_synonymous=hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS_annotation.ht')

In [19]:
ht_synonymous.describe()

----------------------------------------
Global fields:
    'feature_medians': dict<tuple (
        str
    ), struct {
        a_index: int32, 
        n_alt_alleles: int32, 
        QD: float64, 
        MQRankSum: float64, 
        SOR: float64, 
        ReadPosRankSum: float64, 
        FS: float64, 
        DP: int32
    }> 
    'variants_by_strata': dict<tuple (
        str
    ), int64> 
    'features_importance': dict<str, float64> 
    'features': array<str> 
    'test_results': array<struct {
        rf_prediction: str, 
        rf_label: str, 
        n: int32
    }> 
    'rf_hash': str 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'QD': float64 
    'MQRankSum': float64 
    'SOR': float64 
    'ReadPosRankSum': 

In [21]:
ht=ht.annotate(consequence=ht_synonymous[ht.key].consequence)

In [22]:
ht.write(f'{tmp_dir}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS.ht',overwrite=True)

2020-11-23 18:46:18 Hail: INFO: wrote table with 13669739 rows in 500 partitions to hdfs://spark-master:9820//91b132aa_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS.ht


In [25]:
n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence=="synonymous"))

In [28]:
hl.eval()

<Int64Expression of type int64>


In [32]:
n_trans_singletons=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence=="synonymous") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t) )

In [34]:
n_untrans_singletons=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence=="synonymous") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u))

In [26]:
hl.eval(n_high_quality_de_novos_synonymous)

2020-11-23 18:54:36 Hail: ERROR: 'eval_timed' does not support aggregation


ExpressionException: 'eval_timed' does not support aggregation

In [3]:
ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS.ht')

NameError: name 'run_hash' is not defined

In [5]:
ht.family_stats.unrelated_qc_callstats.AC[0][1].show()

locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,int32
chr1:12938,"[""GCAAA"",""G""]",2
chr1:13024,"[""G"",""A""]",0
chr1:13087,"[""A"",""G""]",3
chr1:13116,"[""T"",""C""]",2
chr1:13130,"[""C"",""T""]",2
chr1:13151,"[""G"",""C""]",2
chr1:13164,"[""G"",""A""]",2
chr1:13176,"[""G"",""T""]",1
chr1:13198,"[""C"",""A""]",1
chr1:13216,"[""C"",""G""]",1


In [6]:
ht.family_stats.mendel.show()

locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,array<struct{errors: int64}>
chr1:12938,"[""GCAAA"",""G""]",[(0)]
chr1:13024,"[""G"",""A""]",[(1)]
chr1:13087,"[""A"",""G""]",[(1)]
chr1:13116,"[""T"",""C""]",[(0)]
chr1:13130,"[""C"",""T""]",[(0)]
chr1:13151,"[""G"",""C""]",[(0)]
chr1:13164,"[""G"",""A""]",[(0)]
chr1:13176,"[""G"",""T""]",[(0)]
chr1:13198,"[""C"",""A""]",[(0)]
chr1:13216,"[""C"",""G""]",[(1)]


In [8]:
ht.family_stats.mendel[0].errors.show()

locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,int64
chr1:12938,"[""GCAAA"",""G""]",0
chr1:13024,"[""G"",""A""]",1
chr1:13087,"[""A"",""G""]",1
chr1:13116,"[""T"",""C""]",0
chr1:13130,"[""C"",""T""]",0
chr1:13151,"[""G"",""C""]",0
chr1:13164,"[""G"",""A""]",0
chr1:13176,"[""G"",""T""]",0
chr1:13198,"[""C"",""A""]",0
chr1:13216,"[""C"",""G""]",1


In [10]:
ht.family_stats.tdt[0].show()

locus,alleles,.t,.u,.chi_sq,.p_value
locus<GRCh38>,array<str>,int64,int64,float64,float64
chr1:12938,"[""GCAAA"",""G""]",,,,
chr1:13024,"[""G"",""A""]",,,,
chr1:13087,"[""A"",""G""]",,,,
chr1:13116,"[""T"",""C""]",,,,
chr1:13130,"[""C"",""T""]",,,,
chr1:13151,"[""G"",""C""]",,,,
chr1:13164,"[""G"",""A""]",,,,
chr1:13176,"[""G"",""T""]",,,,
chr1:13198,"[""C"",""A""]",,,,
chr1:13216,"[""C"",""G""]",,,,


In [3]:
ht=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_family_stats.ht')

In [4]:
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'family_stats': array<struct {
        mendel: struct {
            errors: int64
        }, 
        tdt: struct {
            t: int64, 
            u: int64, 
            chi_sq: float64, 
            p_value: float64
        }, 
        unrelated_qc_callstats: struct {
            AC: array<int32>, 
            AF: array<float64>, 
            AN: int32, 
            homozygote_count: array<int32>
        }, 
        meta: dict<str, str>
    }> 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


In [5]:
ht=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohort_denovo_table.ht')

In [6]:
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'de_novo_data': array<struct {
        id: str, 
        prior: float64, 
        proband: struct {
            s: str
        }, 
        father: struct {
            s: str
        }, 
        mother: struct {
            s: str
        }, 
        proband_entry: struct {
            AD: array<int32>, 
            DP: int32, 
            GQ: int32, 
            GT: call, 
            MIN_DP: int32, 
            PGT: call, 
            PID: str, 
            PL: array<int32>, 
            PS: int32, 
            RGQ: int32, 
            SB: array<int32>
        }, 
        father_entry: struct {
            AD: array<int32>, 
            DP: int32, 
            GQ: int32, 
            GT: call, 
            MIN_DP: int32, 
            PGT: call, 
            PID: str, 
            PL: array<int32>, 
           

In [28]:
ht.show()

locus,alleles,de_novo_data
locus<GRCh38>,array<str>,"array<struct{id: str, prior: float64, proband: struct{s: str}, father: struct{s: str}, mother: struct{s: str}, proband_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, father_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, mother_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, is_female: bool, p_de_novo: float64, confidence: str}>"
chr1:16959,"[""G"",""C""]","[(""EGAN00001315776"",3.36e-05,(""EGAN00..."
chr1:17375,"[""A"",""G""]","[(""EGAN00001343225"",1.34e-02,(""EGAN00..."
chr1:17407,"[""G"",""A""]","[(""EGAN00001343225"",5.75e-03,(""EGAN00..."
chr1:17452,"[""C"",""T""]","[(""EGAN00001313169"",3.21e-03,(""EGAN00..."
chr1:17487,"[""C"",""A""]","[(""EGAN00001315786"",3.33e-06,(""EGAN00..."
chr1:17512,"[""C"",""G""]","[(""EGAN00001315333"",4.13e-04,(""EGAN00..."
chr1:17519,"[""G"",""T""]","[(""EGAN00001324367"",2.20e-02,(""EGAN00..."
chr1:63628,"[""G"",""A""]","[(""EGAN00001342824"",3.33e-06,(""EGAN00..."
chr1:133383,"[""A"",""ACT""]","[(""EGAN00001342875"",3.33e-06,(""EGAN00..."
chr1:135166,"[""T"",""TGAGGCC""]","[(""EGAN00001319138"",2.01e-05,(""EGAN00..."


In [8]:
ht.de_novo_data.p_de_novo.show()

locus,alleles,Unnamed: 2_level_0
locus<GRCh38>,array<str>,array<float64>
chr1:16959,"[""G"",""C""]",[2.84e-01]
chr1:17375,"[""A"",""G""]",[5.05e-02]
chr1:17407,"[""G"",""A""]",[1.54e-01]
chr1:17452,"[""C"",""T""]",[5.93e-01]
chr1:17487,"[""C"",""A""]",[6.12e-01]
chr1:17512,"[""C"",""G""]",[9.18e-02]
chr1:17519,"[""G"",""T""]","[1.60e-01,5.54e-01]"
chr1:63628,"[""G"",""A""]",[1.00e+00]
chr1:133383,"[""A"",""ACT""]",[5.00e-02]
chr1:135166,"[""T"",""TGAGGCC""]",[4.52e-01]


In [9]:
run_hash="91b132aa"
ht_RF=hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/rf_result_sanger_cohorts_new.ht')

In [12]:
ht_RF=ht_RF.annotate(de_novo_data=ht[ht_RF.key].de_novo_data)

In [19]:
ht_RF.write(f"{tmp_dir}/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO.ht", overwrite=True)

2020-11-18 11:42:10 Hail: INFO: wrote table with 13669739 rows in 500 partitions to hdfs://spark-master:9820//variant_qc/models/91b132aa/91b132aa_rf_result_sanger_cohorts_DENOVO.ht


In [29]:
ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO.ht')

In [30]:
ht.describe()

----------------------------------------
Global fields:
    'feature_medians': dict<tuple (
        str
    ), struct {
        a_index: int32, 
        n_alt_alleles: int32, 
        QD: float64, 
        MQRankSum: float64, 
        SOR: float64, 
        ReadPosRankSum: float64, 
        FS: float64, 
        DP: int32
    }> 
    'variants_by_strata': dict<tuple (
        str
    ), int64> 
    'features_importance': dict<str, float64> 
    'features': array<str> 
    'test_results': array<struct {
        rf_prediction: str, 
        rf_label: str, 
        n: int32
    }> 
    'rf_hash': str 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'QD': float64 
    'MQRankSum': float64 
    'SOR': float64 
    'ReadPosRankSum': 

In [31]:
ht=ht.annotate(high_quality_denovo=hl.is_defined(ht.de_novo_data.p_de_novo[0]> 0.9))

In [11]:
mt = mt.annotate_rows(family_stats=ht[mt.row_key].family_stats)

In [13]:
fam = f"{temp_dir}/ddd-elgh-ukbb/variant_qc/DDD_trios.fam"
pedigree = hl.Pedigree.read(fam)

In [14]:
de_novo_table = hl.de_novo(
        mt, pedigree, mt.family_stats[0].unrelated_qc_callstats.AF[1])

2020-11-17 16:09:03 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'


In [16]:
de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')    

In [18]:
de_novo_table.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'de_novo_data': array<struct {
        id: str, 
        prior: float64, 
        proband: struct {
            s: str
        }, 
        father: struct {
            s: str
        }, 
        mother: struct {
            s: str
        }, 
        proband_entry: struct {
            AD: array<int32>, 
            DP: int32, 
            GQ: int32, 
            GT: call, 
            MIN_DP: int32, 
            PGT: call, 
            PID: str, 
            PL: array<int32>, 
            PS: int32, 
            RGQ: int32, 
            SB: array<int32>
        }, 
        father_entry: struct {
            AD: array<int32>, 
            DP: int32, 
            GQ: int32, 
            GT: call, 
            MIN_DP: int32, 
            PGT: call, 
            PID: str, 
            PL: array<int32>, 
           

In [19]:
de_novo_table.show()

locus,alleles,de_novo_data
locus<GRCh38>,array<str>,"array<struct{id: str, prior: float64, proband: struct{s: str}, father: struct{s: str}, mother: struct{s: str}, proband_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, father_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, mother_entry: struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, MIN_DP: int32, PGT: call, PID: str, PL: array<int32>, PS: int32, RGQ: int32, SB: array<int32>}, is_female: bool, p_de_novo: float64, confidence: str}>"
chr1:16959,"[""G"",""C""]","[(""EGAN00001315776"",3.36e-05,(""EGAN00..."
chr1:17375,"[""A"",""G""]","[(""EGAN00001343225"",1.34e-02,(""EGAN00..."
chr1:17407,"[""G"",""A""]","[(""EGAN00001343225"",5.75e-03,(""EGAN00..."
chr1:17452,"[""C"",""T""]","[(""EGAN00001313169"",3.21e-03,(""EGAN00..."
chr1:17487,"[""C"",""A""]","[(""EGAN00001315786"",3.33e-06,(""EGAN00..."
chr1:17512,"[""C"",""G""]","[(""EGAN00001315333"",4.13e-04,(""EGAN00..."
chr1:17519,"[""G"",""T""]","[(""EGAN00001324367"",2.20e-02,(""EGAN00..."
chr1:63628,"[""G"",""A""]","[(""EGAN00001342824"",3.33e-06,(""EGAN00..."
chr1:133383,"[""A"",""ACT""]","[(""EGAN00001342875"",3.33e-06,(""EGAN00..."
chr1:135166,"[""T"",""TGAGGCC""]","[(""EGAN00001319138"",2.01e-05,(""EGAN00..."


In [5]:
run_hash="91b132aa"
ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS.ht')

In [6]:
ht.consequence.summarize()

0,1
Non-missing,552327 (4.04%)
Missing,13117412 (95.96%)
Min Size,18
Max Size,86
Mean Size,21.86
Sample Values,"['synonymous_variant', 'synonymous_variant', 'synonymous_variant', 'synonymous_variant', 'synonymous_variant']"
