In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union
from hail.plot import show, output_notebook
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", "8YY584J59H7Q6AVKHSU8")
hadoop_config.set("fs.s3a.secret.key", "P8vePa7JUvxKXX2me9ti1cGujgYWMoimAwx4mMlM")
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.5
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.41-b8144dba46e6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20201019-1236-0.2.41-b8144dba46e6.log


In [2]:
ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF.ht')

In [3]:
LABEL_COL = "rf_label"
TRAIN_COL = "rf_train"
PREDICTION_COL = "rf_prediction"
INFO_FEATURES = [
    "AS_QD",
    "AS_ReadPosRankSum",
    "AS_MQRankSum",
    "AS_SOR",
]  # Note: AS_SOR is currently in VQSR HT and named SOR in the VQSR split HT
FEATURES = [
    "InbreedingCoeff",
    "variant_type",
    "allele_type",
    "n_alt_alleles",
    "was_mixed",
    "has_star",
    "AS_QD",
    "AS_MQRankSum",
    "AS_SOR",
    "AS_ReadPosRankSum",
]
TRUTH_DATA = ["hapmap", "omni", "mills", "kgp_phase1_hc"]
INBREEDING_COEFF_HARD_CUTOFF = -0.3


In [8]:
features = FEATURES
test_intervals = 'chr20'

fp_expr = ht.fail_hard_filters
tp_expr = ht.omni | ht.mills

tp_expr = tp_expr | ht.transmitted_singleton

if test_intervals:
    if isinstance(test_intervals, str):
        test_intervals = [test_intervals]
    test_intervals = [
        hl.parse_locus_interval(x, reference_genome="GRCh38")
        for x in test_intervals
    ]
ht = ht.annotate(tp=tp_expr, fp=fp_expr)


In [9]:
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'AS_QD': array<float64> 
    'AS_MQRankSum': array<float64> 
    'AS_SOR': array<float64> 
    'AS_ReadPosRankSum': array<float64> 
    'hapmap': bool 
    'omni': bool 
    'mills': bool 
    'kgp_phase1_hc': bool 
    'transmitted_singleton': bool 
    'fail_hard_filters': bool 
    'tp': bool 
    'fp': bool 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


In [10]:
ht.count()

7854607

In [7]:
test_intervals="chr20"
if isinstance(test_intervals, str):
    test_intervals = [test_intervals]
    test_intervals = [hl.parse_locus_interval(x, reference_genome="GRCh38")
            for x in test_intervals
        ]

In [13]:
print(hl.eval(test_intervals))


[Interval(start=Locus(contig=chr20, position=1, reference_genome=GRCh38), end=Locus(contig=chr20, position=64444167, reference_genome=GRCh38), includes_start=True, includes_end=True)]


In [None]:
test_expr=hl.literal(test_intervals).any(lambda interval: interval.contains(ht.locus)

In [2]:
 mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1to3-20_split.mt')

In [2]:
truth_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht')
trio_stats_table = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
    #inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
allele_data_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_allele_data.ht')
allele_counts_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_qc_ac.ht')
allele_counts_ht=hl.read_table('hdfs://spark-master:9820/ddd-elgh-ukbb/Sanger_cohorts_qc_ac.ht')
inbreeding_ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
group = "raw"
mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1to3-20_split.mt')


In [4]:
ht = mt.rows()
ht = ht.transmute(**ht.info)
ht = ht.select("FS", "MQ", "QD", "InbreedingCoeff", *INFO_FEATURES)

trio_stats_ht = trio_stats_table.select(
        f"n_transmitted_{group}", f"ac_children_{group}"
    )


In [10]:
allele_counts_ht=hl.read_table('hdfs://spark-master:9820/ddd-elgh-ukbb/Sanger_cohorts_qc_ac.ht')
allele_counts_ht=allele_counts_ht.drop('rsid')
ht = ht.annotate(
        **inbreeding_ht[ht.key],
        **trio_stats_ht[ht.key],
        **truth_data_ht[ht.key],
        **allele_data_ht[ht.key].allele_data,
        **allele_counts_ht[ht.key],
    )

TypeError: annotate() got multiple values for keyword argument 'qual'

In [8]:
allele_counts_ht.show(5)

locus,alleles,rsid,qual,filters,info.AC,info.AF,info.AN,info.AS_BaseQRankSum,info.AS_FS,info.AS_InbreedingCoeff,info.AS_MQ,info.AS_MQRankSum,info.AS_QD,info.AS_ReadPosRankSum,info.AS_SOR,info.BaseQRankSum,info.DB,info.DP,info.DS,info.END,info.ExcessHet,info.FS,info.InbreedingCoeff,info.MLEAC,info.MLEAF,info.MQ,info.MQRankSum,info.QD,info.RAW_MQandDP,info.ReadPosRankSum,info.SOR,a_index,was_split,ac_qc_samples_raw,ac_qc_samples_adj
locus<GRCh38>,array<str>,str,float64,set<str>,array<int32>,array<float64>,int32,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,float64,bool,int32,bool,int32,float64,float64,float64,array<int32>,array<float64>,float64,float64,float64,array<int32>,float64,float64,int32,bool,int64,int64
chr1:12938,"[""GCAAA"",""G""]","""rs756849893""",327.0,,[2],[3.43e-04],5828,,,[-1.68e-01],,,[2.54e+01],,,,True,3591,False,,0.0004,0.0,-0.168,[64],[1.10e-02],25.0,,28.7,,,2.3,1,False,2,0
chr1:13024,"[""G"",""A""]",,151.0,,[2],[1.15e-04],17384,,,[-1.47e-01],,,[3.10e+01],,,,False,13955,False,,0.0001,0.0,-0.147,[20],[1.15e-03],25.0,,27.2,,,3.26,1,False,2,0
chr1:13087,"[""A"",""G""]",,101.0,,[5],[1.24e-04],40192,,,[-1.26e-01],,,[7.23e+00],,,-0.61,False,39714,False,,0.0,0.0,-0.126,[17],[4.23e-04],33.1,-0.136,7.23,,1.69,0.595,1,False,5,1
chr1:13116,"[""T"",""C""]",,123.0,,[2],[4.91e-05],40732,,,[-1.32e-01],,,[2.46e+01],,,,False,41952,False,,0.0008,0.0,-0.132,[8],[1.96e-04],28.0,,24.6,,,1.02,1,False,2,0
chr1:13130,"[""C"",""T""]",,82.3,,[2],[5.57e-05],35920,,,[-1.47e-01],,,[2.74e+01],,,,False,31819,False,,0.0002,0.0,-0.147,[9],[2.51e-04],35.4,,27.4,,,1.18,1,False,2,0


In [12]:
def generate_ac(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Creates Table with QC samples, QC samples removing children and release samples raw and adj ACs.
    """
    #mt = mt.filter_cols(mt.meta.high_quality)
    fam_ht = hl.import_fam(fam_file, delimiter="\t")
    mt = mt.annotate_cols(unrelated_sample=hl.is_missing(fam_ht[mt.s]))
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)
    mt = annotate_adj(mt)
    mt = mt.annotate_rows(
        ac_qc_samples_raw=hl.agg.sum(mt.GT.n_alt_alleles()),
        #ac_qc_samples_unrelated_raw=hl.agg.filter(~mt.meta.all_samples_related, hl.agg.sum(mt.GT.n_alt_alleles())),
        #ac_release_samples_raw=hl.agg.filter(mt.meta.release, hl.agg.sum(mt.GT.n_alt_alleles())),
        ac_qc_samples_adj=hl.agg.filter(mt.adj, hl.agg.sum(mt.GT.n_alt_alleles())),
        #ac_qc_samples_unrelated_adj=hl.agg.filter(~mt.meta.all_samples_related & mt.adj, hl.agg.sum(mt.GT.n_alt_alleles())),
        #ac_release_samples_adj=hl.agg.filter(mt.meta.release & mt.adj, hl.agg.sum(mt.GT.n_alt_alleles())),
    )
    return mt.rows()

In [13]:
from gnomad.utils.annotations import annotate_adj
fam = "s3a://DDD-ELGH-UKBB-exomes/trios/DDD_trios.fam"
pedigree = hl.Pedigree.read(fam)
qc_ac_ht=generate_ac(mt, fam )

In [14]:
qc_ac_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 


In [18]:
qc_ac_ht=qc_ac_ht.select(*['ac_qc_samples_raw', 'ac_qc_samples_adj'])

In [19]:
ht = ht.annotate(
        **inbreeding_ht[ht.key],
        **trio_stats_ht[ht.key],
        **truth_data_ht[ht.key],
        **allele_data_ht[ht.key].allele_data,
        **qc_ac_ht[ht.key],
    )

In [22]:
qc_ac_ht.write(
        f'{tmp_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac.ht', overwrite=True)

KeyboardInterrupt: 

In [3]:
from gnomad.utils.annotations import annotate_adj, bi_allelic_expr
from gnomad.utils.annotations import annotate_adj, bi_allelic_expr,bi_allelic_site_inbreeding_expr

In [6]:
mt1=mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

In [8]:
mt1.rows().describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        RAW_MQandDP: array<int32>, 
