In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union,Dict,Set
from hail.plot import show, output_notebook
from bokeh.palettes import d3  # pylint: disable=no-name-in-module
from bokeh.models import Plot, Row, Span, NumeralTickFormatter, LabelSet
from gnomad.utils.plotting import *
from typing import Set, Tuple

# Storage roots used throughout the notebook:
#   tmp_dir  - HDFS location handed to hl.init() as Hail's temporary directory
#   temp_dir - local-filesystem (file://) root that input tables are read from
#   plot_dir - plain local path for plot output
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

# S3 credentials are taken from the environment instead of being embedded in
# the notebook, because notebooks are shared and committed together with their
# source cells. They are read *before* the SparkContext is created so that a
# missing variable fails fast with a KeyError rather than leaving a
# half-configured Spark session behind.
# NOTE(review): the key pair that was previously hard-coded in this cell should
# be treated as leaked and rotated.
s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]

sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", s3_access_key)
hadoop_config.set("fs.s3a.secret.key", s3_secret_key)
# Drop the plaintext secrets from the kernel namespace so a later cell cannot
# display them by accident.
del s3_access_key, s3_secret_key

# Map gs:// paths to the Google Cloud Storage connector classes.
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Start Hail on the configured SparkContext with GRCh38 as the default reference.
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()

# Notebook-level logger at INFO, formatted as "LEVEL (logger lineno): message".
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.5
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.41-b8144dba46e6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20201125-1357-0.2.41-b8144dba46e6.log


In [6]:
# Load the cohort MatrixTable from the variant_qc directory under temp_dir.
mt_path = f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_split.mt'
mt = hl.read_matrix_table(mt_path)

In [7]:
# Print the MatrixTable schema (global, column and row fields) to inspect what was loaded.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: floa

In [3]:
# Identifier of the random-forest run; it names both the model directory and the table file.
run_hash = "91b132aa"

# Read that run's RF result table from the variant_qc/models directory.
rf_result_path = (
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/'
    f'{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS.ht'
)
ht = hl.read_table(rf_result_path)

In [12]:
# Join the MatrixTable's row table onto ht by ht's key and copy info.AC
# into a new top-level row field named AC.
mt_rows = mt.rows()
ht = ht.annotate(AC=mt_rows[ht.key].info.AC)

In [13]:
# Print the table schema (global and row fields); presumably run here to check the newly annotated AC field.
ht.describe()

----------------------------------------
Global fields:
    'feature_medians': dict<tuple (
        str
    ), struct {
        a_index: int32, 
        n_alt_alleles: int32, 
        QD: float64, 
        MQRankSum: float64, 
        SOR: float64, 
        ReadPosRankSum: float64, 
        FS: float64, 
        DP: int32
    }> 
    'variants_by_strata': dict<tuple (
        str
    ), int64> 
    'features_importance': dict<str, float64> 
    'features': array<str> 
    'test_results': array<struct {
        rf_prediction: str, 
        rf_label: str, 
        n: int32
    }> 
    'rf_hash': str 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'QD': float64 
    'MQRankSum': float64 
    'SOR': float64 
    'ReadPosRankSum': 

In [14]:
# Persist the AC-annotated table under tmp_dir, replacing any previous copy.
# run_hash is restated so that this cell does not depend on the earlier
# assignment surviving out-of-order execution.
run_hash = "91b132aa"

# tmp_dir ends with "/", so joining with another "/" produced
# "hdfs://spark-master:9820//91b132aa_..." (visible in the write log).
# Strip the trailing slash to build a clean path.
# NOTE(review): tmp_dir is also Hail's temporary directory - confirm that this
# output is meant to live in scratch space rather than alongside the inputs.
ac_table_path = (
    f"{tmp_dir.rstrip('/')}/"
    f"{run_hash}_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS_AC.ht"
)
ht.write(ac_table_path, overwrite=True)

2020-11-25 14:12:06 Hail: INFO: wrote table with 13669739 rows in 500 partitions to hdfs://spark-master:9820//91b132aa_rf_result_sanger_cohorts_DENOVO_family_stats_SYNONYMOUS_AC.ht


In [4]:
# Show the type of the family_stats row field: an array of structs holding
# mendel, tdt, unrelated_qc_callstats and meta.
ht.family_stats.describe()

--------------------------------------------------------
Type:
        array<struct {
        mendel: struct {
            errors: int64
        }, 
        tdt: struct {
            t: int64, 
            u: int64, 
            chi_sq: float64, 
            p_value: float64
        }, 
        unrelated_qc_callstats: struct {
            AC: array<int32>, 
            AF: array<float64>, 
            AN: int32, 
            homozygote_count: array<int32>
        }, 
        meta: dict<str, str>
    }>
--------------------------------------------------------
Source:
    <hail.table.Table object at 0x7fc56e3c4a20>
Index:
    ['row']
--------------------------------------------------------


In [6]:
# Preview family_stats for the first rows, displayed alongside locus and alleles.
ht.family_stats.show()

locus,alleles,family_stats
locus<GRCh38>,array<str>,"array<struct{mendel: struct{errors: int64}, tdt: struct{t: int64, u: int64, chi_sq: float64, p_value: float64}, unrelated_qc_callstats: struct{AC: array<int32>, AF: array<float64>, AN: int32, homozygote_count: array<int32>}, meta: dict<str, str>}>"
chr1:12938,"[""GCAAA"",""G""]","[((0),(NA,NA,NA,NA),([4198,2],[1.00e+..."
chr1:13024,"[""G"",""A""]","[((1),(NA,NA,NA,NA),([12992,0],[1.00e..."
chr1:13087,"[""A"",""G""]","[((1),(NA,NA,NA,NA),([30709,3],[1.00e..."
chr1:13116,"[""T"",""C""]","[((0),(NA,NA,NA,NA),([31130,2],[1.00e..."
chr1:13130,"[""C"",""T""]","[((0),(NA,NA,NA,NA),([27456,2],[1.00e..."
chr1:13151,"[""G"",""C""]","[((0),(NA,NA,NA,NA),([31880,2],[1.00e..."
chr1:13164,"[""G"",""A""]","[((0),(NA,NA,NA,NA),([35810,2],[1.00e..."
chr1:13176,"[""G"",""T""]","[((0),(NA,NA,NA,NA),([38963,1],[1.00e..."
chr1:13198,"[""C"",""A""]","[((0),(NA,NA,NA,NA),([45027,1],[1.00e..."
chr1:13216,"[""C"",""G""]","[((1),(NA,NA,NA,NA),([48651,1],[1.00e..."
