In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union,Dict,Set
from hail.plot import show, output_notebook
from bokeh.palettes import d3  # pylint: disable=no-name-in-module
from bokeh.models import Plot, Row, Span, NumeralTickFormatter, LabelSet
from gnomad.utils.plotting import *
from typing import Set, Tuple

# --- Storage locations -------------------------------------------------------
# NOTE: `tmp_dir` (HDFS) and `temp_dir` (local filesystem) are two *different*
# locations with near-identical names. `tmp_dir` ends in "/", so paths built as
# f"{tmp_dir}/..." contain a double slash after the port.
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

# --- Spark / Hadoop configuration -------------------------------------------
sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()

# S3A credentials are taken from the environment so that secrets are never
# stored in the notebook or its version history. A missing variable raises
# KeyError here, before any Spark job is started.
# SECURITY: the key pair that used to be hardcoded in this cell must be treated
# as compromised and rotated.
hadoop_config.set("fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
hadoop_config.set("fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])

# Register the Google Cloud Storage connector for gs:// paths.
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# --- Hail --------------------------------------------------------------------
# Reuse the Spark context configured above; GRCh38 becomes the default
# reference genome for the session.
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()

# --- Logging -----------------------------------------------------------------
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.5
SparkUI available at http://spark-master:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.41-b8144dba46e6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20201202-1343-0.2.41-b8144dba46e6.log


In [6]:
# Header-less text file; with no_header=True the columns are addressed as
# f0 and f1, and their types are given explicitly (no type imputation).
variant_list = f"{temp_dir}/intervalwgs/variant_list.txt"

ht = hl.import_table(
    variant_list,
    no_header=True,
    types={'f0': 'str', 'f1': 'int32'},
)


2020-11-20 15:20:23 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)



In [7]:
# Expose f0/f1 under the names `chrom` and `position` and keep only those two
# fields (a single select with named expressions replaces annotate + select).
ht = ht.select(chrom=ht.f0, position=ht.f1)

In [8]:
# Build a GRCh38 locus expression from (chrom, position) and use it as the
# table key under the field name `locus`.
variant_locus = hl.locus(ht.chrom, ht.position, reference_genome='GRCh38')
ht = ht.key_by(locus=variant_locus)

In [10]:
# Persist the locus-keyed variant list under `tmp_dir`, replacing any previous
# copy at that path.
variant_list_ht_path = f"{tmp_dir}/intervalwgs/variant_list.ht"
ht.write(variant_list_ht_path, overwrite=True)

2020-11-20 15:22:16 Hail: INFO: Coerced sorted dataset
2020-11-20 15:22:19 Hail: INFO: wrote table with 287 rows in 101 partitions to hdfs://spark-master:9820//intervalwgs/variant_list.ht


In [3]:
# Load the variant-list table.
# NOTE(review): this reads from `temp_dir` (local filesystem), whereas the cell
# that writes variant_list.ht targets `tmp_dir` (HDFS). Confirm the table has
# been copied to this location, or point both cells at the same directory;
# otherwise a fresh Restart & Run All will not find it here.
variant_list_ht = f"{temp_dir}/intervalwgs/variant_list.ht"
ht = hl.read_table(variant_list_ht)

In [4]:
# Load the WGS matrix table from `temp_dir`.
wgs_mt_path = f'{temp_dir}/intervalwgs/WGS_final_march_2020_dbsnp_v53.mt'
mt = hl.read_matrix_table(wgs_mt_path)

In [5]:
# Keep only rows whose locus has an entry in `ht`. The lookup uses the locus
# alone (alleles are not part of the lookup expression).
locus_in_variant_list = hl.is_defined(ht[mt.locus])
mt1 = mt.filter_rows(locus_in_variant_list)


In [None]:
# Persist the filtered matrix table under `tmp_dir`, replacing any previous
# copy at that path.
subset_mt_path = f"{tmp_dir}/intervalwgs/variant_list_for_kousik.mt"
mt1.write(subset_mt_path, overwrite=True)

In [15]:
# Print the schema (global, column, row and entry fields) of the filtered
# matrix table.
mt1.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'sample_QC_nonHail_unfiltered': struct {
        SupplierName: str, 
        SangerName: str, 
        ALIQUOT_LAB1: str, 
        GenotypeID: int64, 
        GenotypeData: int64, 
        Identifier: int32, 
        Wgs_RAW_bl: str, 
        Wgs_RAW_24m: str, 
        Study: str, 
        BWA: str, 
        DUP: str, 
        `Inferred.GWA.ID`: int64, 
        MatchedIDs: int32, 
        `PIHAT.01875`: str, 
        SEX: int32, 
        KARYOTYPE: str, 
        Depth: float64, 
        `Median.FreeMix.NPG`: float64, 
        `Median.FreeMix.HGI`: float64, 
        NRD: float64, 
        Missing: int32, 
        Het: int32, 
        HomAlt: int32, 
        HetHomAlt: float64, 
        `Mean.Chim`: float64, 
        `PASS.ID`: int32, 
        `PASS.DUP`: int32, 
        `PASS.SampleSwap`: int32, 
        `PASS.PIHAT`: int32, 
        `PASS.Sex`: int32

In [16]:
# Take the row (per-variant) fields of the filtered matrix table as a Table.
ht1=mt1.rows()

In [17]:
# Print the schema of the rows table.
ht1.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        SOR: float64
    } 
    'a_index': int32 
    'was_split': bool 
    'Variant_Type': str 
    'VQSLOD_SNP': struct {
        VQSLOD: float64
    } 
    'VQSLOD_INDEL': struct {
        VQSLOD: float64
    } 
    'variant_QC_Hail': struct {
        dp_stats: struct {
            mean: float64, 
            stdev

In [18]:
# Flatten nested struct fields into top-level columns, then export the rows
# table as a TSV with a header line under `tmp_dir`.
flat_rows = ht1.flatten()
flat_rows.export(f'{tmp_dir}/variant_list.tsv.bgz', header=True)

2020-11-20 15:40:13 Hail: INFO: merging 11054 files totalling 383.4K...
2020-11-20 15:40:24 Hail: INFO: while writing:
    hdfs://spark-master:9820//variant_list.tsv.bgz
  merge time: 11.023s
