In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union,Dict,Set
from hail.plot import show, output_notebook
from bokeh.palettes import d3  # pylint: disable=no-name-in-module
from bokeh.models import Plot, Row, Span, NumeralTickFormatter, LabelSet
from gnomad.utils.plotting import *
from typing import Set, Tuple

tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", "8YY584J59H7Q6AVKHSU8")
hadoop_config.set("fs.s3a.secret.key", "P8vePa7JUvxKXX2me9ti1cGujgYWMoimAwx4mMlM")
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.7
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.60-de1845e1c2f6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20210208-1016-0.2.60-de1845e1c2f6.log


In [2]:
mt=hl.read_matrix_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_after_RF_final.mt')

In [4]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: floa

In [5]:
def annotate_samples_with_cohort_info(mt: hl.MatrixTable, cohort_file) -> hl.MatrixTable:
    '''

    :param mt: matrixtable with cohort samples and variants
    :param cohort_file: a txt file with no header line and 2 columns, 1st: for sampleID;  2nd: cohortname; example:/lustre/scratch115/projects/autozyg/new_autozyg_DDD_callset.April2019/sample_list.after_QC.ELGH_BiB_Birm_controls_only.with_cohort_labels.txt
    :return: matrixtable with new column annotation
    '''
    # import the tab delimited file. Note that it is important for joins of tables to have defined keys in the hail tables
    table_cohort = hl.import_table(cohort_file, key='sample')
    # annotate the samples with a new attribute called cohort:
    mt_result = mt.annotate_cols(cohort=table_cohort[mt.s].cohort)
    return mt_result


In [4]:
#MAF, AC, AN per population
table_cohort = hl.import_table(
        f"{temp_dir}/ddd-elgh-ukbb/sanger_cohorts_corrected_ukbb_july_2020.tsv", delimiter="\t").key_by('s')


2021-02-08 10:16:50 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str (not specified)
  Loading field 'cohort' as type str (not specified)


In [5]:
mt_result = mt.annotate_cols(cohort=table_cohort[mt.s].cohort)
#mt_annotated.write(
#        f"{tmp_dir}/ddd-elgh-ukbb/WES_annotated.mt", overwrite=True)


In [6]:
mt_result.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'cohort': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
     

In [17]:
cohorts=mt_result.cohort.describe()

--------------------------------------------------------
Type:
        str
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7fc1ae7cc9e8>
Index:
    ['column']
--------------------------------------------------------


In [21]:
df=pd.read_csv(f"{temp_dir}/ddd-elgh-ukbb/sanger_cohorts_corrected_ukbb_july_2020.tsv",sep="\t")

In [25]:
cohorts_array=df.cohort.unique()
cohorts_array

array(['UK_10K_CHD', 'DDD', 'Birmingham_part1', 'BiB_part1',
       'Birmingham_part2', 'ELGH_part1', 'ELGH_part4', 'BiB_part2',
       'ELGH_part3', 'ELGH_part5', 'UKBB'], dtype=object)

In [32]:
cohorts_dict={}
for cohort in cohorts_array:
    cohorts[cohort]=mt_result['cohort']

TypeError: 'NoneType' object does not support item assignment

In [39]:
for cohort in cohorts_array:
    mt_cohort=mt_result.filter_cols(mt_result['cohort']==cohort)
    #numofsamples=mt_cohort.aggregate_cols(hl.agg.count_where(hl.is_defined('s')))
    mt_result.annotate_rows(AC+f'{cohort}'=cohrt)
    #print(numofsamples)
    #print(cohort)

SyntaxError: keyword can't be an expression (<ipython-input-39-6ed2262583e5>, line 4)

In [None]:
mt_result.annotate_rows(AC+f{})

In [None]:
numofsamples=mt.aggregate_cols(hl.agg.count_where(hl.is_defined(phenotype)))
        fraction=(int(numofsamples)*100/group_samples)
        missingness=100-fraction

In [None]:
    n_called=agg.count_where(hl.is_defined(mt.GT))
    missingness=n_called/numofsamples *2
    mt=mt.annotate_rows(call_stats= hl.agg.call_stats(mt.GT, mt.alleles))
    mt=mt.annotate_rows(AC= mt.call_stats.AC)
    mt=mt.annotate_rows(AN= mt.call_stats.AN)
    mt=mt.annotate_rows(maf= hl.float64(mt.final_AC[1]/mt.final_AN))
    mt=mt.annotate_rows(final_call_rate=hl.float64(0))


In [42]:
mt=mt_result

mt = mt.annotate_rows(
        MAF_cohorts = hl.agg.group_by(mt.cohort,
                              hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AF))
)
mt=mt.annotate_rows(
AN_cohorts = hl.agg.group_by(mt.cohort,
                              hl.max(hl.agg.call_stats(mt.GT, mt.alleles).AN))
)

mt=mt.annotate_rows(
AC_cohorts = hl.agg.group_by(mt.cohort,
                              hl.max(hl.agg.call_stats(mt.GT, mt.alleles).AC))
)

mt =mt.annotate_rows(
missingness_cohorts= hl.agg.group_by(mt.cohort, hl.max( (hl.agg.count_where(hl.is_missing(mt['GT'])))/ mt.count_rows()*2))

)



In [43]:
mt=mt.annotate_rows(
    info = mt.info.annotate(MAF_cohorts=mt.MAF_cohorts)
)
mt=mt.annotate_rows(
    info = mt.info.annotate(AN_cohorts=mt.AN_cohorts)
)
mt=mt.annotate_rows(
    info = mt.info.annotate(AC_cohorts=mt.AC_cohorts)
)
mt=mt.annotate_rows(
    info = mt.info.annotate(missingness_cohorts=mt.missingness_cohorts)
)

In [45]:
mt.AN_cohorts.show()

KeyboardInterrupt: 

In [32]:
mt.info.A

2021-02-08 10:43:45 Hail: WARN: export_vcf: ignored the following fields:
    'cohort' (column)
    'a_index' (row)
    'was_split' (row)
    'Variant_Type' (row)
    'filtercol' (row)
    'MAF_cohorts' (row)
    'AN_cohorts' (row)
    'AC_cohorts' (row)
    'missingness_cohorts' (row)


KeyboardInterrupt: 

In [8]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'cohort': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
     

In [9]:
mt.AF.show()

locus,alleles,AF
locus<GRCh38>,array<str>,"dict<str, float64>"
chr1:12938,"[""GCAAA"",""G""]","{""ELGH_part3"":0.00e+00},""DDD"":4.07e-04},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13024,"[""G"",""A""]","{""ELGH_part3"":0.00e+00},""DDD"":1.45e-04},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13087,"[""A"",""G""]","{""ELGH_part3"":0.00e+00},""DDD"":9.67e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":1.34e-03},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13116,"[""T"",""C""]","{""ELGH_part3"":0.00e+00},""DDD"":6.33e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13130,"[""C"",""T""]","{""ELGH_part3"":0.00e+00},""DDD"":7.14e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13151,"[""G"",""C""]","{""ELGH_part3"":0.00e+00},""DDD"":6.20e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13164,"[""G"",""A""]","{""ELGH_part3"":0.00e+00},""DDD"":5.62e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13176,"[""G"",""T""]","{""ELGH_part3"":0.00e+00},""DDD"":7.80e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13198,"[""C"",""A""]","{""ELGH_part3"":0.00e+00},""DDD"":2.27e-05},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
chr1:13216,"[""C"",""G""]","{""ELGH_part3"":0.00e+00},""DDD"":1.06e-04},""Birmingham_part1"":0.00e+00},""Birmingham_part2"":0.00e+00},""UKBB"":0.00e+00},""ELGH_part5"":0.00e+00},""BiB_part1"":0.00e+00},""ELGH_part1"":0.00e+00},""BiB_part2"":0.00e+00},""ELGH_part4"":0.00e+00},""UK_10K_CHD"":0.00e+00}}"
