In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
from scipy import stats
import pickle 
from matplotlib import pyplot as plt
from typing import Any, Counter, List, Optional, Tuple, Union
from hail.plot import show, output_notebook
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = "file:///home/ubuntu/data/tmp"
plot_dir = "/home/ubuntu/data/tmp"

sc = pyspark.SparkContext()
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", "8YY584J59H7Q6AVKHSU8")
hadoop_config.set("fs.s3a.secret.key", "P8vePa7JUvxKXX2me9ti1cGujgYWMoimAwx4mMlM")
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.5
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.41-b8144dba46e6
LOGGING: writing to /home/ubuntu/data/tmp/scripts/sanger_gnomad_hail_qc/notebooks/hail-20201020-1614-0.2.41-b8144dba46e6.log


The first step was to create the hail table with the truthset and negative (fail filters)
I have used variants from our datasets from the follwing chromosomes:
chr1, chr2, chr3, chr4 and chr20

In [2]:
ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht')

In [3]:
ht.describe()

----------------------------------------
Global fields:
    'feature_medians': dict<tuple (
        str
    ), struct {
        a_index: int32, 
        n_alt_alleles: int32
    }> 
    'variants_by_strata': dict<tuple (
        str
    ), int64> 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'a_index': int32 
    'was_split': bool 
    'InbreedingCoeff': float32 
    'variant_type': str 
    'allele_type': str 
    'n_alt_alleles': int32 
    'was_mixed': bool 
    'has_star': bool 
    'AS_QD': array<float64> 
    'AS_MQRankSum': array<float64> 
    'AS_SOR': array<float64> 
    'AS_ReadPosRankSum': array<float64> 
    'hapmap': bool 
    'omni': bool 
    'mills': bool 
    'kgp_phase1_hc': bool 
    'transmitted_singleton': bool 
    'fail_hard_filters': bool 
    'feature_imputed': struct {
        a_index: bool, 
        n_alt_alleles: bool
    } 
----------------------------------------
Key: ['locus', 'alleles']
-

How many positive sites and how many negative sites do we have:

In [11]:
print(ht.omni.summarize())
print(ht.mills.summarize())
print(ht.kgp_phase1_hc.summarize())
print(ht.hapmap.summarize())
print(ht.fail_hard_filters.summarize())

0,1
Non-missing,64417 (0.82%)
Missing,7790190 (99.18%)
Counts,{True: 64417}


None


0,1
Non-missing,17319 (0.22%)
Missing,7837288 (99.78%)
Counts,{True: 17319}


None


0,1
Non-missing,348055 (4.43%)
Missing,7506552 (95.57%)
Counts,{True: 348055}


None


0,1
Non-missing,70005 (0.89%)
Missing,7784602 (99.11%)
Counts,{True: 70005}


None


0,1
Non-missing,7853399 (99.98%)
Missing,1208 (0.02%)
Counts,"{False: 7495017, True: 358382}"


None


In [13]:
fp_expr = ht.fail_hard_filters
tp_expr = ht.omni | ht.mills | ht.kgp_phase1_hc | ht.hapmap
print(" Total Negative sites:")
print(fp_expr.summarize())
print("Total Positive sites:")
print(tp_expr.summarize())

 Total Negative sites:


0,1
Non-missing,7853399 (99.98%)
Missing,1208 (0.02%)
Counts,"{False: 7495017, True: 358382}"


None
Total Positive sites:


0,1
Non-missing,371387 (4.73%)
Missing,7483220 (95.27%)
Counts,{True: 371387}


None


*****************************
SUMMARY:
Total Negative Sites:358382
Total Positive Sites:371387
******************************

Train RF. 
command:
PYSPARK_DRIVER_PYTHON=/home/ubuntu/venv/bin/python spark-submit --master local[*] variant_qc/3.train_apply_finalise_RF.py --train_rf --fp_to_tp 1.0 > train_RF.log 2>&1

This command trains the RF using the positive and negative sites and applies it as a test to a selected chromosome. I have used chr20 and chr4 in different runs and they gave similar results. I copy here the results for chr4.

The results of training are saved in two files:
training hail table: ht=hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/e8a43dd1/training.ht')
training model: model=f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/e8a43dd1/model.model'
The training step the rf_label and rf_train columns to the hail table. 
The hash e8a43dd1 is unique to this model and it is used as input to the next step of applying the RF model to all the dataset. 

Apply RF 
command:
PYSPARK_DRIVER_PYTHON=/home/ubuntu/venv/bin/python spark-submit --master local[*] variant_qc/3.train_apply_finalise_RF.py --apply_rf --run_hash e8a43dd1 > apply_RF_chr4.log 2>&1

This saves the result of the application to a hail table and adds a column rf_prediction:
ht=hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/e8a43dd1/rf_result_sanger_cohorts_new.ht')


In [2]:
ht=hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/e8a43dd1/rf_result_sanger_cohorts_new.ht')

In [18]:
#Initial rf_train training labels:
ht.aggregate(hl.agg.counter(ht.rf_label))

{'FP': 349871, 'TP': 362888, None: 7150788}

In [19]:
#chr4 training results
ht.aggregate(hl.agg.counter(ht.rf_train))

{False: 23107, True: 698607, None: 7141833}

In [15]:
# Final predition values
ht.aggregate(hl.agg.counter(ht.rf_prediction))

{'FP': 4849919, 'TP': 3012587, None: 1041}

In [21]:
# The highest rf_probability values are now 0.83 
ht.rf_probability.summarize()

0,1
Non-missing,7863547 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00

0,1
Non-missing,15727094 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00
Sample Values,"['FP', 'TP', 'FP', 'TP', 'FP']"

0,1
Non-missing,15725012 (99.99%)
Missing,2082 (0.01%)
Minimum,0.17
Maximum,0.83
Mean,0.50
Std Dev,0.20


In [14]:
p = hl.plot.histogram(ht.rf_probability.values(), legend='rf_probability', title='rf_probability Histogram')
show(p)

TypeError: histogram: parameter 'data': expected (hail.utils.struct.Struct or expression of type float64), found hail.expr.expressions.typed_expressions.ArrayNumericExpression: <ArrayNumericExpression of type array<float64>>

In [28]:
values=ht.rf_probability.values().collect()

In [None]:
p = hl.plot.histogram(values, legend='rf_probability', title='rf_probability Histogram')
show(p)

The final predictions after the application step:
{'FP': 4849919, 'TP': 3012587, None: 1041}

The next step is the finalisation step which requires ranking of the variants to finalise the predictions. 