In [1]:
import os
import hail as hl
import pyspark
import bokeh
import logging
import random
import pandas as pd
import numpy as np
import pickle 
from typing import Any, Counter, List, Optional, Tuple, Union
from hail.plot import show, output_notebook
tmp_dir = "hdfs://spark-master:9820/"
sc = pyspark.SparkContext()
temp_dir = working_dir = os.path.join(os.environ["HAIL_HOME"], "tmp")
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", "8YY584J59H7Q6AVKHSU8")
hadoop_config.set("fs.s3a.secret.key", "P8vePa7JUvxKXX2me9ti1cGujgYWMoimAwx4mMlM")
hadoop_config.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_config.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference='GRCh38')
output_notebook()
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
  'pip-installed Hail requires additional configuration options in Spark referring\n'
Running on Apache Spark version 2.4.3
SparkUI available at http://spark-master:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.31-6060f9c971cc
LOGGING: writing to /opt/sanger.ac.uk/hgi/hail/tmp/scripts/sanger_gnomad_hail_qc/hail-20200723-1404-0.2.31-6060f9c971cc.log


In [2]:

def assign_population_pcs(
    pop_pca_scores: Union[hl.Table, pd.DataFrame],
    pc_cols: Union[hl.expr.ArrayExpression, List[str]],
    known_col: str = "known_pop",
    fit: Any = None,  # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside
    seed: int = 42,
    prop_train: float = 0.8,
    n_estimators: int = 100,
    min_prob: float = 0.9,
    output_col: str = "pop",
    missing_label: str = "oth",
) -> Tuple[
    Union[hl.Table, pd.DataFrame], Any
]:  # 2nd element of the tuple should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside
    """
    This function uses a random forest model to assign population labels based on the results of PCA.
    Default values for model and assignment parameters are those used in gnomAD.

    As input, this function can either take:

    - A Hail Table (typically the output of `hwe_normalized_pca`). In this case,
        - `pc_cols` should be an ArrayExpression of Floats where each element is one of the PCs to use.
        - A Hail Table will be returned as output
    - A Pandas DataFrame. In this case:
        - Each PC should be in a separate column and `pc_cols` is the list of all the columns containing the PCs to use.
        - A pandas DataFrame is returned as output

    .. note::

        If you have a Pandas Dataframe and have all PCs as an array in a single column, the
        `expand_pd_array_col` can be used to expand this column into multiple `PC` columns.

    :param pop_pc_pd: Input Hail Table or Pandas Dataframe
    :param pc_cols: Columns storing the PCs to use
    :param known_col: Column storing the known population labels
    :param RandomForestClassifier fit: fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
    :param seed: Random seed
    :param prop_train: Proportion of known data used for training
    :param n_estimators: Number of trees to use in the RF model
    :param min_prob: Minimum probability of belonging to a given population for the population to be set (otherwise set to `None`)
    :param output_col: Output column storing the assigned population
    :param missing_label: Label for samples for which the assignment probability is smaller than `min_prob`
    :return: Hail Table or Pandas Dataframe (depending on input) containing sample IDs and imputed population labels, trained random forest model
    """
    from sklearn.ensemble import RandomForestClassifier

    hail_input = isinstance(pop_pca_scores, hl.Table)
    if hail_input:
        pop_pc_pd = pop_pca_scores.select(
            known_col, pca_scores=pc_cols).to_pandas()

        # Explode the PC array
        num_out_cols = min([len(x)
                            for x in pop_pc_pd["pca_scores"].values.tolist()])
        pc_cols = [f"PC{i+1}" for i in range(num_out_cols)]
        pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())[
            list(range(num_out_cols))
        ]

    else:
        pop_pc_pd = pop_pca_scores

    train_data = pop_pc_pd.loc[~pop_pc_pd[known_col].isnull()]

    N = len(train_data)

    # Split training data into subsamples for fitting and evaluating
    if not fit:
        random.seed(seed)
        train_subsample_ridx = random.sample(
            list(range(0, N)), int(N * prop_train))
        train_fit = train_data.iloc[train_subsample_ridx]
        fit_samples = [x for x in train_fit["s"]]
        evaluate_fit = train_data.loc[~train_data["s"].isin(fit_samples)]

        # Train RF
        training_set_known_labels = train_fit[known_col].values
        training_set_pcs = train_fit[pc_cols].values
        evaluation_set_pcs = evaluate_fit[pc_cols].values

        pop_clf = RandomForestClassifier(
            n_estimators=n_estimators, random_state=seed)
        pop_clf.fit(training_set_pcs, training_set_known_labels)
        print(
            "Random forest feature importances are as follows: {}".format(
                pop_clf.feature_importances_
            )
        )

        # Evaluate RF
        predictions = pop_clf.predict(evaluation_set_pcs)
        error_rate = 1 - sum(evaluate_fit[known_col] == predictions) / float(
            len(predictions)
        )
        print("Estimated error rate for RF model is {}".format(error_rate))
    else:
        pop_clf = fit

    # Classify data
    pop_pc_pd[output_col] = pop_clf.predict(pop_pc_pd[pc_cols].values)
    probs = pop_clf.predict_proba(pop_pc_pd[pc_cols].values)
    probs = pd.DataFrame(
        probs, columns=[f"prob_{p}" for p in pop_clf.classes_])
    pop_pc_pd = pd.concat([pop_pc_pd, probs], axis=1)
    probs["max"] = probs.max(axis=1)
    pop_pc_pd.loc[probs["max"] < min_prob, output_col] = missing_label
    pop_pc_pd = pop_pc_pd.drop(pc_cols, axis="columns")

    logger.info(
        "Found the following sample count after population assignment: {}".format(
            ", ".join(
                f"{pop}: {count}"
                for pop, count in Counter(pop_pc_pd[output_col]).items()
            )
        )
    )

    if hail_input:
        pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key))
        pops_ht.annotate_globals(
            assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob)
        )
        return pops_ht, pop_clf
    else:
        return pop_pc_pd, pop_clf


In [3]:
mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_pca_scores.mt")

In [4]:
mt.count()

(3182, 93674)

In [5]:
pca_scores = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_scores_known_pop.ht")
pca_scores.show()

s,scores,known_pop
str,array<float64>,str
"""EGAN00001006259""","[2.73e-01,-1.81e-01,3.93e-03,-2.33e-03,-6.34e-02,5.34e-02,-1.50e-02,-9.84...",""""""
"""EGAN00001006260""","[2.54e-01,-1.50e-01,-4.74e-03,-1.48e-02,-4.04e-02,5.38e-02,-1.15e-02,-1.4...",""""""
"""EGAN00001006261""","[2.59e-01,-1.69e-01,-1.03e-02,-1.06e-02,-4.64e-02,4.43e-02,-2.10e-02,-3.5...",""""""
"""EGAN00001006263""","[2.19e-01,-1.58e-01,-1.80e-02,-1.78e-02,-8.02e-02,4.07e-02,6.73e-02,-1.93...",""""""
"""EGAN00001006264""","[2.36e-01,-1.99e-01,-2.98e-02,1.18e-02,-8.97e-02,2.23e-02,1.91e-02,2.21e-...",""""""
"""EGAN00001006265""","[-2.21e-02,-1.52e-01,-1.42e-01,4.74e-02,-3.72e-02,-1.60e-02,7.17e-02,1.00...",""""""
"""EGAN00001006266""","[-4.15e-02,-1.52e-01,-1.18e-01,2.08e-02,-2.92e-03,-8.27e-03,2.69e-02,2.11...",""""""
"""EGAN00001006267""","[-9.80e-03,-1.41e-01,-1.04e-01,1.87e-02,-2.07e-02,-2.39e-02,-2.85e-02,-2....",""""""
"""EGAN00001006268""","[-5.27e-03,-2.03e-01,-1.59e-01,3.84e-02,-4.07e-02,-1.50e-02,-2.04e-02,1.8...",""""""
"""EGAN00001006269""","[-1.50e-03,-1.73e-01,-1.41e-01,-1.80e-03,-4.64e-02,3.35e-02,3.41e-02,-7.9...",""""""


In [8]:
known_col="known_pop"
pc_cols=pca_scores.scores
pop_pc_pd = pca_scores.select(
            known_col, pca_scores=pc_cols).to_pandas()

In [9]:
pop_pc_pd.head()

Unnamed: 0,s,known_pop,pca_scores
0,EGAN00001006259,,"[0.27322528634700055, -0.18142160992318895, 0...."
1,EGAN00001006260,,"[0.254147008564085, -0.15015278089094658, -0.0..."
2,EGAN00001006261,,"[0.25928917301000326, -0.1694873940765352, -0...."
3,EGAN00001006263,,"[0.21922423765417157, -0.1581704760343723, -0...."
4,EGAN00001006264,,"[0.2362999720429752, -0.19872536196842433, -0...."


In [10]:
train_data = pop_pc_pd.loc[pop_pc_pd[known_col] != ""]
N = len(train_data)

In [11]:
train_data.head()

Unnamed: 0,s,known_pop,pca_scores
43944,UKB_1000054_0230795760,British,"[-0.10661136959441164, 0.05558966893670524, -0..."
43945,UKB_1000235_0229406923,British,"[-0.03287185047756297, 0.042144172215897654, 0..."
43946,UKB_1000372_0230791869,British,"[-0.0523382083178357, 0.03499287098774602, 0.0..."
43947,UKB_1000395_0230813918,British,"[-0.05857213594331682, 0.020423078180638876, 0..."
43948,UKB_1000538_0230722138,British,"[-0.07139132478362317, 0.031122850653172985, 0..."


In [12]:
print(N)

49716


In [None]:
train_data = pop_pc_pd.loc[~pop_pc_pd[known_col].isnull()]


In [7]:
pop_pc_pd , pop_clf = assign_population_pcs(
        pca_scores, pca_scores.scores, known_col="known_pop")

Random forest feature importances are as follows: [0.18323768 0.35043116 0.12051448 0.04447509 0.21799286 0.01712276
 0.01863069 0.01563075 0.01593255 0.016032  ]
Estimated error rate for RF model is 0.05289564985321593


INFO (__main__ 113): Found the following sample count after population assignment: : 43810, oth: 8950, British: 40447, Caribbean: 206, Indian: 178, Chinese: 66, African: 7, White and Black Caribbean: 6, Other ethnic group: 3, Any other white background: 1


FatalError: PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 273 in stage 3.0 failed 100 times, most recent failure: Lost task 273.99 in stage 3.0 (TID 3022, 192.168.226.138, executor 92): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1227)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1226)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:223)
	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1232)
	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1301)
	at is.hail.rvd.RVD$.coerce(RVD.scala:1257)
	at is.hail.rvd.RVD$.coerce(RVD.scala:1241)
	at is.hail.expr.ir.TableValue$.apply(TableValue.scala:39)
	at is.hail.utils.Py4jUtils$$anonfun$pyFromDF$1.apply(Py4jUtils.scala:159)
	at is.hail.utils.Py4jUtils$$anonfun$pyFromDF$1.apply(Py4jUtils.scala:158)
	at is.hail.utils.package$.using(package.scala:596)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:10)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:9)
	at is.hail.utils.package$.using(package.scala:596)
	at is.hail.annotations.Region$.scoped(Region.scala:18)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:9)
	at is.hail.utils.Py4jUtils$class.pyFromDF(Py4jUtils.scala:158)
	at is.hail.utils.package$.pyFromDF(package.scala:74)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1227)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1226)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.lang.Thread.run(Thread.java:834)




Hail version: 0.2.31-6060f9c971cc
Error summary: PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


In [5]:
pca_scores.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'scores': array<float64> 
----------------------------------------
Key: ['s']
----------------------------------------


In [6]:
pca_scores=pca_scores.annotate(known_pop=mt.cols()[pca_scores.s].known_pop)
 #mt = mt_vqc_filtered.annotate_cols(
 #       scores=pca_scores[mt_vqc_filtered.col_key].scores)

2020-07-23 12:26:57 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [7]:
pca_scores.show()

2020-07-23 12:27:14 Hail: INFO: Coerced sorted dataset


s,scores,known_pop
str,array<float64>,str
"""EGAN00001006259""","[2.73e-01,-1.81e-01,3.93e-03,-2.33e-03,-6.34e-02,5.34e-02,-1.50e-02,-9.84...",""""""
"""EGAN00001006260""","[2.54e-01,-1.50e-01,-4.74e-03,-1.48e-02,-4.04e-02,5.38e-02,-1.15e-02,-1.4...",""""""
"""EGAN00001006261""","[2.59e-01,-1.69e-01,-1.03e-02,-1.06e-02,-4.64e-02,4.43e-02,-2.10e-02,-3.5...",""""""
"""EGAN00001006263""","[2.19e-01,-1.58e-01,-1.80e-02,-1.78e-02,-8.02e-02,4.07e-02,6.73e-02,-1.93...",""""""
"""EGAN00001006264""","[2.36e-01,-1.99e-01,-2.98e-02,1.18e-02,-8.97e-02,2.23e-02,1.91e-02,2.21e-...",""""""
"""EGAN00001006265""","[-2.21e-02,-1.52e-01,-1.42e-01,4.74e-02,-3.72e-02,-1.60e-02,7.17e-02,1.00...",""""""
"""EGAN00001006266""","[-4.15e-02,-1.52e-01,-1.18e-01,2.08e-02,-2.92e-03,-8.27e-03,2.69e-02,2.11...",""""""
"""EGAN00001006267""","[-9.80e-03,-1.41e-01,-1.04e-01,1.87e-02,-2.07e-02,-2.39e-02,-2.85e-02,-2....",""""""
"""EGAN00001006268""","[-5.27e-03,-2.03e-01,-1.59e-01,3.84e-02,-4.07e-02,-1.50e-02,-2.04e-02,1.8...",""""""
"""EGAN00001006269""","[-1.50e-03,-1.73e-01,-1.41e-01,-1.80e-03,-4.64e-02,3.35e-02,3.41e-02,-7.9...",""""""


In [8]:
pca_scores.write(f"{temp_dir}/ddd-elgh-ukbb/pca_scores_known_pop.ht")

2020-07-23 12:27:53 Hail: INFO: Coerced sorted dataset
2020-07-23 12:27:57 Hail: INFO: wrote table with 93674 rows in 4 partitions to /opt/sanger.ac.uk/hgi/hail/tmp/ddd-elgh-ukbb/pca_scores_known_pop.ht


In [9]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'cohort': str
    'f_stat': float64
    'is_female': bool
    'sex': str
    'data_type': str
    'known_pop': str
    'gVCF': str
    'scores': array<float64>
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 


In [23]:
mt.cols()

<hail.table.Table at 0x7f806bf4fbe0>

In [10]:

population_assignment_table = assign_population_pcs(
        pca_scores,pca_scores.scores, known_col="known_pop")

2020-07-23 12:28:11 Hail: INFO: Coerced sorted dataset


Random forest feature importances are as follows: [0.18323768 0.35043116 0.12051448 0.04447509 0.21799286 0.01712276
 0.01863069 0.01563075 0.01593255 0.016032  ]
Estimated error rate for RF model is 0.05289564985321593


INFO (__main__ 113): Found the following sample count after population assignment: : 43810, oth: 8950, British: 40447, Caribbean: 206, Indian: 178, Chinese: 66, African: 7, White and Black Caribbean: 6, Other ethnic group: 3, Any other white background: 1


FatalError: PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 150 in stage 7.0 failed 100 times, most recent failure: Lost task 150.99 in stage 7.0 (TID 21068, 192.168.226.169, executor 114): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1227)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1226)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:223)
	at is.hail.rvd.RVD$.getKeyInfo(RVD.scala:1232)
	at is.hail.rvd.RVD$.makeCoercer(RVD.scala:1301)
	at is.hail.rvd.RVD$.coerce(RVD.scala:1257)
	at is.hail.rvd.RVD$.coerce(RVD.scala:1241)
	at is.hail.expr.ir.TableValue$.apply(TableValue.scala:39)
	at is.hail.utils.Py4jUtils$$anonfun$pyFromDF$1.apply(Py4jUtils.scala:159)
	at is.hail.utils.Py4jUtils$$anonfun$pyFromDF$1.apply(Py4jUtils.scala:158)
	at is.hail.utils.package$.using(package.scala:596)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:10)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:9)
	at is.hail.utils.package$.using(package.scala:596)
	at is.hail.annotations.Region$.scoped(Region.scala:18)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:9)
	at is.hail.utils.Py4jUtils$class.pyFromDF(Py4jUtils.scala:158)
	at is.hail.utils.package$.pyFromDF(package.scala:74)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1227)
	at is.hail.rvd.RVD$$anonfun$30.apply(RVD.scala:1226)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$cmapPartitionsWithIndex$1$$anonfun$apply$16.apply(ContextRDD.scala:281)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at is.hail.sparkextras.ContextRDD$$anonfun$run$1$$anonfun$apply$8.apply(ContextRDD.scala:219)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$13.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.lang.Thread.run(Thread.java:834)




Hail version: 0.2.31-6060f9c971cc
Error summary: PythonException: Traceback (most recent call last):
  File "/opt/sanger.ac.uk/hgi/spark-2.4.3-bin-netlib-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


In [9]:
mt=hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_cohorts_split.mt")
cohorts_pop=hl.import_table("s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb.tsv",delimiter="\t").key_by('s')


2020-07-22 06:20:28 Hail: INFO: Reading table with no type imputation
  Loading column 's' as type 'str' (type not specified)
  Loading column 'cohort' as type 'str' (type not specified)
  Loading column 'sample' as type 'str' (type not specified)
  Loading column 'gVCF' as type 'str' (type not specified)
  Loading column 'filename' as type 'str' (type not specified)
  Loading column 'gVCF_ID' as type 'str' (type not specified)
  Loading column 'eid' as type 'str' (type not specified)
  Loading column 'encoding' as type 'str' (type not specified)
  Loading column 'coding' as type 'str' (type not specified)
  Loading column 'known_population' as type 'str' (type not specified)



In [None]:
pca_scores = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_scores.ht")
pca_loadings = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_loadings.ht")

In [10]:
cohorts_pop.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'cohort': str 
    'sample': str 
    'gVCF': str 
    'filename': str 
    'gVCF_ID': str 
    'eid': str 
    'encoding': str 
    'coding': str 
    'known_population': str 
----------------------------------------
Key: ['s']
----------------------------------------


In [3]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'cohort': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        AS_BaseQRankSum: array<float64>, 
        AS_FS: array<float64>, 
        AS_InbreedingCoeff: array<float64>, 
        AS_MQ: array<float64>, 
        AS_MQRankSum: array<float64>, 
        AS_QD: array<float64>, 
        AS_ReadPosRankSum: array<float64>, 
        AS_SOR: array<float64>, 
        BaseQRankSum: float64, 
        DB: bool, 
        DP: int32, 
        DS: bool, 
        END: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
     

In [11]:
mt=mt.drop('cohort')

In [16]:
mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population)
mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)

In [19]:
mt=mt.checkpoint(f"{tmp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt", overwrite=True)

2020-07-22 06:32:00 Hail: INFO: Coerced sorted dataset
2020-07-22 06:58:04 Hail: INFO: wrote matrix table with 4278567 rows and 93674 columns in 21200 partitions to hdfs://spark-master:9820//ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt


In [18]:
mt.cols().show()

2020-07-22 06:25:58 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


KeyboardInterrupt: 

In [2]:
def pc_project(
    mt: hl.MatrixTable,
    loadings_ht: hl.Table,
    loading_location: str = "loadings",
    af_location: str = "pca_af",
) -> hl.Table:
    """
    Projects samples in `mt` on pre-computed PCs.

    :param mt: MT containing the samples to project
    :param loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA
    :param loading_location: Location of expression for loadings in `loadings_ht`
    :param af_location: Location of expression for allele frequency in `loadings_ht`
    :return: Table with scores calculated from loadings in column `scores`
    """

    n_variants = loadings_ht.count()

    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location],
    )

    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings)
        & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0)
        & (mt.pca_af < 1)
    )

    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af)
    )

    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))

    return mt.cols().select("scores")


In [3]:
project_mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
mt_1kg_chr1_chr20=hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_AKT_overlap.mt")


In [4]:
project_mt1 = project_mt.key_rows_by("locus")
mt_1kg = mt_1kg_chr1_chr20.filter_rows(
        hl.is_defined(project_mt1.rows()[mt_1kg_chr1_chr20.locus]))
#mt_1kg = mt_1kg.checkpoint(
#        f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_AKT_projectdata_overlap.mt", overwrite=True)

In [5]:
mt_1kg =hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_AKT_projectdata_overlap.mt")
mt_1kg.count()

(1217, 2504)

In [6]:
pca_evals, pca_scores, loadings_ht = hl.hwe_normalized_pca(
        mt_1kg.GT, k=10, compute_loadings=True)
mt_1kg = mt_1kg.annotate_rows(
af=hl.agg.mean(mt_1kg.GT.n_alt_alleles()) / 2)
loadings_ht = loadings_ht.annotate(af=mt_1kg.rows()[loadings_ht.key].af)

2020-07-16 08:29:29 Hail: INFO: hwe_normalized_pca: running PCA using 1217 variants.
2020-07-16 08:29:33 Hail: INFO: pca: running PCA with 10 components...
2020-07-16 08:29:50 Hail: INFO: Coerced sorted dataset


In [7]:
loadings_ht.show()

locus,alleles,loadings,af
locus<GRCh38>,array<str>,array<float64>,float64
chr1:965125,"[""G"",""C""]","[3.17e-02,4.00e-02,3.17e-02,4.92e-03,3.45e-03,-2.75e-03,-4.52e-02,-2.88e-...",0.169
chr1:1085966,"[""A"",""G""]","[-1.40e-02,9.96e-03,1.28e-02,-1.48e-02,9.26e-03,-6.52e-03,3.96e-02,5.28e-...",0.179
chr1:1185156,"[""G"",""A""]","[1.41e-02,-4.01e-02,-4.25e-03,-2.90e-02,4.75e-02,5.84e-03,-9.92e-03,-7.09...",0.0517
chr1:1268007,"[""G"",""A""]","[-5.42e-02,-2.64e-03,-7.26e-03,5.28e-03,-2.28e-03,-3.71e-02,5.73e-02,3.27...",0.0671
chr1:1290851,"[""G"",""A""]","[-2.39e-02,-1.81e-02,-1.14e-02,2.25e-02,1.04e-03,9.58e-03,-3.93e-02,1.26e...",0.0707
chr1:1426175,"[""T"",""C""]","[-6.68e-02,-1.26e-02,-1.90e-03,1.19e-02,2.65e-02,-4.35e-02,9.08e-03,-1.13...",0.207
chr1:1522693,"[""A"",""G""]","[1.87e-02,5.69e-02,-5.82e-02,-2.92e-02,3.50e-03,-1.56e-02,-2.64e-02,-3.57...",0.0831
chr1:1533909,"[""G"",""C""]","[-4.89e-02,-4.17e-02,2.73e-03,3.62e-03,-3.23e-03,-2.96e-02,1.66e-02,2.66e...",0.494
chr1:1627156,"[""G"",""A""]","[3.35e-02,-5.75e-03,-8.35e-03,9.85e-02,-2.35e-02,-2.48e-02,-1.40e-02,1.63...",0.141
chr1:1955424,"[""T"",""C""]","[-6.55e-02,-1.28e-02,4.75e-02,-8.75e-05,-1.39e-02,-1.81e-02,6.22e-03,-2.2...",0.193


In [8]:
ht = pc_project(
        project_mt, loadings_ht, "loadings", "af")


2020-07-16 08:29:58 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [9]:
#ht.write(f"{tmp_dir}/ddd-elgh-ukbb/ourproject_1kg_AKT_scores.ht", overwrite=True)
mt = project_mt.annotate_cols(scores=ht[project_mt.s].scores)
mt=mt.write(f"{tmp_dir}/ddd-elgh-ukbb/ourproject_1kg_AKT_projection.mt", overwrite=True)
#ht.export(f"{temp_dir}/ddd-elgh-ukbb/ourproject_1kg_AKT.txt.bgz")

2020-07-16 08:38:53 Hail: INFO: wrote matrix table with 24291 rows and 93674 columns in 21200 partitions to hdfs://spark-master:9820//ddd-elgh-ukbb/ourproject_1kg_AKT_projection.mt


In [54]:
mt=hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/ourproject_1kg_AKT_projection.mt")

p1=hl.plot.scatter(mt.scores[1], mt.scores[2], xlabel="PC1", ylabel="PC2", label=mt.cohort)
p2=hl.plot.scatter(mt.scores[0], mt.scores[2], xlabel="PC1", ylabel="PC3", label=mt.cohort)
p3=hl.plot.scatter(mt.scores[0], mt.scores[3], xlabel="PC1", ylabel="PC4", label=mt.cohort)
p4=hl.plot.scatter(mt.scores[0], mt.scores[4], xlabel="PC1", ylabel="PC5", label=mt.cohort)
p5=hl.plot.scatter(mt.scores[1], mt.scores[3], xlabel="PC2", ylabel="PC4", label=mt.cohort)
p6=hl.plot.scatter(mt.scores[4], mt.scores[9], xlabel="PC5", ylabel="PC10", label=mt.cohort)
p7=hl.plot.scatter(mt.scores[6], mt.scores[8], xlabel="PC7", ylabel="PC9", label=mt.cohort)

show(bokeh.layouts.gridplot([[p1],[p2],[p3],[p4],[p5],[p6],[p7]]))

In [17]:
loadings_ht.show()

locus,alleles,loadings,af
locus<GRCh38>,array<str>,array<float64>,float64
chr1:965125,"[""G"",""C""]","[3.17e-02,4.00e-02,3.17e-02,4.92e-03,3.45e-03,-2.75e-03,-4.52e-02,-2.88e-...",0.169
chr1:1085966,"[""A"",""G""]","[-1.40e-02,9.96e-03,1.28e-02,-1.48e-02,9.26e-03,-6.52e-03,3.96e-02,5.28e-...",0.179
chr1:1185156,"[""G"",""A""]","[1.41e-02,-4.01e-02,-4.25e-03,-2.90e-02,4.75e-02,5.84e-03,-9.92e-03,-7.09...",0.0517
chr1:1268007,"[""G"",""A""]","[-5.42e-02,-2.64e-03,-7.26e-03,5.28e-03,-2.28e-03,-3.71e-02,5.73e-02,3.27...",0.0671
chr1:1290851,"[""G"",""A""]","[-2.39e-02,-1.81e-02,-1.14e-02,2.25e-02,1.04e-03,9.58e-03,-3.93e-02,1.26e...",0.0707
chr1:1426175,"[""T"",""C""]","[-6.68e-02,-1.26e-02,-1.90e-03,1.19e-02,2.65e-02,-4.35e-02,9.08e-03,-1.13...",0.207
chr1:1522693,"[""A"",""G""]","[1.87e-02,5.69e-02,-5.82e-02,-2.92e-02,3.50e-03,-1.56e-02,-2.64e-02,-3.57...",0.0831
chr1:1533909,"[""G"",""C""]","[-4.89e-02,-4.17e-02,2.73e-03,3.62e-03,-3.23e-03,-2.96e-02,1.66e-02,2.66e...",0.494
chr1:1627156,"[""G"",""A""]","[3.35e-02,-5.75e-03,-8.35e-03,9.85e-02,-2.35e-02,-2.48e-02,-1.40e-02,1.63...",0.141
chr1:1955424,"[""T"",""C""]","[-6.55e-02,-1.28e-02,4.75e-02,-8.75e-05,-1.39e-02,-1.81e-02,6.22e-03,-2.2...",0.193


In [18]:
mt_1kg =hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_AKT_projectdata_overlap.mt")
mt_1kg.count()

(1217, 2504)

In [33]:
#p6=hl.plot.scatter(mt_1kg_annotated.scores[0], mt_1kg_annotated.scores[1], xlabel="PC1", ylabel="PC2")
#show(p6)
pca_scores_tb=hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/1K_AKT_data_pca_scores.ht")
pca_scores_tb.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'scores': array<float64> 
----------------------------------------
Key: ['s']
----------------------------------------


In [34]:
#read and annotate
annotations=hl.import_table(f"{temp_dir}/ddd-elgh-ukbb/1kg_annotations.txt", delimiter="\t").key_by('Sample')


2020-07-16 09:36:04 Hail: INFO: Reading table with no type imputation
  Loading column 'Sample' as type 'str' (type not specified)
  Loading column 'Population' as type 'str' (type not specified)
  Loading column 'SuperPopulation' as type 'str' (type not specified)



In [37]:
pca_scores_tb1=pca_scores_tb.annotate(Population=annotations[pca_scores_tb.s].Population)
pca_scores_tb2=pca_scores_tb1.annotate(SuperPopulation=annotations[pca_scores_tb1.s].SuperPopulation)
pca_scores_tb=pca_scores_tb2
#mt = project_mt.annotate_cols(scores=ht[project_mt.s].scores)


In [45]:
pca_scores_tb.write(f"{tmp_dir}/ddd-elgh-ukbb/1kg_pca_scores_annotated.ht", overwrite=True)
pca_scores_tb.export(f"{temp_dir}/ddd-elgh-ukbb/1kg_pca_scores_annotated.tsv")

2020-07-16 10:06:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:06:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:06:37 Hail: INFO: wrote table with 2504 rows in 2 partitions to hdfs://spark-master:9820//ddd-elgh-ukbb/1kg_pca_scores_annotated.ht
2020-07-16 10:06:37 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:06:37 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:06:42 Hail: INFO: merging 2 files totalling 552.0K...
2020-07-16 10:06:42 Hail: INFO: while writing:
    /opt/sanger.ac.uk/hgi/hail/tmp/ddd-elgh-ukbb/1kg_pca_scores_annotated.tsv
  merge time: 13.324ms


In [55]:
p1=hl.plot.scatter(pca_scores_tb.scores[0], pca_scores_tb.scores[1], xlabel="PC1", ylabel="PC2", label=pca_scores_tb.SuperPopulation)
p2=hl.plot.scatter(pca_scores_tb.scores[0], pca_scores_tb.scores[2], xlabel="PC1", ylabel="PC3", label=pca_scores_tb.SuperPopulation)
p3=hl.plot.scatter(pca_scores_tb.scores[0], pca_scores_tb.scores[3], xlabel="PC1", ylabel="PC4", label=pca_scores_tb.SuperPopulation)
p4=hl.plot.scatter(pca_scores_tb.scores[0], pca_scores_tb.scores[4], xlabel="PC1", ylabel="PC5", label=pca_scores_tb.SuperPopulation)
p5=hl.plot.scatter(pca_scores_tb.scores[1], pca_scores_tb.scores[3], xlabel="PC2", ylabel="PC4", label=pca_scores_tb.SuperPopulation)
p6=hl.plot.scatter(pca_scores_tb.scores[4], pca_scores_tb.scores[9], xlabel="PC5", ylabel="PC10", label=pca_scores_tb.SuperPopulation)
p7=hl.plot.scatter(pca_scores_tb.scores[6], pca_scores_tb.scores[8], xlabel="PC7", ylabel="PC9", label=pca_scores_tb.SuperPopulation)
show(bokeh.layouts.gridplot([[p1],[p2],[p3],[p4],[p5],[p6],[p7]]))

2020-07-16 10:28:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:16 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:20 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-16 10:28:38 Hail: INFO: Ordering unsorted dataset with network shuffle


In [46]:
mt=hl.read_matrix_table(f"{temp_dir}/ddd-elgh-ukbb/ourproject_1kg_AKT_projection.mt")


In [47]:
mtcols=mt.cols()

In [48]:
mtcols.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    's': str 
    'cohort': str 
    'f_stat': float64 
    'is_female': bool 
    'sex': str 
    'data_type': str 
    'scores': array<float64> 
----------------------------------------
Key: ['s']
----------------------------------------


In [51]:
mtcols.export(f"{temp_dir}/ddd-elgh-ukbb/megaWES_scores.tsv")

2020-07-16 10:10:57 Hail: INFO: Coerced sorted dataset
2020-07-16 10:11:01 Hail: INFO: merging 2 files totalling 23.7M...
2020-07-16 10:11:01 Hail: INFO: while writing:
    /opt/sanger.ac.uk/hgi/hail/tmp/ddd-elgh-ukbb/megaWES_scores.tsv
  merge time: 83.182ms
