In [1]:
import pyspark
import glow

In [2]:
builder = pyspark.sql.SparkSession.builder.appName("GlowVCFExplore") \
    .config("spark.jars.packages", "io.projectglow:glow-spark3_2.12:1.2.1") \
    .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec")

In [3]:
spark = builder.getOrCreate()

In [4]:
spark = glow.register(spark)

In [None]:
spark

In [None]:
# Page Break

# Explore bcBio Germline VCF

In [5]:
bcbio_germline_src = "./data/bcbio_giab_somatic/na12878-na24385-germline-hg38-truth.vcf.gz"

In [None]:
bcbio_germline_df = spark.read.format("vcf").load(bcbio_germline_src)

In [7]:
bcbio_germline_df.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_platformnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetwithotheruniqgenopassing: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_AC: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_varType: string (nullable = true)
 |-- INFO_DPSum: integer (nullable = true)
 

In [None]:
bcbio_germline_df.createOrReplaceTempView("bcbio_germline_table")

In [9]:
spark.sql("describe bcbio_germline_table").show(n=1000, truncate=True)

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          contigName|              string|   null|
|               start|              bigint|   null|
|                 end|              bigint|   null|
|               names|       array<string>|   null|
|     referenceAllele|              string|   null|
|    alternateAlleles|       array<string>|   null|
|                qual|              double|   null|
|             filters|       array<string>|   null|
|splitFromMultiAll...|             boolean|   null|
|  INFO_platformnames|       array<string>|   null|
|INFO_callsetwitho...|       array<string>|   null|
|   INFO_callsetnames|       array<string>|   null|
|             INFO_AC|          array<int>|   null|
|        INFO_varType|              string|   null|
|          INFO_DPSum|                 int|   null|
|INFO_datasetsmiss...|       array<string>|   null|
|           

In [10]:
spark.sql("select count(1) as number_of_variants from bcbio_germline_table").show()



+------------------+
|number_of_variants|
+------------------+
|           2123774|
+------------------+



                                                                                

In [11]:
spark.sql("select distinct contigName from bcbio_germline_table order by contigName").show(46)



+----------+
|contigName|
+----------+
|      chr1|
|     chr10|
|     chr11|
|     chr12|
|     chr13|
|     chr14|
|     chr15|
|     chr16|
|     chr17|
|     chr18|
|     chr19|
|      chr2|
|     chr20|
|     chr21|
|     chr22|
|      chr3|
|      chr4|
|      chr5|
|      chr6|
|      chr7|
|      chr8|
|      chr9|
+----------+



                                                                                

In [12]:
spark.sql("select contigName, start, end from bcbio_germline_table").show()

+----------+------+------+
|contigName| start|   end|
+----------+------+------+
|      chr1|817185|817186|
|      chr1|817340|817341|
|      chr1|817888|817889|
|      chr1|818801|818802|
|      chr1|818811|818812|
|      chr1|818953|818954|
|      chr1|819122|819123|
|      chr1|819583|819584|
|      chr1|824319|824320|
|      chr1|824456|824457|
|      chr1|825531|825532|
|      chr1|825766|825767|
|      chr1|826576|826577|
|      chr1|826892|826893|
|      chr1|827208|827209|
|      chr1|827211|827212|
|      chr1|827220|827221|
|      chr1|827251|827252|
|      chr1|828013|828014|
|      chr1|830724|830725|
+----------+------+------+
only showing top 20 rows





In [13]:
spark.sql("select contigName, count(end) as num_of_pos from bcbio_germline_table group by contigName order by num_of_pos desc").show(46)



+----------+----------+
|contigName|num_of_pos|
+----------+----------+
|      chr2|    178117|
|      chr1|    176212|
|      chr3|    159225|
|      chr4|    146225|
|      chr6|    142792|
|      chr5|    132555|
|      chr7|    122488|
|     chr11|    118991|
|     chr10|    114638|
|      chr8|    109925|
|     chr12|    102157|
|      chr9|     97462|
|     chr13|     94342|
|     chr14|     74504|
|     chr15|     65969|
|     chr17|     54231|
|     chr18|     53338|
|     chr20|     45056|
|     chr19|     40865|
|     chr16|     40249|
|     chr21|     31506|
|     chr22|     22927|
+----------+----------+



                                                                                

In [14]:
spark.sql("select referenceAllele, alternateAlleles, array_size(alternateAlleles) from bcbio_germline_table").show()

+---------------+----------------+----------------------------+
|referenceAllele|alternateAlleles|array_size(alternateAlleles)|
+---------------+----------------+----------------------------+
|              G|             [A]|                           1|
|              A|             [G]|                           1|
|              C|             [G]|                           1|
|              A|             [G]|                           1|
|              A|             [G]|                           1|
|              T|             [C]|                           1|
|              G|             [A]|                           1|
|              C|             [T]|                           1|
|              T|             [C]|                           1|
|              T|             [A]|                           1|
|              C|             [T]|                           1|
|              T|             [C]|                           1|
|              A|            [AT]|      



In [15]:
spark.sql("select referenceAllele, alternateAlleles, count(*) as num_of_snps \
from bcbio_germline_table \
where \
    char_length(referenceAllele) = 1 and \
    array_size(alternateAlleles) = 1 and \
    char_length(alternateAlleles[0]) = 1 \
    group by referenceAllele, alternateAlleles \
    order by num_of_snps desc").show()



+---------------+----------------+-----------+
|referenceAllele|alternateAlleles|num_of_snps|
+---------------+----------------+-----------+
|              T|             [C]|     318495|
|              A|             [G]|     317816|
|              C|             [T]|     305162|
|              G|             [A]|     304371|
|              G|             [C]|      78709|
|              C|             [G]|      78244|
|              A|             [C]|      77564|
|              T|             [G]|      77273|
|              G|             [T]|      75411|
|              C|             [A]|      75199|
|              T|             [A]|      63624|
|              A|             [T]|      63543|
+---------------+----------------+-----------+



                                                                                

In [16]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId, genotypes.alleleDepths \
from bcbio_germline_table").show(truncate=False)

+----------+------+------+---------------+----------------+---------+------------+
|contigName|start |end   |referenceAllele|alternateAlleles|sampleId |alleleDepths|
+----------+------+------+---------------+----------------+---------+------------+
|chr1      |817185|817186|G              |[A]             |[NA12878]|[[78, 454]] |
|chr1      |817340|817341|A              |[G]             |[NA12878]|[[107, 342]]|
|chr1      |817888|817889|C              |[G]             |[NA12878]|[[74, 220]] |
|chr1      |818801|818802|A              |[G]             |[NA12878]|[[0, 202]]  |
|chr1      |818811|818812|A              |[G]             |[NA12878]|[[0, 190]]  |
|chr1      |818953|818954|T              |[C]             |[NA12878]|[[0, 246]]  |
|chr1      |819122|819123|G              |[A]             |[NA12878]|[[110, 363]]|
|chr1      |819583|819584|C              |[T]             |[NA12878]|[[91, 378]] |
|chr1      |824319|824320|T              |[C]             |[NA12878]|[[92, 391]] |
|chr



In [17]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId from bcbio_germline_table \
where contigName = 'chr1' and end = 817186").show(truncate=False)



+----------+------+------+---------------+----------------+---------+
|contigName|start |end   |referenceAllele|alternateAlleles|sampleId |
+----------+------+------+---------------+----------------+---------+
|chr1      |817185|817186|G              |[A]             |[NA12878]|
+----------+------+------+---------------+----------------+---------+



                                                                                

In [18]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId[0] from bcbio_germline_table \
where genotypes.sampleId[0] = 'NA12878'").show(truncate=False)

+----------+------+------+---------------+----------------+---------------------+
|contigName|start |end   |referenceAllele|alternateAlleles|genotypes.sampleId[0]|
+----------+------+------+---------------+----------------+---------------------+
|chr1      |817185|817186|G              |[A]             |NA12878              |
|chr1      |817340|817341|A              |[G]             |NA12878              |
|chr1      |817888|817889|C              |[G]             |NA12878              |
|chr1      |818801|818802|A              |[G]             |NA12878              |
|chr1      |818811|818812|A              |[G]             |NA12878              |
|chr1      |818953|818954|T              |[C]             |NA12878              |
|chr1      |819122|819123|G              |[A]             |NA12878              |
|chr1      |819583|819584|C              |[T]             |NA12878              |
|chr1      |824319|824320|T              |[C]             |NA12878              |
|chr1      |8244



In [None]:
# Page Break

# Stop Spark Session

In [19]:
spark.stop()

In [None]:
# Continue to next notebook