In [1]:
import pyspark
import glow

In [2]:
builder = pyspark.sql.SparkSession.builder.appName("GlowVCFExplore") \
    .config("spark.jars.packages", "io.projectglow:glow-spark3_2.12:1.2.1") \
    .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec")

In [3]:
spark = builder.getOrCreate()

In [4]:
spark = glow.register(spark)

In [None]:
spark

In [None]:
# Page Break

# Explore bcBio Somatic VCF

In [5]:
bcbio_somatic_src = "./data/bcbio_giab_somatic/na12878-na24385-somatic-hg38-truth.vcf.gz"

In [None]:
bcbio_somatic_df = spark.read.format("vcf").load(bcbio_somatic_src)

In [7]:
bcbio_somatic_df.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_platformnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetwithotheruniqgenopassing: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_AC: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_FREQ: array (nullable = true)
 |    |-- element: string (containsNull = true)


In [None]:
bcbio_somatic_df.createOrReplaceTempView("bcbio_somatic_table")

In [9]:
spark.sql("describe bcbio_somatic_table").show(n=1000, truncate=True)

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          contigName|              string|   null|
|               start|              bigint|   null|
|                 end|              bigint|   null|
|               names|       array<string>|   null|
|     referenceAllele|              string|   null|
|    alternateAlleles|       array<string>|   null|
|                qual|              double|   null|
|             filters|       array<string>|   null|
|splitFromMultiAll...|             boolean|   null|
|  INFO_platformnames|       array<string>|   null|
|INFO_callsetwitho...|       array<string>|   null|
|   INFO_callsetnames|       array<string>|   null|
|             INFO_AC|          array<int>|   null|
|           INFO_FREQ|       array<string>|   null|
|        INFO_varType|              string|   null|
|          INFO_DPSum|                 int|   null|
|INFO_datase

In [10]:
spark.sql("select count(1) as number_of_variants from bcbio_somatic_table").show()

                                                                                

+------------------+
|number_of_variants|
+------------------+
|           1082945|
+------------------+



In [11]:
spark.sql("select distinct contigName from bcbio_somatic_table order by contigName").show(46)



+----------+
|contigName|
+----------+
|      chr1|
|     chr10|
|     chr11|
|     chr12|
|     chr13|
|     chr14|
|     chr15|
|     chr16|
|     chr17|
|     chr18|
|     chr19|
|      chr2|
|     chr20|
|     chr21|
|     chr22|
|      chr3|
|      chr4|
|      chr5|
|      chr6|
|      chr7|
|      chr8|
|      chr9|
+----------+



                                                                                

In [12]:
spark.sql("select contigName, start, end from bcbio_somatic_table").show()

+----------+-------+-------+
|contigName|  start|    end|
+----------+-------+-------+
|      chr1| 852046| 852047|
|      chr1| 971789| 971791|
|      chr1| 974038| 974039|
|      chr1| 975013| 975014|
|      chr1|1004110|1004112|
|      chr1|1004624|1004625|
|      chr1|1005428|1005429|
|      chr1|1005903|1005904|
|      chr1|1005953|1005954|
|      chr1|1006158|1006159|
|      chr1|1008306|1008307|
|      chr1|1009183|1009184|
|      chr1|1013548|1013556|
|      chr1|1014227|1014228|
|      chr1|1014862|1014863|
|      chr1|1015924|1015925|
|      chr1|1016183|1016184|
|      chr1|1016622|1016623|
|      chr1|1017113|1017114|
|      chr1|1034779|1034780|
+----------+-------+-------+
only showing top 20 rows





In [13]:
spark.sql("select contigName, count(end) as num_of_pos from bcbio_somatic_table group by contigName order by num_of_pos desc").show(46)

                                                                                

+----------+----------+
|contigName|num_of_pos|
+----------+----------+
|      chr2|     93239|
|      chr1|     91664|
|      chr3|     83863|
|      chr6|     77849|
|      chr5|     71727|
|      chr4|     67549|
|      chr7|     63553|
|      chr8|     55448|
|     chr10|     54948|
|     chr11|     53769|
|     chr12|     52404|
|      chr9|     50528|
|     chr13|     42746|
|     chr14|     37538|
|     chr15|     30810|
|     chr17|     29174|
|     chr18|     27555|
|     chr20|     26771|
|     chr19|     20911|
|     chr16|     20594|
|     chr21|     16145|
|     chr22|     14160|
+----------+----------+



In [14]:
spark.sql("select referenceAllele, alternateAlleles, array_size(alternateAlleles) from bcbio_somatic_table").show()

+---------------+----------------+----------------------------+
|referenceAllele|alternateAlleles|array_size(alternateAlleles)|
+---------------+----------------+----------------------------+
|              C|             [T]|                           1|
|             AG|             [A]|                           1|
|              C|             [T]|                           1|
|              C|             [T]|                           1|
|             TG|             [T]|                           1|
|              A|             [G]|                           1|
|              C|            [CA]|                           1|
|              C|             [T]|                           1|
|              G|             [A]|                           1|
|              C|             [T]|                           1|
|              G|             [C]|                           1|
|              T|             [C]|                           1|
|       GGCCCACA|             [G]|      



In [15]:
spark.sql("select referenceAllele, alternateAlleles, count(*) as num_of_snps \
from bcbio_somatic_table \
where \
    char_length(referenceAllele) = 1 and \
    array_size(alternateAlleles) = 1 and \
    char_length(alternateAlleles[0]) = 1 \
    group by referenceAllele, alternateAlleles \
    order by num_of_snps desc").show()



+---------------+----------------+-----------+
|referenceAllele|alternateAlleles|num_of_snps|
+---------------+----------------+-----------+
|              G|             [A]|     179649|
|              C|             [T]|     179274|
|              T|             [C]|     145030|
|              A|             [G]|     144496|
|              G|             [T]|      42193|
|              C|             [A]|      41828|
|              C|             [G]|      40728|
|              G|             [C]|      40494|
|              A|             [C]|      36837|
|              T|             [G]|      36177|
|              T|             [A]|      33016|
|              A|             [T]|      32659|
+---------------+----------------+-----------+



                                                                                

In [16]:
spark.sql("select referenceAllele, alternateAlleles, count(*) as num_of_deletions \
from bcbio_somatic_table \
where \
    char_length(referenceAllele) > 1 and \
    array_size(alternateAlleles) = 1 and \
    char_length(alternateAlleles[0]) = 1 \
    group by referenceAllele, alternateAlleles \
    order by num_of_deletions desc").show()



+---------------+----------------+----------------+
|referenceAllele|alternateAlleles|num_of_deletions|
+---------------+----------------+----------------+
|             AT|             [A]|            5802|
|             CT|             [C]|            5631|
|             CA|             [C]|            5300|
|             TA|             [T]|            5080|
|             GA|             [G]|            3949|
|             GT|             [G]|            2947|
|             TG|             [T]|            1759|
|             AG|             [A]|            1563|
|             AC|             [A]|            1474|
|             TC|             [T]|            1324|
|            CAT|             [C]|             939|
|            CTT|             [C]|             862|
|             GC|             [G]|             861|
|            CAA|             [C]|             691|
|            CAG|             [C]|             567|
|            TAA|             [T]|             521|
|           

                                                                                

In [17]:
spark.sql("select referenceAllele, alternateAlleles, count(*) as num_of_insertions \
from bcbio_somatic_table \
where \
    char_length(referenceAllele) = 1 and \
    array_size(alternateAlleles) > 1 \
    group by referenceAllele, alternateAlleles \
    order by num_of_insertions desc").show()



+---------------+----------------+-----------------+
|referenceAllele|alternateAlleles|num_of_insertions|
+---------------+----------------+-----------------+
|              C|       [CA, CAA]|               49|
|              A|       [AT, ATT]|               47|
|              C|       [CT, CTT]|               38|
|              T|       [TA, TAA]|               33|
|              T|    [TAC, TACAC]|               24|
|              A|         [AT, T]|               23|
|              T|         [A, TA]|               22|
|              G|       [GT, GTT]|               21|
|              G|       [GA, GAA]|               20|
|              C|          [G, T]|               14|
|              C|     [CAA, CAAA]|               13|
|              C|      [CA, CAAA]|               13|
|              C|          [A, T]|               11|
|              G|          [A, C]|               10|
|              C|          [A, G]|               10|
|              C|     [CTT, CTTT]|            

                                                                                

In [18]:
spark.sql("select count(1) as `A>C` from bcbio_somatic_table \
where \
    referenceAllele = 'A' and \
    alternateAlleles[0] = 'C' and \
    array_size(alternateAlleles) = 1").show()



+-----+
|  A>C|
+-----+
|36837|
+-----+



                                                                                

In [19]:
spark.sql("select count(1) as `A>AT` from bcbio_somatic_table \
where \
    referenceAllele = 'A' and \
    alternateAlleles[0] = 'AT' and \
    array_size(alternateAlleles) = 2").show()



+----+
|A>AT|
+----+
|  84|
+----+



                                                                                

In [20]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId, genotypes.alleleDepths \
from bcbio_somatic_table").show(truncate=False)

+----------+-------+-------+---------------+----------------+---------+------------+
|contigName|start  |end    |referenceAllele|alternateAlleles|sampleId |alleleDepths|
+----------+-------+-------+---------------+----------------+---------+------------+
|chr1      |852046 |852047 |C              |[T]             |[NA12878]|[[133, 139]]|
|chr1      |971789 |971791 |AG             |[A]             |[NA12878]|[[192, 148]]|
|chr1      |974038 |974039 |C              |[T]             |[NA12878]|[[143, 129]]|
|chr1      |975013 |975014 |C              |[T]             |[NA12878]|[[103, 100]]|
|chr1      |1004110|1004112|TG             |[T]             |[NA12878]|[[152, 141]]|
|chr1      |1004624|1004625|A              |[G]             |[NA12878]|[[129, 119]]|
|chr1      |1005428|1005429|C              |[CA]            |[NA12878]|[[158, 135]]|
|chr1      |1005903|1005904|C              |[T]             |[NA12878]|[[131, 121]]|
|chr1      |1005953|1005954|G              |[A]             |[NA1



In [21]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId from bcbio_somatic_table \
where contigName = 'chr1' and end = 852047").show(truncate=False)



+----------+------+------+---------------+----------------+---------+
|contigName|start |end   |referenceAllele|alternateAlleles|sampleId |
+----------+------+------+---------------+----------------+---------+
|chr1      |852046|852047|C              |[T]             |[NA12878]|
+----------+------+------+---------------+----------------+---------+



                                                                                

In [22]:
spark.sql("select contigName, start, end, referenceAllele, alternateAlleles, genotypes.sampleId[0] from bcbio_somatic_table \
where genotypes.sampleId[0] = 'NA12878'").show(truncate=False)

+----------+-------+-------+---------------+----------------+---------------------+
|contigName|start  |end    |referenceAllele|alternateAlleles|genotypes.sampleId[0]|
+----------+-------+-------+---------------+----------------+---------------------+
|chr1      |852046 |852047 |C              |[T]             |NA12878              |
|chr1      |971789 |971791 |AG             |[A]             |NA12878              |
|chr1      |974038 |974039 |C              |[T]             |NA12878              |
|chr1      |975013 |975014 |C              |[T]             |NA12878              |
|chr1      |1004110|1004112|TG             |[T]             |NA12878              |
|chr1      |1004624|1004625|A              |[G]             |NA12878              |
|chr1      |1005428|1005429|C              |[CA]            |NA12878              |
|chr1      |1005903|1005904|C              |[T]             |NA12878              |
|chr1      |1005953|1005954|G              |[A]             |NA12878        



In [None]:
# Page Break

# Stop Spark Session

In [23]:
spark.stop()

In [None]:
# Continue to next notebook