In [0]:
# import pyspark
# import glow

In [0]:
# builder = pyspark.sql.SparkSession.builder.appName("GlowVCFExplore") \
#     .config("spark.jars.packages", "io.projectglow:glow-spark3_2.12:1.2.1") \
#     .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec")

In [0]:
# spark = builder.getOrCreate()

In [0]:
# spark = glow.register(spark)

In [0]:
# spark

In [0]:
# Page Break

# Explore both bcbio Somatic and Germline VCFs

In [0]:
# Diff the two schema text files: the somatic schema has 2 additional columns,
# `INFO_FREQ` and `INFO_SOMTYPE`, that the germline schema does not have.
!diff schema_bcbio_giab_somatic.txt schema_bcbio_giab_germline.txt #won't work for files in repo

In [0]:
%fs
ls dbfs:/tmp/dannywong/

In [0]:
# This time we point at the whole data directory with a glob, so every
# matching `*.vcf.gz` file is picked up by a single load.
# Previous local-path variant, kept for reference:
# bcbio_src = "./data/bcbio_giab_somatic/*.vcf.gz"
bcbio_src = "dbfs:/tmp/dannywong/*.vcf.gz"

In [0]:
# Load every file matched by `bcbio_src` into one DataFrame via the "vcf" data source.
vcf_reader = spark.read.format("vcf")
bcbio_df = vcf_reader.load(bcbio_src)

In [0]:
# Print the schema of the loaded DataFrame. Observe that Spark has merged the
# schemas, i.e. the result is the union of the columns from both VCF headers.
bcbio_df.printSchema()

In [0]:
# Register the DataFrame as the temp view `vcf_table` so the cells below can query it with SQL.
bcbio_df.createOrReplaceTempView("vcf_table")

In [0]:
# Describe the view's columns; print up to 1000 rows with long cell values truncated.
table_description_df = spark.sql("describe vcf_table")
table_description_df.show(n=1000, truncate=True)

In [0]:
# Both VCFs were loaded into the same table, so this count covers the
# variants from both files together.
variant_count_sql = "select count(1) as number_of_variants from vcf_table"
spark.sql(variant_count_sql).show()

In [0]:
# List the distinct contig names in sorted order, printing at most 46 rows.
contig_names_df = spark.sql("select distinct contigName from vcf_table order by contigName")
contig_names_df.show(46)

In [0]:
# Peek at the positional columns (contig, start, end) of the first rows.
positions_df = spark.sql("select contigName, start, end from vcf_table")
positions_df.show()

In [0]:
# Count records per contig, largest first, printing at most 46 rows.
positions_per_contig_sql = "select contigName, count(end) as num_of_pos from vcf_table group by contigName order by num_of_pos desc"
positions_per_contig_df = spark.sql(positions_per_contig_sql)
positions_per_contig_df.show(46)

In [0]:
# Show each record's reference allele, its alternate alleles, and the number of alternates.
# `size` is used instead of `array_size`: `array_size` was only added in Spark 3.3.0,
# whereas `size` is available on every Spark 3.x release, so the query stays portable.
# NOTE(review): for a NULL array `size` may report -1 where `array_size` reports NULL;
# this assumes `alternateAlleles` is never NULL -- confirm against the data.
spark.sql("select referenceAllele, alternateAlleles, size(alternateAlleles) from vcf_table").show()

In [0]:
# Count records per (reference allele, alternate alleles) pair, most frequent first,
# keeping only rows where the reference is a single character and there is exactly
# one alternate allele that is also a single character.
# `size` replaces `array_size` (added only in Spark 3.3.0) so the query also runs on
# earlier Spark 3.x releases; the `= 1` filter selects the same rows either way.
spark.sql("select referenceAllele, alternateAlleles, count(*) as num_of_snps \
from vcf_table \
where \
    char_length(referenceAllele) = 1 and \
    size(alternateAlleles) = 1 and \
    char_length(alternateAlleles[0]) = 1 \
    group by referenceAllele, alternateAlleles \
    order by num_of_snps desc").show()

In [0]:
# Show position and alleles alongside the `sampleId` and `alleleDepths`
# fields of `genotypes`, without truncating wide cell values.
allele_depths_sql = (
    "select contigName, start, end, referenceAllele, alternateAlleles, "
    "genotypes.sampleId, genotypes.alleleDepths "
    "from vcf_table"
)
spark.sql(allele_depths_sql).show(truncate=False)

In [0]:
# Look up the records on contig chr1 whose `end` is 817186 and show their sample IDs.
position_lookup_sql = (
    "select contigName, start, end, referenceAllele, alternateAlleles, "
    "genotypes.sampleId from vcf_table "
    "where contigName = 'chr1' and end = 817186"
)
position_lookup_df = spark.sql(position_lookup_sql)
position_lookup_df.show(truncate=False)

In [0]:
# Keep only the records whose first genotype entry has sample ID 'NA12878'.
sample_filter_sql = (
    "select contigName, start, end, referenceAllele, alternateAlleles, "
    "genotypes.sampleId[0] from vcf_table "
    "where genotypes.sampleId[0] = 'NA12878'"
)
spark.sql(sample_filter_sql).show(truncate=False)

In [0]:
# Page Break

# Filtering Somatic or Germline

* Single merged `vcf_table` is great!
* However, can I still filter — say, to query Somatic records only, or vice versa?
* Recall that the `INFO_SOMTYPE` and `INFO_FREQ` columns are only present in the Somatic VCF header.
* Hence, we can use whether either column is `NULL` as an approximate filter to tell the two record types apart.
* Let's try an example with the `INFO_SOMTYPE` column.

In [0]:
# Baseline: the total number of records in the merged table.
total_count_sql = "select count(1) as number_of_variants from vcf_table"
total_count_df = spark.sql(total_count_sql)
total_count_df.show()

In [0]:
# Somatic-only records: rows where `INFO_SOMTYPE` has a value.
somatic_count_sql = "select count(1) as number_of_variants_somatic from vcf_table where INFO_SOMTYPE is not null"
spark.sql(somatic_count_sql).show()

In [0]:
# Germline-only records: rows where `INFO_SOMTYPE` is NULL.
germline_count_sql = "select count(1) as number_of_variants_germline from vcf_table where INFO_SOMTYPE is null"
spark.sql(germline_count_sql).show()

# Summary

* We can process VCFs by study type: Somatic calling, Germline calling, or both.
* The assumption is that each "best practice" bioinformatics pipeline generates VCFs with a similar structure, differing only slightly in header annotations.
* A couple of strategies are possible:
    * arrange all Somatic VCFs from the same BioInfo Pipeline output into one table
    * similarly, arrange all Germline VCFs from the same BioInfo Pipeline output into one table
    * if we merge VCFs, make sure to have a clear discriminator column that can filter data records reliably
        * depending on the data pipeline setup, this discriminator column can be inserted while post-processing the VCF files
        * or, it could be added as part of the BioInfo Pipeline VCF annotation process
* We can let Spark/Glow auto-discover the schema from the VCFs; hence, "schema evolution" is possible.
* Or, we can supply a pre-defined schema to Spark/Glow during parsing, hence enforcing a "strict schema".

In [0]:
# Page Break

# Stop Spark Session

In [0]:
# Stop the Spark session now that the exploration is finished.
# NOTE(review): the data paths in this notebook use `dbfs:`, which suggests a managed
# cluster; stopping the session there may affect other attached notebooks -- confirm.
spark.stop()

In [0]:
# Continue to next notebook