In [0]:
# import pyspark
# import glow
from delta import *
from pyspark.sql.functions import explode, col, size

In [0]:
# builder = pyspark.sql.SparkSession.builder.appName("GlowDeltalakeETL") \
#     .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec") \
#     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
#     .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# extra_packages = [
#     "io.projectglow:glow-spark3_2.12:1.2.1",
# ]

In [0]:
# spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()

In [0]:
# spark = glow.register(spark)

In [0]:
# spark

In [0]:
# Page Break

# ETL Somatic VCF and Save as Deltatable

In [0]:
# Compressed input VCF (note the .vcf.gz suffix) that the next cell loads through the "vcf" reader.
# NOTE(review): the path sits under a personal dbfs:/tmp/dannywong folder — confirm it exists in your workspace.
bcbio_somatic_src = "dbfs:/tmp/dannywong/na12878-na24385-somatic-hg38-truth.vcf.gz"

In [0]:
# `spark` is not created by any active cell in this notebook (the session-builder cells
# above are commented out), so it has to be provided by the runtime.
# NOTE(review): the "vcf" data source is presumably supplied by Glow — confirm it is installed on the cluster.
bcbio_somatic_df = spark.read.format("vcf").load(bcbio_somatic_src)

In [0]:
# Inspect the schema exactly as loaded, before any reshaping.
bcbio_somatic_df.printSchema()

In [0]:
# Page Break

## ETL ON NESTED GENOTYPES COLUMN

In [0]:
# Explode the `genotypes` column: one output row per element, with the
# exploded value stored back under the same column name.
bcbio_somatic_df_exploded = bcbio_somatic_df.withColumn(
    "genotypes",
    explode(col("genotypes")),
)

In [0]:
# Schema after the explode, for comparison with the schema printed right after loading.
bcbio_somatic_df_exploded.printSchema()

In [0]:
# Page Break

## FLATTEN NESTED GENOTYPE FIELDS

In [0]:
def flatten_struct_fields(df_):
    """
    Promote the fields of every top-level struct column to top-level columns.

    Columns whose dtype string does not start with ``struct`` are kept as they
    are and come first in the result. Each field ``f`` of a struct column ``s``
    is selected as ``s.f`` and renamed to ``s_f``. Only one level is expanded:
    a field that is itself a struct is not flattened any further.

    REF https://github.com/microsoft/genomicsnotebook

    :param df_: DataFrame whose struct columns should be flattened
    :return: result of ``df_.select`` over the plain columns followed by the renamed struct fields
    """
    plain_names = []
    struct_names = []
    for name, type_str in df_.dtypes:
        bucket = struct_names if type_str.startswith('struct') else plain_names
        bucket.append(name)

    promoted = []
    for parent in struct_names:
        # Selecting "<struct>.*" expands the struct, so the column list of that
        # selection is the list of the struct's field names.
        for field in df_.select(parent + '.*').columns:
            promoted.append(col(parent + '.' + field).alias(parent + '_' + field))

    return df_.select(plain_names + promoted)

In [0]:
# Promote the fields of struct columns to top-level `<struct>_<field>` columns
# (the later cells rely on names such as `genotypes_sampleId`).
bcbio_somatic_df_exploded_flatten = flatten_struct_fields(bcbio_somatic_df_exploded)

In [0]:
# Schema of the flattened frame; this is the shape that gets written out later.
bcbio_somatic_df_exploded_flatten.printSchema()

In [0]:
# Page Break

## QA TRANSFORMED DATAFRAME

In [0]:
# List the distinct sample IDs present in the flattened table.
(
    bcbio_somatic_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .show()
)

In [0]:
# Peek at the key columns, restricted to rows of sample NA12878.
(
    bcbio_somatic_df_exploded_flatten
    .select(
        "contigName",
        "start",
        "end",
        "INFO_AC",
        "INFO_SOMTYPE",
        "genotypes_sampleId",
        "genotypes_calls",
        "genotypes_alleleDepths",
    )
    .where("genotypes_sampleId = 'NA12878'")
    .show()
)

In [0]:
# Row counts per INFO_AC value; cube() also emits an overall-total row whose key is null.
(
    bcbio_somatic_df_exploded_flatten
    .cube("INFO_AC")
    .count()
    .show()
)

In [0]:
# Row counts per alternateAlleles value; cube() also emits an overall-total row whose key is null.
(
    bcbio_somatic_df_exploded_flatten
    .cube("alternateAlleles")
    .count()
    .show()
)

In [0]:
# Number of distinct sample IDs in the flattened table.
n = (
    bcbio_somatic_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .count()
)
print(f"number of samples: {n}")

In [0]:
# Total row count of the flattened table.
# NOTE(review): this frame comes from an explode on `genotypes`, so a row is presumably one
# (variant, sample) pair rather than one VCF record — confirm which meaning of "records" is intended.
n = bcbio_somatic_df_exploded_flatten.count()
print(f"number of records: {n}")

In [0]:
# A site is multiallelic when it lists more than one alternate allele.
# Count on the un-exploded frame: the flattened frame was produced by exploding
# `genotypes`, so it holds one row per genotype entry and would repeat each site
# once per sample, inflating a count that is reported as "sites".
n = (
    bcbio_somatic_df
    .where(size("alternateAlleles") > 1)
    .count()
)

print(f"number of multiallelic sites: {n}")

In [0]:
# Page Break

# Write to Deltatable

In [0]:
# Destination path of the Delta table. The active target is on DBFS; the relative
# path on the next line is a commented-out alternative (presumably for local runs).
# somatic_table = "./lakehouse/bcbio/somatic_table"
somatic_table = "dbfs:/tmp/dannywong/lakehouse/bcbio/somatic_table"

In [0]:
# Persist the flattened frame in Delta format, overwriting whatever already exists at the target path.
(
    bcbio_somatic_df_exploded_flatten.write
    .format("delta")
    .mode("overwrite")
    .save(somatic_table)
)

In [0]:
%fs
ls dbfs:/tmp/dannywong/lakehouse/bcbio/somatic_table

In [0]:
!tree ./lakehouse/bcbio/somatic_table

In [0]:
# Page Break

# Summary

* In this example, we read VCF through Spark/Glow.
* We perform data transformation on the Genotype column.
* We flatten (i.e. data normalisation) the nested vectors of the Genotype column.
* This makes the data performant and easy to query by predicates such as Sample ID (i.e. we accept the up-front transformation as a trade-off for faster information retrieval).
* We then write this dataframe out as multi-part compressed Parquet files through the Deltatable framework.
* We can write a Spark dataframe to a Deltatable in several ways, such as:
    * We would use an `upsert` (a Delta `MERGE` operation rather than a `DataFrameWriter` save mode) when we need to update existing records or insert new ones into an existing delta table.
    * Or `append` mode, if existing records in the destination table are immutable, and so forth.
* If we wish, we could also write the dataframe as plain Parquet files only and arrange them in a traditional Datalake structure.
    * e.g. `bcbio_somatic_df_exploded_flatten.write.format("parquet").mode("overwrite").save("./datalake/pipeline=bcbio/type=somatic/year=2022/month=01")`
    * In this case, we will be responsible for maintaining Datalake structure and its key-value partitioning such as `pipeline=bcbio`, etc
* With the Deltatable framework, we abstract this away and leave it to the framework, and achieve a "Logical Table" that performs like a relational database table.
* Hence, Deltatable is one of the technologies that underpin the "LakeHouse" architecture pattern, making BigData data warehousing possible.

In [0]:
# Page Break

# Stop Spark Session

In [0]:
# spark.stop()

In [0]:
# Continue to next notebook