In [0]:
# import pyspark
# import glow
from delta import *
from pyspark.sql.functions import explode, col, size

In [0]:
#  builder = pyspark.sql.SparkSession.builder.appName("GlowDeltalakeETL") \
#     .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec") \
#     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
#     .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# extra_packages = [
#     "io.projectglow:glow-spark3_2.12:1.2.1",
# ]

In [0]:
# spark = configure_spark_with_delta_pip(builder, extra_packages=extra_packages).getOrCreate()

In [0]:
# spark = glow.register(spark)

In [0]:
# spark

In [0]:
# Page Break

# ETL Germline VCF and Save as a Delta Table

* NOTE: This repeats the previous Somatic step, applied here to the germline VCF.

In [0]:
# Location of the bgzipped germline VCF to ingest (user-scoped DBFS tmp path).
# NOTE(review): hard-coded per-user path — adjust before running elsewhere.
bcbio_germline_src = "dbfs:/tmp/dannywong/na12878-na24385-germline-hg38-truth.vcf.gz"

In [0]:
# Load the VCF into a DataFrame via the "vcf" data source.
# `spark` is not created in this notebook (the session-setup cells above are
# commented out), so it must be supplied by the runtime.
# NOTE(review): the "vcf" format is presumably provided by Glow — confirm the
# library is installed on the cluster.
bcbio_germline_df = spark.read.format("vcf").load(bcbio_germline_src)

In [0]:
# Inspect the schema produced by the VCF reader before any reshaping.
bcbio_germline_df.printSchema()

In [0]:
# Page Break

## ETL ON NESTED GENOTYPES COLUMN

In [0]:
# Replace the `genotypes` array with one row per array element.
# `explode` emits no row for a null or empty array, so any input row without
# genotype entries is dropped here.
bcbio_germline_df_exploded = bcbio_germline_df.withColumn("genotypes", explode("genotypes"))

In [0]:
# Confirm that `genotypes` is now a single element rather than an array.
bcbio_germline_df_exploded.printSchema()

In [0]:
# Page Break

## FLATTEN NESTED GENOTYPE FIELDS

In [0]:
def flatten_struct_fields(df_):
    """
    Promote the fields of every top-level struct column to top-level columns.

    Adapted from https://github.com/microsoft/genomicsnotebook

    Only columns whose dtype string starts with ``struct`` are expanded, and
    only by one level; structs nested inside arrays or inside other structs
    are left untouched.

    :param df_: DataFrame to flatten.
    :return: DataFrame holding the non-struct columns first (in their original
        order), followed by one ``<struct>_<field>`` column per struct field.
    """
    plain_names = []
    struct_names = []
    for name, dtype in df_.dtypes:
        if dtype.startswith('struct'):
            struct_names.append(name)
        else:
            plain_names.append(name)

    # Build "<struct>.<field>" selectors aliased to "<struct>_<field>".
    expanded = []
    for parent in struct_names:
        for field in df_.select(parent + '.*').columns:
            expanded.append(col(parent + '.' + field).alias(parent + '_' + field))

    return df_.select(plain_names + expanded)

In [0]:
# Expand the struct columns of the exploded frame (here the per-row
# `genotypes` struct) into `<struct>_<field>` columns.
bcbio_germline_df_exploded_flatten = flatten_struct_fields(bcbio_germline_df_exploded)

In [0]:
# Verify the flattened column names before querying them below.
bcbio_germline_df_exploded_flatten.printSchema()

In [0]:
# Page Break

## QA TRANSFORMED DATAFRAME

In [0]:
# List the distinct sample ids present after explode + flatten.
(
    bcbio_germline_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .show()
)

In [0]:
# Preview position, allele-count and genotype columns for sample NA12878.
(
    bcbio_germline_df_exploded_flatten
    .select(
        "contigName",
        "start",
        "end",
        "INFO_AC",
        "genotypes_sampleId",
        "genotypes_calls",
        "genotypes_alleleDepths",
    )
    .where("genotypes_sampleId = 'NA12878'")
    .show()
)

In [0]:
# Row counts per INFO_AC value; cube also adds a grand-total row (null key).
(
    bcbio_germline_df_exploded_flatten
    .cube("INFO_AC")
    .count()
    .show()
)

In [0]:
# Row counts per alternateAlleles value; cube also adds a grand-total row (null key).
(
    bcbio_germline_df_exploded_flatten
    .cube("alternateAlleles")
    .count()
    .show()
)

In [0]:
# Number of distinct sample ids in the flattened frame.
n = (
    bcbio_germline_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .count()
)
print(f"number of samples: {n}")

In [0]:
# Total rows in the flattened frame. `genotypes` was exploded earlier, so this
# counts one row per genotype entry (variant x sample), not one per VCF line.
n = bcbio_germline_df_exploded_flatten.count()
print(f"number of records: {n}")

In [0]:
# Count multiallelic *sites*, not rows. The flattened frame holds one row per
# (variant, sample) after the genotypes explode, so counting rows directly
# would count each site once per sample. De-duplicate on the site key first.
n = (
    bcbio_germline_df_exploded_flatten
    .where(size("alternateAlleles") > 1)  # more than one ALT allele
    .select("contigName", "start", "end", "referenceAllele", "alternateAlleles")
    .distinct()
    .count()
)

print(f"number of multiallelic sites: {n}")

In [0]:
# Page Break

# Write to Delta Table

In [0]:
# Destination path for the germline Delta table (user-scoped DBFS tmp path).
# NOTE(review): hard-coded per-user path — adjust before running elsewhere.
germline_table = "dbfs:/tmp/dannywong/lakehouse/bcbio/germline_table"

In [0]:
# Persist the flattened frame in Delta format. mode("overwrite") replaces any
# data already stored at `germline_table`, so re-running this cell is safe but
# destructive.
bcbio_germline_df_exploded_flatten.write.format("delta").mode("overwrite").save(germline_table)

In [0]:
# List the files just written. Reuses `germline_table` instead of repeating
# the literal path, so the listing cannot drift from the write location.
display(dbutils.fs.ls(germline_table))

In [0]:
!tree ./lakehouse/bcbio/germline_table

In [0]:
# Page Break

# Stop Spark Session

In [0]:
# spark.stop()

In [0]:
# Continue to next notebook