In [1]:
import pyspark
import glow
from delta import *
from pyspark.sql.functions import explode, col, size

In [2]:
# Session builder for this ETL: turns on the Delta Lake SQL extension and
# catalog, and adds Glow's BGZF codec to Hadoop's compression codec list
# (presumably required for the block-gzipped .vcf.gz input read later on).
builder = (
    pyspark.sql.SparkSession.builder
    .appName("GlowDeltalakeETL")
    .config("spark.hadoop.io.compression.codecs", "io.projectglow.sql.util.BGZFCodec")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Extra Maven coordinates to pull in next to the Delta Lake artifacts.
extra_packages = ["io.projectglow:glow-spark3_2.12:1.2.1"]

In [3]:
# Let the Delta helper add its own packages plus `extra_packages` to the
# builder, then create the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=extra_packages,
).getOrCreate()

In [4]:
# Pass the session through Glow's registration hook and keep the session it
# returns; later cells use this rebound `spark`.
spark = glow.register(spark)

In [None]:
# Bare expression: the notebook renders the session object as the cell output.
spark

In [None]:
# Page Break

# ETL Germline VCF and Save as a Delta Table

* NOTE: This repeats the steps of the previous Somatic section, applied to the germline VCF.

In [5]:
# Input: gzip-compressed germline truth VCF, given as a path relative to the
# notebook's working directory.
bcbio_germline_src = "./data/bcbio_giab_somatic/na12878-na24385-germline-hg38-truth.vcf.gz"

In [None]:
# Read the VCF into a Spark DataFrame via the "vcf" data source.
# NOTE(review): the "vcf" format is presumably supplied by the Glow package
# configured on the session -- confirm if the package list changes.
bcbio_germline_df = spark.read.format("vcf").load(bcbio_germline_src)

In [7]:
# Print the column tree of the freshly loaded DataFrame.
bcbio_germline_df.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_platformnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetwithotheruniqgenopassing: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_AC: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_varType: string (nullable = true)
 |-- INFO_DPSum: integer (nullable = true)
 

In [None]:
# Page Break

## ETL ON NESTED GENOTYPES COLUMN

In [8]:
# Replace the `genotypes` array column with one row per array element (same
# column name, now holding a single element). Spark's `explode` drops rows
# whose array is null or empty, so variants without genotypes would disappear.
bcbio_germline_df_exploded = bcbio_germline_df.withColumn("genotypes", explode("genotypes"))

In [9]:
# Print the schema after exploding `genotypes`.
bcbio_germline_df_exploded.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_platformnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetwithotheruniqgenopassing: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_AC: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_varType: string (nullable = true)
 |-- INFO_DPSum: integer (nullable = true)
 

In [None]:
# Page Break

## FLATTEN NESTED GENOTYPE FIELDS

In [10]:
def flatten_struct_fields(df_):
    """
    Promote the fields of every top-level struct column to top-level columns.

    Non-struct columns are kept unchanged and come first, in their original
    order. Each struct column ``s`` is then replaced by one column per field
    ``f``, named ``s_f``. Only one level is flattened: a struct nested inside
    another struct, or inside an array, is left as it is.

    REF https://github.com/microsoft/genomicsnotebook

    :param df_: Spark DataFrame whose top-level struct columns should be flattened.
    :return: new DataFrame with the flat columns followed by the ``s_f`` columns.
    """
    def _quote(identifier):
        # Backtick-quote so names containing dots or other special characters
        # are treated as one identifier instead of a nested-field path.
        return "`" + identifier.replace("`", "``") + "`"

    selected = []
    struct_names = []
    for name, dtype in df_.dtypes:
        if dtype.startswith("struct"):
            struct_names.append(name)
        else:
            selected.append(col(_quote(name)))

    for parent in struct_names:
        # Field names come straight from the schema; no throwaway
        # `select("parent.*")` is needed just to list them.
        for child in df_.schema[parent].dataType.names:
            selected.append(
                col(_quote(parent) + "." + _quote(child)).alias(parent + "_" + child)
            )

    return df_.select(selected)

In [11]:
# Flatten the struct columns of the exploded frame (here the per-sample
# `genotypes` struct) into `genotypes_<field>` columns.
bcbio_germline_df_exploded_flatten = flatten_struct_fields(bcbio_germline_df_exploded)

In [12]:
# Print the schema after flattening to check the new column names.
bcbio_germline_df_exploded_flatten.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_platformnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetwithotheruniqgenopassing: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_callsetnames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_AC: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_varType: string (nullable = true)
 |-- INFO_DPSum: integer (nullable = true)
 

In [None]:
# Page Break

## QA TRANSFORMED DATAFRAME

In [13]:
# QA: list every sample ID present after explode + flatten.
(
    bcbio_germline_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .show()
)



+------------------+
|genotypes_sampleId|
+------------------+
|           NA12878|
+------------------+



                                                                                

In [14]:
# QA: spot-check position, allele count, calls and allele depths for the
# rows that belong to sample NA12878.
(
    bcbio_germline_df_exploded_flatten
    .select("contigName", "start", "end", "INFO_AC", "genotypes_sampleId", "genotypes_calls", "genotypes_alleleDepths")
    .where("genotypes_sampleId = 'NA12878'")
    .show()
)

+----------+------+------+-------+------------------+---------------+----------------------+
|contigName| start|   end|INFO_AC|genotypes_sampleId|genotypes_calls|genotypes_alleleDepths|
+----------+------+------+-------+------------------+---------------+----------------------+
|      chr1|817185|817186|    [2]|           NA12878|         [0, 1]|             [78, 454]|
|      chr1|817340|817341|    [2]|           NA12878|         [0, 1]|            [107, 342]|
|      chr1|817888|817889|    [2]|           NA12878|         [0, 1]|             [74, 220]|
|      chr1|818801|818802|    [2]|           NA12878|         [0, 1]|              [0, 202]|
|      chr1|818811|818812|    [2]|           NA12878|         [0, 1]|              [0, 190]|
|      chr1|818953|818954|    [2]|           NA12878|         [0, 1]|              [0, 246]|
|      chr1|819122|819123|    [2]|           NA12878|         [0, 1]|            [110, 363]|
|      chr1|819583|819584|    [2]|           NA12878|         [0, 1]| 



In [15]:
# QA: row count per distinct INFO_AC value. `cube` also emits a grand-total
# row whose key is null, so a `null` line in the output is the overall total
# rather than a count of rows with a missing INFO_AC.
(
    bcbio_germline_df_exploded_flatten
    .cube("INFO_AC")
    .count()
    .show()
)



+------------+-------+
|     INFO_AC|  count|
+------------+-------+
|      [2, 0]|   9751|
|         [1]|1016830|
|   [1, 0, 0]|   1377|
|      [1, 0]|  16177|
|[1, 1, 0, 0]|    940|
|        null|2123774|
|      [1, 1]|  14887|
|   [2, 0, 0]|    748|
|         [2]|1057338|
|   [1, 1, 0]|   5726|
+------------+-------+



                                                                                

In [16]:
# QA: row count per distinct alternateAlleles array. `cube` adds a
# grand-total row with a null key, and `show()` prints only the first 20 rows.
(
    bcbio_germline_df_exploded_flatten
    .cube("alternateAlleles")
    .count()
    .show()
)



+--------------------+-----+
|    alternateAlleles|count|
+--------------------+-----+
|              [TAGC]|   22|
|            [GAATGC]|    1|
|    [ATGTG, ATGTGTG]|   28|
|             [TAAAC]|  141|
|         [GTGTTTTCT]|    1|
|        [TTTGTTG, T]|    8|
|              [GCAA]|   18|
|  [CTCTA, CTCTATCTA]|    6|
| [TTG, TTGTGTGTG, T]|    2|
|          [ATTTCTAT]|    1|
|          [C, CGTGT]|   63|
|             [TATTC]|   17|
|[CATCTATCTATCT, C...|    1|
| [CGT, CGTGTGTGT, C]|    2|
|[CATT, CATTATT, C...|    3|
|          [AAAGTAGC]|    1|
|            [ATTTCT]|   14|
|     [TAC, T, TACAC]|   15|
| [TCACACACACACACACA]|    1|
|         [GCCC, GCC]|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [17]:
# QA: how many distinct samples the flattened table contains.
n = (
    bcbio_germline_df_exploded_flatten
    .select("genotypes_sampleId")
    .distinct()
    .count()
)
print(f"number of samples: {n}")



number of samples: 1


                                                                                

In [18]:
# QA: total rows in the flattened table. Because `genotypes` was exploded,
# a row is one (variant, sample) pair rather than one VCF record.
n = bcbio_germline_df_exploded_flatten.count()
print(f"number of records: {n}")



number of records: 2123774


                                                                                

In [19]:
# QA: rows listing more than one alternate allele. Rows are per
# (variant, sample), so this equals the number of sites only while the table
# holds a single sample.
n = (
    bcbio_germline_df_exploded_flatten
    .where(size("alternateAlleles") > 1)
    .count()
)

print(f"number of multiallelic sites: {n}")



number of multiallelic sites: 49606


                                                                                

In [None]:
# Page Break

# Write to Delta Table

In [20]:
# Output directory of the germline Delta table (relative to the working directory).
germline_table = "./lakehouse/bcbio/germline_table"

In [21]:
# Persist the flattened frame in Delta format. "overwrite" replaces whatever
# the table at `germline_table` held before, so re-running the cell is safe.
(
    bcbio_germline_df_exploded_flatten.write
    .format("delta")
    .mode("overwrite")
    .save(germline_table)
)

22/10/07 23:40:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [22]:
!tree ./lakehouse/bcbio/germline_table

[01;34m./lakehouse/bcbio/germline_table[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-e6bec1f6-e168-4966-8792-aae80b7f2fb7-c000.snappy.parquet[0m
├── [00mpart-00001-243f7c82-8ee4-428b-8a79-0652c674c3cb-c000.snappy.parquet[0m
├── [00mpart-00002-9b626eac-4afe-499d-8416-348ba0950f6f-c000.snappy.parquet[0m
└── [00mpart-00003-a2267a27-bc20-47ab-8099-37d25dffc783-c000.snappy.parquet[0m

1 directory, 5 files


In [None]:
# Page Break

# Stop Spark Session

In [23]:
# Shut the Spark session down now that the table has been written.
spark.stop()

In [None]:
# Continue to next notebook