### Import Modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from delta.tables import *

import os
import json

from datetime import datetime
from pytz import timezone
# Time zone object for America/Sao_Paulo (not referenced by the cells visible in this section)
tz = timezone("America/Sao_Paulo")

### Build a Spark Session
The entry point to programming Spark with the Dataset and DataFrame API.

In [2]:
# Create (or reuse) the session named "LabCDC", pulling in the Kafka source and
# Delta Lake packages and registering Delta's SQL extension and catalog.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,io.delta:delta-core_2.12:1.2.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .appName("LabCDC")
    .getOrCreate()
)

In [3]:
# Show some information about the Spark context and get the URL for the Spark UI.
# Uses the public `sparkContext` property instead of the private `_sc` attribute.
spark.sparkContext

### Consume messages from Kafka topics

In [4]:
# Create a Spark DataFrame from the Kafka topic (batch read)
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-1:9092")
    .option("subscribe", "lab_cdc.inventory.customers")
    .load()
)

In [5]:
# Print the schema of the DataFrame returned by the Kafka source
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### Exploring the Kafka message structure

In [6]:
# Collect every row of the Spark DataFrame to the driver as a list of pyspark Row objects
kafka_data = df.collect()

# Show the number of rows, the type of the container, and the type of its first element
len(kafka_data), type(kafka_data), type(kafka_data[0])

(8, list, pyspark.sql.types.Row)

In [7]:
# Show the field names of the first Row by converting it to a dict
kafka_data[0].asDict().keys()

dict_keys(['key', 'value', 'topic', 'partition', 'offset', 'timestamp', 'timestampType'])

In [8]:
# Show topic, partition, offset, timestamp, and timestampType from the first Row.
# print() separates its arguments with a single space, which is what indents
# every line after the first by one character.
print(*(
    template.format(kafka_data[0][column])
    for template, column in (
        (' Topic:          {}\n', 'topic'),
        ('Partition:      {}\n', 'partition'),
        ('Offset:         {}\n', 'offset'),
        ('Timestamp:      {}\n', 'timestamp'),
        ('Timestamp Type: {}', 'timestampType'),
    )
))

 Topic:          lab_cdc.inventory.customers
 Partition:      2
 Offset:         3
 Timestamp:      2022-04-29 19:46:19.458000
 Timestamp Type: 0


In [9]:
# Pretty-print the JSON documents stored in the key and the value of the first Row
print(
    'Key\n',
    json.dumps(json.loads(kafka_data[0]['key']), indent=4, sort_keys=True),
    '\n\nValue\n',
    json.dumps(json.loads(kafka_data[0]['value']), indent=4, sort_keys=True),
)

Key
 {
    "payload": {
        "id": 2
    },
    "schema": {
        "fields": [
            {
                "default": 0,
                "field": "id",
                "optional": false,
                "type": "int32"
            }
        ],
        "name": "lab_cdc.inventory.customers.Key",
        "optional": false,
        "type": "struct"
    }
} 

Value
 {
    "payload": {
        "after": {
            "email": "gbailey@foobar.com",
            "first_name": "George",
            "id": 2,
            "last_name": "Bailey"
        },
        "before": null,
        "op": "r",
        "source": {
            "connector": "postgresql",
            "db": "postgres",
            "lsn": 24279104,
            "name": "lab_cdc",
            "schema": "inventory",
            "sequence": "[null,\"24279104\"]",
            "snapshot": "true",
            "table": "customers",
            "ts_ms": 1651261553960,
            "txId": 738,
            "version": "1.9.2.Final",
        

In [10]:
# The decoded value has two top-level keys: payload and schema.
# The schema key is verbose; it describes the structure of the data.
# Here we show only the payload key of the first Row.
json.loads(kafka_data[0]['value'])['payload']

{'before': None,
 'after': {'id': 2,
  'first_name': 'George',
  'last_name': 'Bailey',
  'email': 'gbailey@foobar.com'},
 'source': {'version': '1.9.2.Final',
  'connector': 'postgresql',
  'name': 'lab_cdc',
  'ts_ms': 1651261553960,
  'snapshot': 'true',
  'db': 'postgres',
  'sequence': '[null,"24279104"]',
  'schema': 'inventory',
  'table': 'customers',
  'txId': 738,
  'lsn': 24279104,
  'xmin': None},
 'op': 'r',
 'ts_ms': 1651261553961,
 'transaction': None}

In [11]:
# Show the keys found directly under payload
json.loads(kafka_data[0]['value'])['payload'].keys()

dict_keys(['before', 'after', 'source', 'op', 'ts_ms', 'transaction'])

In [12]:
# The 'op' key describes the type of operation that caused the connector to generate the event.
# Ref: https://debezium.io/documentation/reference/stable/connectors/postgresql.html#postgresql-create-events
json.loads(kafka_data[0]['value'])['payload']['op']

'r'

In [13]:
# Show the 'after' entry of the first message's payload
json.loads(kafka_data[0]['value'])['payload']['after']

{'id': 2,
 'first_name': 'George',
 'last_name': 'Bailey',
 'email': 'gbailey@foobar.com'}

In [14]:
# Show the 'source' entry of the first message's payload
json.loads(kafka_data[0]['value'])['payload']['source']

{'version': '1.9.2.Final',
 'connector': 'postgresql',
 'name': 'lab_cdc',
 'ts_ms': 1651261553960,
 'snapshot': 'true',
 'db': 'postgres',
 'sequence': '[null,"24279104"]',
 'schema': 'inventory',
 'table': 'customers',
 'txId': 738,
 'lsn': 24279104,
 'xmin': None}

In [15]:
# Print the operation type and the row fields carried by every collected message
for row in kafka_data:
    # Debezium emits a tombstone (null value) after a delete event; nothing to decode
    if row['value'] is None:
        continue
    payload = json.loads(row['value'])['payload']
    # 'after' is null for delete events; fall back to 'before' (the previous row
    # state) so the loop does not fail on them
    record = payload['after'] or payload['before'] or {}
    op         = payload['op']
    user_id    = record.get('id')
    first_name = record.get('first_name')
    last_name  = record.get('last_name')
    email      = record.get('email')
    print(op, user_id, first_name, last_name, email)

r 2 George Bailey gbailey@foobar.com
r 3 Edward Walker ed@walker.com
r 4 Jonh Kretchmar annek@noanswer.org
r 2 George Bailey gbailey@foobar.com
r 3 Edward Walker ed@walker.com
r 4 Jonh Kretchmar annek@noanswer.org
r 1 Sally Thomas sally.thomas@acme.com
r 1 Sally Thomas sally.thomas@acme.com


In [16]:
# Field definitions listed in the 'schema' section of the first message's value
dicts = json.loads(kafka_data[0]['value'])['schema']['fields']
# Names of those fields, in the order they are listed
[field_def['field'] for field_def in dicts]

['before', 'after', 'source', 'op', 'ts_ms', 'transaction']

In [17]:
# Pick the first schema entry whose field name is "after"
# (next() raises StopIteration if no entry matches)
next(item for item in dicts if item["field"] == "after")

{'type': 'struct',
 'fields': [{'type': 'int32', 'optional': False, 'default': 0, 'field': 'id'},
  {'type': 'string', 'optional': True, 'field': 'first_name'},
  {'type': 'string', 'optional': True, 'field': 'last_name'},
  {'type': 'string', 'optional': True, 'field': 'email'}],
 'optional': True,
 'name': 'lab_cdc.inventory.customers.Value',
 'field': 'after'}

### Saving data into a Delta Table

In [18]:
# Path where the Delta table will be created
location = "/delta_lake/customers"

# Create (or replace) the table at that path, declaring its columns and a
# description property
deltaTable = (
    DeltaTable.createOrReplace(spark)
    .addColumn("kafka_key", "STRING")
    .addColumn("kafka_offset", "BIGINT")
    .addColumn("id", "BIGINT")
    .addColumn("first_name", "STRING")
    .addColumn("last_name", "STRING")
    .addColumn("email", "STRING")
    .addColumn("created_at", "TIMESTAMP")
    .addColumn("updated_at", "TIMESTAMP")
    .property("description", "table with customers data")
    .location(location)
    .execute()
)

In [19]:
# List the contents of the table directory through the shell
!ls {location} -la

total 12
drwxr-xr-x 3 jovyan users 4096 May  1 20:37 .
drwxrwxr-x 3 jovyan  1000 4096 May  1 20:37 ..
drwxr-xr-x 2 jovyan users 4096 May  1 20:37 _delta_log


In [20]:
# Dynamically infer the schema of the JSON messages read from Kafka
def infer_schema_json(kafka_df):
    """Infer the schema of the JSON documents stored in a Kafka DataFrame.

    The ``value`` column is cast to string, rows with a null value are
    discarded, and a single record per key is kept (the maximum of
    ``struct(offset, value)``). Spark's JSON reader then infers a schema from
    the remaining documents.

    Args:
        kafka_df: DataFrame exposing at least the ``key``, ``value`` and
            ``offset`` columns.

    Returns:
        The inferred schema serialized as a JSON string (the output of
        ``StructType.json()``), with the ``_corrupt_record`` column removed.

    Note:
        Uses the notebook-level ``spark`` session to read the JSON documents.
    """
    latest_values = (
        kafka_df.withColumn("value", F.expr("string(value)"))
        # filter out empty values
        .filter(F.col("value").isNotNull())
        # get the latest record for each key
        .select("key", F.expr("struct(offset, value) r"))
        .groupBy("key").agg(F.expr("max(r) r"))
        .select("r.value"))

    # decode the JSON values and let Spark infer their schema
    df_read = spark.read.json(latest_values.rdd.map(lambda x: x.value), multiLine=True)

    # Unparseable documents surface as a `_corrupt_record` column. Only the
    # schema is returned, so removing that column is enough; no row filter is
    # needed. (The previous code called an undefined `col` and its predicate
    # would have kept the corrupt rows instead of the valid ones.)
    if "_corrupt_record" in df_read.columns:
        df_read = df_read.drop("_corrupt_record")

    # schema
    return df_read.schema.json()

# Pass the DataFrame read from Kafka to infer the topic schema as a JSON string,
# then build a pyspark.sql.types.StructType object from that JSON
topic_schema_txt = infer_schema_json(df)
topic_schema = StructType.fromJson(json.loads(topic_schema_txt))

In [21]:
# Select columns from the messages' payload and keep a single record per Kafka key
dfw = (
    df
    .withColumn("value", F.expr("string(value)"))
    .filter(F.col("value").isNotNull())
    .select(
        F.expr("offset as kafka_offset"),
        F.expr("timestamp as created_at"),
        F.expr("string(key) as kafka_key"),
        "value",
    )
    # Pack all selected columns into struct r (kafka_offset is its first field)
    # and keep max(r) per key -- presumably the record with the highest offset
    .select("kafka_key", F.expr("struct(*) as r"))
    .groupBy("kafka_key")
    .agg(F.expr("max(r) r"))
    # Parse the JSON text with the topic schema and flatten payload.after
    .withColumn("value", F.from_json(F.col("r.value"), topic_schema))
    .select("r.kafka_key", "r.kafka_offset", "r.created_at", "value.payload.after.*")
)

print("Schema\n")
dfw.printSchema()

print("\nRecords\n")
dfw.show()

Schema

root
 |-- kafka_key: string (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)


Records

+--------------------+------------+--------------------+--------------------+----------+---+---------+
|           kafka_key|kafka_offset|          created_at|               email|first_name| id|last_name|
+--------------------+------------+--------------------+--------------------+----------+---+---------+
|{"schema":{"type"...|           2|2022-05-01 14:33:...|sally.thomas@acme...|     Sally|  1|   Thomas|
|{"schema":{"type"...|           6|2022-05-01 14:33:...|  gbailey@foobar.com|    George|  2|   Bailey|
|{"schema":{"type"...|           7|2022-05-01 14:33:...|       ed@walker.com|    Edward|  3|   Walker|
|{"schema":{"type"...|           8|2022-05-01 14:33:...|  annek@noanswer.org| 

In [22]:
# Append the Kafka data to the existing Delta table stored at `location`
dfw.write.format("delta").mode("append").save(location)

### Visualize the Delta Lake table

In [23]:
# Read the Delta table back from `location` and display its rows
dfr = spark.read.format("delta").load(location)
dfr.show()

+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|           2|  1|     Sally|   Thomas|sally.thomas@acme...|2022-05-01 14:33:...|      null|
|{"schema":{"type"...|           6|  2|    George|   Bailey|  gbailey@foobar.com|2022-05-01 14:33:...|      null|
|{"schema":{"type"...|           7|  3|    Edward|   Walker|       ed@walker.com|2022-05-01 14:33:...|      null|
|{"schema":{"type"...|           8|  4|      Jonh|Kretchmar|  annek@noanswer.org|2022-05-01 14:33:...|      null|
+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+



Referências:
https://docs.delta.io/latest/delta-batch.html#-ddlcreatetable
https://docs.delta.io/latest/delta-constraints.html
https://spark.apache.org/docs/3.1.1/sql-ref.html
https://spark.apache.org/docs/3.1.1/sql-ref-syntax.html
https://docs.delta.io/latest/best-practices.html
https://debezium.io/documentation/reference/1.6/connectors/postgresql.html
https://partners-intl.aliyun.com/help/doc-detail/141203.htm
https://spark.apache.org/docs/3.1.1/structured-streaming-kafka-integration.html#content
https://debezium.io/documentation/online-resources/
https://github.com/suchitgupta01/spark-streaming-with-debezium
https://suchit-g.medium.com/spark-streaming-with-kafka-connect-debezium-connector-ab9163808667
https://stackoverflow.com/questions/62296734/how-to-transform-a-debezium-message-in-json-format-such-that-it-can-be-loaded-in
https://github.com/kimaina/openmrs-elt
https://sandeepkattepogu.medium.com/python-spark-transformations-on-kafka-data-8a19b498b32c
https://spark.apache.org/docs/2.1.2/api/python/_modules/pyspark/sql/readwriter.html
https://docs.delta.io/latest/quick-start.html#create-a-table&language-python
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.collect.html
https://sparkbyexamples.com/pyspark/pyspark-collect/
https://keestalkstech.com/2019/11/streaming-a-kafka-topic-to-a-delta-table-on-s3-with-spark-structured-streaming/ *****
https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html