### Import Modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

from delta.tables import *

import os
import json
from collections import namedtuple

from datetime import datetime
from pytz import timezone
# pytz timezone object for the America/Sao_Paulo zone
tz = timezone("America/Sao_Paulo")

### Build a Spark Session
The entry point to programming Spark with the Dataset and DataFrame API.

In [2]:
# Create (or reuse) the SparkSession for this lab. The Kafka SQL connector and
# Delta Lake are pulled in as packages, and the Delta SQL extension/catalog are
# registered on the session.
spark = (
    SparkSession.builder
    .appName("LabCDC")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2,io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [3]:
# Show some information about the Spark context and get the URL for the Spark UI.
# Uses the public `sparkContext` property instead of the private `_sc` attribute.
spark.sparkContext

### Consume messages from kafka topics

In [4]:
# Batch-read the CDC topic from Kafka into a Spark DataFrame
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-1:9092")
    .option("subscribe", "lab_cdc.inventory.customers")
    .load()
)

In [5]:
# Print the column layout of the DataFrame loaded from Kafka
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [25]:
# Number of Kafka records read into df
df.count()

4

### Exploring kafka message structure

In [7]:
# Collect every Kafka record from the DataFrame into a Python list of pyspark Row objects
kafka_data = df.collect()

# Show how many records were collected, the container type and the element type
len(kafka_data), type(kafka_data), type(kafka_data[0])

(4, list, pyspark.sql.types.Row)

In [8]:
# Pick the record at index 1 of the collected list as the sample message to inspect
kdata = kafka_data[1]
type(kdata)

pyspark.sql.types.Row

In [9]:
# Convert the sample Row to a dict and list its keys (the Kafka column names)
kdata.asDict().keys()

dict_keys(['key', 'value', 'topic', 'partition', 'offset', 'timestamp', 'timestampType'])

In [10]:
# Show the topic, partition, offset, timestamp and timestampType of the sample Row
# (kdata is kafka_data[1], i.e. the second collected record, not the first)
print(' Topic:          {}\n'.format(kdata['topic']),\
      'Partition:      {}\n'.format(kdata['partition']),\
      'Offset:         {}\n'.format(kdata['offset']),\
      'Timestamp:      {}\n'.format(kdata['timestamp']),\
      'Timestamp Type: {}'.format(kdata['timestampType']))

 Topic:          lab_cdc.inventory.customers
 Partition:      2
 Offset:         1
 Timestamp:      2023-08-08 15:13:13.615000
 Timestamp Type: 0


In [11]:
# Pretty-print the JSON-decoded key and value of the sample Row
# (kdata is kafka_data[1], i.e. the second collected record, not the first)
print('Key\n', json.dumps(json.loads(kdata['key']), indent=4, sort_keys=True), \
      '\n\nValue\n',  json.dumps(json.loads(kdata['value']), indent=4, sort_keys=True))

Key
 {
    "payload": {
        "id": 3
    },
    "schema": {
        "fields": [
            {
                "default": 0,
                "field": "id",
                "optional": false,
                "type": "int32"
            }
        ],
        "name": "lab_cdc.inventory.customers.Key",
        "optional": false,
        "type": "struct"
    }
} 

Value
 {
    "payload": {
        "after": {
            "email": "ed@walker.com",
            "first_name": "Edward",
            "id": 3,
            "last_name": "Walker"
        },
        "before": null,
        "op": "r",
        "source": {
            "connector": "postgresql",
            "db": "postgres",
            "lsn": 22202840,
            "name": "lab_cdc",
            "schema": "inventory",
            "sequence": "[null,\"22202840\"]",
            "snapshot": "true",
            "table": "customers",
            "ts_ms": 1691507586387,
            "txId": 728,
            "version": "2.3.1.Final",
            "

In [12]:
# The decoded value has two top-level keys: payload and schema.
# The schema key is verbose but carries a lot of important information about the data.
# Here we show only the payload of the sample Row (kdata).
json.loads(kdata['value'])['payload']

{'before': None,
 'after': {'id': 3,
  'first_name': 'Edward',
  'last_name': 'Walker',
  'email': 'ed@walker.com'},
 'source': {'version': '2.3.1.Final',
  'connector': 'postgresql',
  'name': 'lab_cdc',
  'ts_ms': 1691507586387,
  'snapshot': 'true',
  'db': 'postgres',
  'sequence': '[null,"22202840"]',
  'schema': 'inventory',
  'table': 'customers',
  'txId': 728,
  'lsn': 22202840,
  'xmin': None},
 'op': 'r',
 'ts_ms': 1691507586651,
 'transaction': None}

In [13]:
# Show the keys found directly under 'payload'
json.loads(kdata['value'])['payload'].keys()

dict_keys(['before', 'after', 'source', 'op', 'ts_ms', 'transaction'])

In [14]:
# The 'op' key describes the type of operation that caused the connector to generate the event
# (per the Debezium reference below: c = create, u = update, d = delete, r = snapshot read).
# ref: https://debezium.io/documentation/reference/stable/connectors/postgresql.html#postgresql-create-events
json.loads(kdata['value'])['payload']['op']

'r'

In [15]:
# The 'after' entry of the payload: the customer's column values carried by the event
json.loads(kdata['value'])['payload']['after']

{'id': 3,
 'first_name': 'Edward',
 'last_name': 'Walker',
 'email': 'ed@walker.com'}

In [16]:
# The 'source' entry of the payload: connector metadata such as db, schema, table, txId and lsn
json.loads(kdata['value'])['payload']['source']

{'version': '2.3.1.Final',
 'connector': 'postgresql',
 'name': 'lab_cdc',
 'ts_ms': 1691507586387,
 'snapshot': 'true',
 'db': 'postgres',
 'sequence': '[null,"22202840"]',
 'schema': 'inventory',
 'table': 'customers',
 'txId': 728,
 'lsn': 22202840,
 'xmin': None}

In [17]:
# Print one line per CDC event: operation, customer id and the customer columns.
for row in kafka_data:
    # Rows with a null value (e.g. tombstones) carry nothing to decode; skip them.
    if row.value is None:
        continue

    payload = json.loads(row['value'])['payload']
    key_payload = json.loads(row['key'])['payload']

    op = payload['op']
    # The id is taken from the message key, so it is available even when the
    # event has no 'after' image.
    user_id = key_payload['id']

    # Delete events ('d') are reported with empty customer columns. A null or
    # missing 'after' on any other operation is treated the same way instead
    # of raising a TypeError/KeyError.
    after = {} if op == 'd' else (payload.get('after') or {})
    first_name = after.get('first_name')
    last_name = after.get('last_name')
    email = after.get('email')

    print(op, user_id, first_name, last_name, email)

r 2 George Bailey gbailey@foobar.com
r 3 Edward Walker ed@walker.com
r 4 Jonh Kretchmar annek@noanswer.org
r 1 Sally Thomas sally.thomas@acme.com


In [18]:
# Field descriptors declared in the schema embedded in the sample message's value
dicts = json.loads(kdata['value'])['schema']['fields']
# List the name of each declared field
[x['field'] for x in dicts]

['before', 'after', 'source', 'op', 'ts_ms', 'transaction']

In [19]:
# Look up the schema descriptor of the 'after' field (next() raises StopIteration if none matches)
next(item for item in dicts if item["field"] == "after")

{'type': 'struct',
 'fields': [{'type': 'int32', 'optional': False, 'default': 0, 'field': 'id'},
  {'type': 'string', 'optional': True, 'field': 'first_name'},
  {'type': 'string', 'optional': True, 'field': 'last_name'},
  {'type': 'string', 'optional': True, 'field': 'email'}],
 'optional': True,
 'name': 'lab_cdc.inventory.customers.Value',
 'field': 'after'}

### Transforming Data

In [20]:
# Function to dynamically infer the schema of the Kafka messages
def infer_schema_json(df, column):
    """Infer the schema of the JSON documents stored in a Kafka column.

    The column is cast to string, null entries are discarded, a single sample
    is kept per Kafka key, and the remaining strings are parsed with Spark's
    JSON reader so that it infers one schema covering all of them.

    Relies on the notebook-level ``spark`` session and the ``F`` alias for
    ``pyspark.sql.functions``.

    Args:
        df: DataFrame containing a ``key`` column and the column named by
            ``column``.
        column: Name of the column holding the JSON documents
            (``"value"`` or ``"key"`` in this notebook).

    Returns:
        str: The inferred schema serialised as JSON (``StructType.json()``).
    """
    df_json = (
        # cast to string and filter out empty values
        df.withColumn(column, F.expr("string({})".format(column)))
        .filter(F.col(column).isNotNull())
        # keep one sample per key: max() over the single-field struct picks the
        # greatest string for that key, which is not necessarily the most
        # recent message
        # NOTE(review): wrap `offset` into the struct if "latest" is required.
        .select("key", F.expr("struct({}) r".format(column)))
        .groupBy("key").agg(F.expr("max(r) r"))
        .select("r.{}".format(column)))

    # decode the json values
    df_read = spark.read.json(df_json.rdd.map(lambda x: x[column]), multiLine=True)

    # Only the schema is returned, so filtering rows has no effect on the
    # result; dropping the helper column is enough. The previous code called an
    # un-imported `col` here (NameError) and kept the corrupt rows rather than
    # the valid ones.
    if "_corrupt_record" in df_read.columns:
        df_read = df_read.drop("_corrupt_record")

    # schema
    return df_read.schema.json()

In [21]:
# Infer the JSON schemas of the Kafka value and key columns
topic_schema_value = infer_schema_json(df, "value")
topic_schema_key = infer_schema_json(df, "key")

# Bundle both schemas (JSON strings) in a small named record for later use
TopicSchema = namedtuple('TopicSchema', ['key', 'value'])
topic_schema = TopicSchema(key=topic_schema_key, value=topic_schema_value)

In [22]:
# Select columns from messages' payload and enforce unique kafka key
dfw = df \
      .withColumn("value", F.expr("string(value)"))\
      .filter(F.col("value").isNotNull())\
      .select(\
         F.expr("offset as kafka_offset"),\
         F.expr("timestamp as created_at"),\
         F.expr("string(key) as kafka_key"),\
         "value")\
      .withColumn('value', F.from_json(F.col("value"), topic_schema.value))\
      .withColumn('key_json', F.from_json(F.col("kafka_key"), topic_schema.key))\
      .select('kafka_key', 
              'kafka_offset', 
              'value.payload.op', 
              'key_json.payload.id', 
              'value.payload.after.first_name', 
              'value.payload.after.last_name', 
              'value.payload.after.email',
              'created_at')
print("Schema\n")
dfw.printSchema()
print("\nRecords\n")
dfw.show()

Schema

root
 |-- kafka_key: string (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- op: string (nullable = true)
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- created_at: timestamp (nullable = true)


Records

+--------------------+------------+---+---+----------+---------+--------------------+--------------------+
|           kafka_key|kafka_offset| op| id|first_name|last_name|               email|          created_at|
+--------------------+------------+---+---+----------+---------+--------------------+--------------------+
|{"schema":{"type"...|           0|  r|  2|    George|   Bailey|  gbailey@foobar.com|2023-08-08 15:13:...|
|{"schema":{"type"...|           1|  r|  3|    Edward|   Walker|       ed@walker.com|2023-08-08 15:13:...|
|{"schema":{"type"...|           2|  r|  4|      Jonh|Kretchmar|  annek@noanswer.org|2023-08-08 15:13:...|
|{"schema":{"type"..

### Loading data into Delta Lake

In [23]:
# Define a path location where the table will be created
location = "/delta_lake/customers"
# Create or replace table with path and add properties
deltaTable = DeltaTable.createOrReplace(spark) \
  .addColumn("kafka_key", "STRING") \
  .addColumn("kafka_offset", "BIGINT") \
  .addColumn("op", "STRING") \
  .addColumn("id", "BIGINT") \
  .addColumn("first_name", "STRING") \
  .addColumn("last_name", "STRING") \
  .addColumn("email", "STRING") \
  .addColumn("created_at", "TIMESTAMP") \
  .addColumn("updated_at", "TIMESTAMP") \
  .property("description", "table with customers data") \
  .location(location) \
  .execute()

In [26]:
# List the contents of the Delta table directory (shell escape; {location} is interpolated by IPython)
!ls {location} -la

total 12
drwxr-xr-x 3 jovyan users 4096 Aug  8 16:50 .
drwxrwxr-x 3 jovyan  1000 4096 Aug  8 16:50 ..
drwxr-xr-x 2 jovyan users 4096 Aug  8 16:50 _delta_log


In [27]:
# Append the flattened Kafka records (dfw) to the existing Delta table at `location`.
# dfw has no `updated_at` column, so that column is left null for these rows.
dfw.write.format("delta").mode("append").save(location)

### Visualize delta lake table

In [28]:
# Read the Delta table back from `location` and display its rows
dfr = spark.read.format("delta").load(location)
dfr.show()

+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| op| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|           0|  r|  1|     Sally|   Thomas|sally.thomas@acme...|2023-08-08 15:13:...|      null|
|{"schema":{"type"...|           0|  r|  2|    George|   Bailey|  gbailey@foobar.com|2023-08-08 15:13:...|      null|
|{"schema":{"type"...|           1|  r|  3|    Edward|   Walker|       ed@walker.com|2023-08-08 15:13:...|      null|
|{"schema":{"type"...|           2|  r|  4|      Jonh|Kretchmar|  annek@noanswer.org|2023-08-08 15:13:...|      null|
+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+



Referências:
https://docs.delta.io/latest/delta-batch.html#-ddlcreatetable
https://docs.delta.io/latest/delta-constraints.html
https://spark.apache.org/docs/3.1.1/sql-ref.html
https://spark.apache.org/docs/3.1.1/sql-ref-syntax.html
https://docs.delta.io/latest/best-practices.html
https://debezium.io/documentation/reference/1.6/connectors/postgresql.html
https://partners-intl.aliyun.com/help/doc-detail/141203.htm
https://spark.apache.org/docs/3.1.1/structured-streaming-kafka-integration.html#content
https://debezium.io/documentation/online-resources/
https://github.com/suchitgupta01/spark-streaming-with-debezium
https://suchit-g.medium.com/spark-streaming-with-kafka-connect-debezium-connector-ab9163808667
https://stackoverflow.com/questions/62296734/how-to-transform-a-debezium-message-in-json-format-such-that-it-can-be-loaded-in
https://github.com/kimaina/openmrs-elt
https://sandeepkattepogu.medium.com/python-spark-transformations-on-kafka-data-8a19b498b32c
https://spark.apache.org/docs/2.1.2/api/python/_modules/pyspark/sql/readwriter.html
https://docs.delta.io/latest/quick-start.html#create-a-table&language-python
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.collect.html
https://sparkbyexamples.com/pyspark/pyspark-collect/
https://keestalkstech.com/2019/11/streaming-a-kafka-topic-to-a-delta-table-on-s3-with-spark-structured-streaming/ *****
https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html