### Import Modules

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.window import Window

from delta.tables import *

import os
import json

from datetime import datetime
from pytz import timezone
# pytz timezone object for the America/Sao_Paulo zone.
tz = timezone("America/Sao_Paulo")

### Build a Spark Session
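Create the `SparkSession`, the entry point to programming Spark with the Dataset and DataFrame API. The session is configured with the Kafka source and Delta Lake packages.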

In [4]:
# Create (or reuse) the Spark session for this lab.
# - spark.jars.packages pulls in the Kafka SQL source and Delta Lake core jars.
# - The extension and catalog settings enable Delta Lake SQL support.
spark = (
    SparkSession.builder
    .appName("LabCDC")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,io.delta:delta-core_2.12:1.2.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

### Visualize the Delta Lake table

In [22]:
# Path of the Delta table that is read into a DataFrame below.
location = "/delta_lake/customers"

# Load the table at `location` using the Delta data source.
dfr = spark.read.load(location, format="delta")

In [29]:
# Display the table rows, newest created_at first.
dfr.orderBy(F.desc("created_at")).show()

+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|          13|  2|    George|   Bailey|   george@foobar.com|2022-05-02 21:17:...|      null|
|{"schema":{"type"...|           5|  7|  Fernando| Meireles|fernando.meireles...|2022-05-02 21:11:...|      null|
|{"schema":{"type"...|           5|  7|  Fernando| Meireles|fernando.meireles...|2022-05-02 21:11:...|      null|
|{"schema":{"type"...|          12|  6|      Caca|  Diegues|caca.diegues@vida...|2022-05-02 21:11:...|      null|
|{"schema":{"type"...|          12|  6|      Caca|  Diegues|caca.diegues@vida...|2022-05-02 21:11:...|      null|
|{"schema":{"type"...|           4|  5|   Glauber|    Rocha|glauber.rocha@ter...|2022-05

In [26]:
# Keep a single row per id: the one ranked first by the window below.
#
# Ordering by created_at alone is ambiguous when several rows of the same id
# share a created_at value; row_number() then picks one of them arbitrarily.
# kafka_offset is added as a tie-breaker so the selection is deterministic.
# NOTE(review): assumes a higher kafka_offset means a more recent event for
# the same id (i.e. all events of one id land in the same Kafka partition)
# -- TODO confirm against the ingestion job.
w2 = Window.partitionBy("id").orderBy(
    F.col("created_at").desc(),
    F.col("kafka_offset").desc(),
)

(
    dfr.withColumn("row", F.row_number().over(w2))  # rank rows within each id
    .filter(F.col("row") == 1)                      # keep the top-ranked row
    .drop("row")                                    # helper column no longer needed
    .show()
)

+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|           3|  1|     Sally|   Thomas|sally.thomas@acme...|2022-05-02 17:56:...|      null|
|{"schema":{"type"...|          13|  2|    George|   Bailey|   george@foobar.com|2022-05-02 21:17:...|      null|
|{"schema":{"type"...|          10|  3|    Edward|   Walker|       ed@walker.com|2022-05-02 17:56:...|      null|
|{"schema":{"type"...|          11|  4|      Jonh|Kretchmar|  annek@noanswer.org|2022-05-02 17:56:...|      null|
|{"schema":{"type"...|           4|  5|   Glauber|    Rocha|glauber.rocha@ter...|2022-05-02 21:11:...|      null|
|{"schema":{"type"...|          12|  6|      Caca|  Diegues|caca.diegues@vida...|2022-05

References:
https://docs.delta.io/latest/delta-batch.html#-ddlcreatetable
https://docs.delta.io/latest/delta-constraints.html
https://spark.apache.org/docs/3.1.1/sql-ref.html
https://spark.apache.org/docs/3.1.1/sql-ref-syntax.html
https://docs.delta.io/latest/best-practices.html
https://debezium.io/documentation/reference/1.6/connectors/postgresql.html
https://partners-intl.aliyun.com/help/doc-detail/141203.htm
https://spark.apache.org/docs/3.1.1/structured-streaming-kafka-integration.html#content
https://debezium.io/documentation/online-resources/
https://github.com/suchitgupta01/spark-streaming-with-debezium
https://suchit-g.medium.com/spark-streaming-with-kafka-connect-debezium-connector-ab9163808667
https://stackoverflow.com/questions/62296734/how-to-transform-a-debezium-message-in-json-format-such-that-it-can-be-loaded-in
https://github.com/kimaina/openmrs-elt
https://sandeepkattepogu.medium.com/python-spark-transformations-on-kafka-data-8a19b498b32c
https://spark.apache.org/docs/2.1.2/api/python/_modules/pyspark/sql/readwriter.html
https://docs.delta.io/latest/quick-start.html#create-a-table&language-python
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.collect.html
https://sparkbyexamples.com/pyspark/pyspark-collect/
https://keestalkstech.com/2019/11/streaming-a-kafka-topic-to-a-delta-table-on-s3-with-spark-structured-streaming/ *****
https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html