### Import Modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.window import Window

from delta.tables import *

import os
import json

from datetime import datetime
from pytz import timezone
# pytz timezone object for America/Sao_Paulo, kept as a notebook-level global.
# NOTE(review): no use of `tz` is visible in this section of the notebook — confirm it is still needed.
tz = timezone("America/Sao_Paulo")

### Build a Spark Session
`SparkSession` is the entry point to programming Spark with the Dataset and DataFrame API.

In [2]:
# Create the session, or reuse one that is already running (getOrCreate).
# spark.jars.packages lists the Kafka SQL connector and Delta Lake core
# artifacts; the extension and catalog settings register Delta's SQL
# extension and catalog implementation on the session.
spark = (
    SparkSession.builder
    .appName("LabCDC")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,io.delta:delta-core_2.12:1.2.1",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

### Visualize the Delta Lake table

In [3]:
# Path of the Delta table that this notebook reads
location = "/delta_lake/customers"

# Load the table stored at that path into a DataFrame using the Delta reader
dfr = spark.read.load(location, format="delta")

In [4]:
# Number of rows in the Delta table loaded into `dfr`.
dfr.count()

4

In [12]:
# Print a preview of the table; show() with no arguments displays at most 20 rows
# and truncates long cell values (hence the "..." in the output).
dfr.show()

+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| op| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|           2|  c|  7|  Fernando| Meireles|fernando.meireles...|2022-05-06 16:01:...|      null|
|{"schema":{"type"...|           1|  c|  5|   Glauber|    Rocha|glauber.rocha@ter...|2022-05-06 16:01:...|      null|
|{"schema":{"type"...|           3|  c|  8|  Fernando| Meireles|meireles@cidadede...|2022-05-06 16:03:...|      null|
|{"schema":{"type"...|           3|  c|  6|    Hilton|  Lacerda|hilton@tatuagem.m...|2022-05-06 16:01:...|      null|
|{"schema":{"type"...|           0|  r|  1|     Sally|   Thomas|sally.thomas@acme...|2022-05-06 14:30:...|      null|
|{"schema":{"type"...|           4|  u|  2|    George|  

In [13]:
# Collapse the change log to the current state of each customer:
#   1. rank the events of every id from newest to oldest,
#   2. keep only the newest event per id,
#   3. drop ids whose newest event is a delete (op == 'd').
#
# Ordering by created_at alone leaves row_number() free to pick any row when
# two events of the same id share a timestamp, so the result could change
# between runs. kafka_offset (descending) is used as a tie-breaker to make the
# choice deterministic.
# NOTE(review): this assumes all events for one id land in the same Kafka
# partition (default key-based partitioning) and that kafka_offset is a
# numeric column — confirm against the ingestion job.
w2 = Window.partitionBy("id").orderBy(
    F.col("created_at").desc(),
    F.col("kafka_offset").desc(),
)
(
    dfr.withColumn("row", F.row_number().over(w2))
    .filter(F.col("row") == 1)
    .drop("row")
    .filter(F.col("op") != 'd')
    .show()
)

+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|           kafka_key|kafka_offset| op| id|first_name|last_name|               email|          created_at|updated_at|
+--------------------+------------+---+---+----------+---------+--------------------+--------------------+----------+
|{"schema":{"type"...|           0|  r|  1|     Sally|   Thomas|sally.thomas@acme...|2022-05-06 14:30:...|      null|
|{"schema":{"type"...|           4|  u|  2|    George|   Bailey|   george@foobar.com|2022-05-06 16:02:...|      null|
|{"schema":{"type"...|           5|  u|  3|    Edward|   Walker|   edward@walker.com|2022-05-06 16:03:...|      null|
|{"schema":{"type"...|           2|  r|  4|      Jonh|Kretchmar|  annek@noanswer.org|2022-05-06 14:30:...|      null|
|{"schema":{"type"...|           1|  c|  5|   Glauber|    Rocha|glauber.rocha@ter...|2022-05-06 16:01:...|      null|
|{"schema":{"type"...|           3|  c|  6|    Hilton|  