In [4]:
import json
import random

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

In [1]:
# Stop a session left over from an earlier run (presumably so the builder cell
# below starts a fresh session with its own jar configuration).
# Guarded: when `spark` is not bound yet there is nothing to stop, and the
# unguarded call would abort the run with a NameError.
if "spark" in globals():
    spark.stop()

In [3]:
# Versions embedded in the Spark/Kafka connector jar file names below.
scala_version = '2.12'
spark_version = '3.5.1'

# Local jars handed to Spark through `spark.jars`. The three Spark-Kafka
# artifacts are named from the versions above so a version bump is made in one
# place; kafka-clients and commons-pool2 carry their own, independent versions.
jars_dir = "/opt/spark/jars"
kafka_jars = [
    f"{jars_dir}/spark-sql-kafka-0-10_{scala_version}-{spark_version}.jar",
    f"{jars_dir}/kafka-clients-3.2.0.jar",
    f"{jars_dir}/commons-pool2-2.12.0.jar",
    f"{jars_dir}/spark-streaming-kafka-0-10-assembly_{scala_version}-{spark_version}.jar",
    f"{jars_dir}/spark-token-provider-kafka-0-10_{scala_version}-{spark_version}.jar",
]

spark = (SparkSession.builder
            .appName("Consumer")
            .config("spark.jars", ",".join(kafka_jars))
            .getOrCreate()
        )

# Keep INFO-level log chatter out of the notebook output.
spark.sparkContext.setLogLevel("WARN")

24/08/27 08:14:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Sanity check: show the jar list the running session was configured with.
print(
    spark.sparkContext
    .getConf()
    .get("spark.jars")
)

/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.5.1.jar,/opt/spark/jars/kafka-clients-3.2.0.jar,/opt/spark/jars/commons-pool2-2.12.0.jar,/opt/spark/jars/spark-streaming-kafka-0-10-assembly_2.12-3.5.1.jar,/opt/spark/jars/spark-token-provider-kafka-0-10_2.12-3.5.1.jar


In [27]:
class Consumer():
    """Stream JSON messages from a Kafka topic into an Iceberg table.

    The pipeline is ``ingest_from_kafka`` -> ``get_kafka_message`` ->
    ``get_quality_df``, wired together and started by ``process``.

    Attributes:
        BOOTSTRAP_SERVER: Kafka ``host:port`` used as ``kafka.bootstrap.servers``.
        base_dir: Directory under which the streaming checkpoint is kept.
        topic: Kafka topic to subscribe to.
        tableName: Table the stream is appended to.
    """

    # Pattern handed to to_timestamp() for CREATED_TIME / LAST_UPDATED_TIME.
    TIMESTAMP_FORMAT = "dd-MMM-yy hh.mm.ss.SSSSSSSSS a"
    # Value of the Kafka source option `maxOffsetsPerTrigger`.
    MAX_OFFSETS_PER_TRIGGER = 10
    # Processing-time trigger interval of the streaming write.
    TRIGGER_INTERVAL = "5 seconds"

    def __init__(self,
                 bootstrap_server="192.168.1.53:9092",
                 base_dir="/home/iceberg/notebooks",
                 topic="botPredict",
                 table_name="bot_db.bot_ddvc_hcm_bot_predict",
                 spark_session=None):
        """Store connection settings; no connection is opened here.

        Args:
            bootstrap_server: Kafka bootstrap server as ``host:port``.
            base_dir: Base directory for the checkpoint folder.
            topic: Kafka topic to read.
            table_name: Destination table of the streaming write.
            spark_session: SparkSession to use. When ``None`` the
                notebook-global ``spark`` is looked up at call time.
        """
        self.BOOTSTRAP_SERVER = bootstrap_server
        self.base_dir = base_dir
        self.topic = topic
        self.tableName = table_name
        self._spark = spark_session

    @property
    def checkpoint_dir(self):
        """Checkpoint location shared by ``clean`` and ``process``."""
        return f"{self.base_dir}/checkpoints/iceberg-consumer"

    def _session(self):
        """Return the injected SparkSession, else the notebook-global ``spark``."""
        if self._spark is not None:
            return self._spark
        return spark

    def get_schema(self):
        """Return the ``StructType`` used to parse the JSON message value.

        Column names and types mirror the CREATE TABLE statement of the
        destination table in this notebook.
        """
        from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
        schema = StructType([
            StructField("ID", StringType(), True),
            StructField("BOT_ID", StringType(), True),
            StructField("TEXT", StringType(), True),
            StructField("INTENT_CONFIDENCE", StringType(), True),
            StructField("INTENT_NAME", StringType(), True),
            StructField("STEP", IntegerType(), True),
            StructField("NLU_THRESHOLD", StringType(), True),
            StructField("SENDER_ID", StringType(), True),
            StructField("SOURCE", StringType(), True),
            StructField("CREATED_TIME", TimestampType(), True),
            StructField("LAST_UPDATED_TIME", TimestampType(), True),
            StructField("ID_CHATLOG", StringType(), True),
            StructField("UPDATED_INTENT", StringType(), True),
            StructField("LEN_CARD_DATA", IntegerType(), True),
            StructField("STATUS_DELETE", StringType(), True),
            StructField("STATUS_CONFIRM", StringType(), True),
            StructField("INTENT_MAP_CLICK_BUTTON", StringType(), True)
        ])
        return schema

    def ingest_from_kafka(self):
        """Open a streaming read on ``self.topic``, starting from the earliest offsets.

        Returns:
            Streaming DataFrame produced by the Kafka source.
        """
        kafka_df = (self._session().readStream
                        .format("kafka")
                        .option("kafka.bootstrap.servers", self.BOOTSTRAP_SERVER)
                        .option("subscribe", f"{self.topic}")
                        .option("startingOffsets", "earliest")
                        .option("maxOffsetsPerTrigger", self.MAX_OFFSETS_PER_TRIGGER)
                        .load()
        )
        return kafka_df

    def get_kafka_message(self, kafka_df):
        """Decode the Kafka key and parse the JSON value with ``get_schema``.

        Args:
            kafka_df: DataFrame returned by ``ingest_from_kafka``.

        Returns:
            DataFrame with columns ``key`` (string), ``value`` (struct parsed
            from the JSON payload), ``topic`` and ``timestamp``.
        """
        from pyspark.sql.functions import from_json
        raw_df = kafka_df.select(
            kafka_df.key.cast("string").alias("key"),
            from_json(kafka_df.value.cast("string"), self.get_schema()).alias("value"),
            "topic",
            "timestamp"
        )

        return raw_df

    def get_quality_df(self, raw_df):
        """Flatten the parsed payload and drop rows whose ID is the literal "ID".

        Args:
            raw_df: DataFrame returned by ``get_kafka_message``.

        Returns:
            DataFrame with one column per schema field.
        """
        from pyspark.sql.functions import to_timestamp
        # Filter while `value` is still a column, then flatten the struct.
        # Presumably the "ID" row is a header line published to the topic -
        # TODO confirm. Under SQL null semantics a null ID fails this predicate
        # as well, so such rows are dropped too.
        predict_df = raw_df.filter(raw_df.value.ID != "ID").select("value.*")
        # NOTE(review): get_schema() already declares both columns as
        # TimestampType, so the pattern below is applied to columns that are
        # no longer strings. Confirm the wire format of these fields; if they
        # arrive in TIMESTAMP_FORMAT they likely need to be read as strings.
        predict_df = predict_df.withColumn("CREATED_TIME", to_timestamp("CREATED_TIME", self.TIMESTAMP_FORMAT)) \
                            .withColumn("LAST_UPDATED_TIME", to_timestamp("LAST_UPDATED_TIME", self.TIMESTAMP_FORMAT))

        return predict_df

    def clean(self):
        """Delete the checkpoint directory so the next run starts from scratch.

        A missing directory is not an error: there is nothing to delete
        before the first run.
        """
        import shutil
        try:
            shutil.rmtree(self.checkpoint_dir)
        except FileNotFoundError:
            pass

    def process(self):
        """Build the pipeline and start the streaming append into ``self.tableName``.

        Returns:
            The query handle returned by ``toTable``; keep it to stop or
            monitor the stream.
        """
        print("Start streaming...", end='')
        kafka_df = self.ingest_from_kafka()
        raw_df = self.get_kafka_message(kafka_df)
        predict_df = self.get_quality_df(raw_df)
        sQuery = (predict_df.writeStream
                      .format("iceberg")
                      .queryName("iceberg-ingestion")
                      .option("checkpointLocation", self.checkpoint_dir)
                      .outputMode("append")
                      .trigger(processingTime = self.TRIGGER_INTERVAL)
                      .toTable(f"{self.tableName}")
        )
        print("Done")
        return sQuery
    


In [59]:
# Remove the streaming checkpoint directory before (re)starting the stream.
# NOTE(review): presumably this makes the next run re-read the topic from the
# earliest offsets - confirm that re-ingesting existing rows is intended.
consumer = Consumer()
consumer.clean()

In [60]:
# Start the Kafka -> Iceberg streaming query. `sQuery` is the handle returned
# by Consumer.process(); keep it around to inspect or stop the stream.
consumer = Consumer()
sQuery = consumer.process()


Start streaming...Done


24/08/27 10:03:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/08/27 10:03:32 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


In [55]:
# Stop every streaming query that is currently active on this session.
for query in spark.streams.active:
    query.stop()

In [8]:
%%sql
-- Destination Iceberg table of the streaming write. The column list mirrors
-- Consumer.get_schema(); data is partitioned by the day of CREATED_TIME.
-- NOTE(review): this cell sits after the cell that starts the stream;
-- presumably it has to be run first on a fresh environment.
CREATE TABLE IF NOT EXISTS bot_db.bot_ddvc_hcm_bot_predict (
    `ID` STRING,
    `BOT_ID` STRING,
    `TEXT` STRING,
    `INTENT_CONFIDENCE` STRING,
    `INTENT_NAME` STRING,
    `STEP` INT,
    `NLU_THRESHOLD` STRING,
    `SENDER_ID` STRING,
    `SOURCE` STRING,
    `CREATED_TIME` TIMESTAMP,
    `LAST_UPDATED_TIME` TIMESTAMP,
    `ID_CHATLOG` STRING,
    `UPDATED_INTENT` STRING,
    `LEN_CARD_DATA` INT,
    `STATUS_DELETE` STRING,
    `STATUS_CONFIRM` STRING,
    `INTENT_MAP_CLICK_BUTTON` STRING
) USING iceberg
PARTITIONED BY (days(`CREATED_TIME`));

In [62]:
%%sql
-- Row count of the destination table; a quick check that the stream is landing data.
SELECT count(*) FROM bot_db.bot_ddvc_hcm_bot_predict

count(1)
1763


In [48]:
%%sql
-- Inspect the ingested rows of the destination table.
SELECT * FROM bot_db.bot_ddvc_hcm_bot_predict

ID,BOT_ID,TEXT,INTENT_CONFIDENCE,INTENT_NAME,STEP,NLU_THRESHOLD,SENDER_ID,SOURCE,CREATED_TIME,LAST_UPDATED_TIME,ID_CHATLOG,UPDATED_INTENT,LEN_CARD_DATA,STATUS_DELETE,STATUS_CONFIRM,INTENT_MAP_CLICK_BUTTON
30eb3929-3319-4660-8140-c538d79f6a14,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:26:39.632000,2024-07-24 16:26:39.632000,a668727c-1302-4a4b-9ea1-6c59643914fb,,1,,,
336270ce-6385-47b3-8e0e-c4149bbf98a3,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:36:27.646000,2024-07-24 16:36:27.646000,ec42aaf9-1792-4b42-8acb-01151196a4a1,,1,,,
30d6e6fa-4941-450d-a656-e59285a65551,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:36:37.635000,2024-07-24 16:36:37.635000,fe374c2c-24e9-4b42-a61a-ba5d6fd29c8b,,1,,,
447f2dac-3fbd-479c-ac92-5d810ed21b5f,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:15:58.627000,2024-07-24 16:15:58.627000,05d5431a-8369-4eee-a278-77818cc7ac87,,1,,,
6f092a33-f23a-4914-8dce-87622d9788eb,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:16:02.615000,2024-07-24 16:16:02.615000,22335979-6d2e-432f-8783-82bc0409cb9f,,1,,,
ff158304-231e-4038-b1e6-4224f4a8316f,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:33:55.652000,2024-07-24 16:33:55.652000,59ecbb62-733b-4531-91f7-77b7b39d8782,,1,,,
7dc63bb2-c757-4ed0-bec4-55cf4536949d,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:42:36.670000,2024-07-24 16:42:36.670000,29c4c844-b50a-4b04-83d1-213f0b941043,,1,,,
5e23bfb3-6383-4fb3-81bb-feca98ff8d16,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:42:44.683000,2024-07-24 16:42:44.683000,6f1070f0-0404-46b9-85ac-472300676d4a,,1,,,
e2dc7402-2b07-45a9-bd62-9b2a13b0424b,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,599754d5-3393-4ba4-a884-1a6122f2f4a7,action,2024-07-24 16:42:46.652000,2024-07-24 16:42:46.652000,14a1fa43-c20f-4543-8eaf-73074a7d2a76,,1,,,
0f0e2104-f634-43e7-816a-8f0e6bb0a86e,99ce9950-1ca5-11ef-981d-ffb7de893de8,/home,1.0,xin_chao,406665,0.8,f1dde04c-3024-4789-9c56-e0ea41313704,action,2024-07-24 10:15:46.775000,2024-07-24 10:15:46.775000,b9b84bc4-631f-48e5-84b1-49624d9b9351,,1,,,


In [57]:
%%sql
-- Destructive: removes every row from the destination table.
TRUNCATE TABLE bot_db.bot_ddvc_hcm_bot_predict

In [None]:
for query in spark.stream