In [3]:
import pandas as pd
import numpy as np
import json
import random

In [1]:
# Stop a SparkSession left over from a previous run so the next cell can
# build a fresh one with the Kafka jars configured.
try:
    spark.stop()
except NameError:
    # Fresh kernel: no session has been created yet, so there is nothing to stop.
    pass

In [3]:
# SparkSession is not imported anywhere else in the notebook, so import it here
# to keep this cell runnable on a fresh kernel.
from pyspark.sql import SparkSession

# Versions used to derive the Spark/Kafka connector jar file names below.
scala_version = '2.12'
spark_version = '3.5.1'

# Build (or reuse) a session with the Kafka connector jars on the classpath.
# kafka-clients and commons-pool2 carry their own version numbers, which are
# independent of the Scala/Spark versions above.
spark = (SparkSession.builder
            .appName("Producer")
            .config("spark.jars", ",".join([
            f"/opt/spark/jars/spark-sql-kafka-0-10_{scala_version}-{spark_version}.jar",
            "/opt/spark/jars/kafka-clients-3.2.0.jar",
            "/opt/spark/jars/commons-pool2-2.12.0.jar",
            f"/opt/spark/jars/spark-streaming-kafka-0-10-assembly_{scala_version}-{spark_version}.jar",
            f"/opt/spark/jars/spark-token-provider-kafka-0-10_{scala_version}-{spark_version}.jar"
            ]))
            .getOrCreate()
        )

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.debug.maxToStringFields", "1000")

24/08/27 08:13:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/27 08:13:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Sets the same key the session-creation cell already sets, this time passing
# the value as an int instead of the string "1000".
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [6]:
# Sanity check: print the comma-separated "spark.jars" value held by the
# running SparkContext's configuration.
print(spark.sparkContext.getConf().get("spark.jars"))

/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.5.1.jar,/opt/spark/jars/kafka-clients-3.2.0.jar,/opt/spark/jars/commons-pool2-2.12.0.jar,/opt/spark/jars/spark-streaming-kafka-0-10-assembly_2.12-3.5.1.jar,/opt/spark/jars/spark-token-provider-kafka-0-10_2.12-3.5.1.jar


In [5]:
class Producer():
    """Stream pipe-delimited chat-log CSV files into a Kafka topic.

    The pipeline is: read CSV files as a stream -> parse the two time columns
    -> turn each row into a Kafka (key, value) record -> write to Kafka.

    Relies on a SparkSession named ``spark`` being defined in the notebook's
    global namespace before ``read_chat_data`` / ``process`` are called.
    """

    def __init__(self,
                 bootstrap_server="192.168.1.53:9092",
                 base_dir="/home/iceberg/notebooks",
                 topic="botPredict",
                 processing_time="5 seconds"):
        """Store the connection and path settings.

        Args:
            bootstrap_server: Kafka ``host:port`` passed as
                ``kafka.bootstrap.servers``.
            base_dir: Directory that holds the ``data`` input folder and the
                ``checkpoints/producer`` checkpoint folder.
            topic: Kafka topic the records are written to.
            processing_time: Trigger interval string for the streaming query.
        """
        self.BOOTSTRAP_SERVER = bootstrap_server
        self.base_dir = base_dir
        self.topic = topic
        self.processingTime = processing_time

    def _checkpoint_dir(self):
        """Return the checkpoint directory shared by send_message and clean."""
        return f"{self.base_dir}/checkpoints/producer"

    def get_schema(self):
        """Return the StructType describing one row of the input CSV files.

        Every column is nullable. ``STEP`` and ``LEN_CARD_DATA`` are integers;
        all other columns, including the two time columns, are read as
        strings (the time columns are parsed later in ``read_chat_data``).
        """
        from pyspark.sql.types import StructType, StructField, StringType, IntegerType
        schema = StructType([
            StructField("ID", StringType(), True),
            StructField("BOT_ID", StringType(), True),
            StructField("TEXT", StringType(), True),
            StructField("INTENT_CONFIDENCE", StringType(), True),
            StructField("INTENT_NAME", StringType(), True),
            StructField("STEP", IntegerType(), True),
            StructField("NLU_THRESHOLD", StringType(), True),
            StructField("SENDER_ID", StringType(), True),
            StructField("SOURCE", StringType(), True),
            StructField("CREATED_TIME", StringType(), True),
            StructField("LAST_UPDATED_TIME", StringType(), True),
            StructField("ID_CHATLOG", StringType(), True),
            StructField("UPDATED_INTENT", StringType(), True),
            StructField("LEN_CARD_DATA", IntegerType(), True),
            StructField("STATUS_DELETE", StringType(), True),
            StructField("STATUS_CONFIRM", StringType(), True),
            StructField("INTENT_MAP_CLICK_BUTTON", StringType(), True)
        ])
        return schema

    def read_chat_data(self, path):
        """Open a streaming CSV reader on ``path`` and parse the time columns.

        Args:
            path: Directory (or path pattern) the CSV stream source loads from.

        Returns:
            A streaming DataFrame with the schema from ``get_schema`` in which
            ``CREATED_TIME`` and ``LAST_UPDATED_TIME`` have been converted
            with ``to_timestamp``.
        """
        import pyspark.sql.functions as f

        # Both time columns share this pattern, so define it once.
        timestamp_format = "dd-MMM-yy hh.mm.ss.SSSSSSSSS a"

        predict_df = (spark.readStream
            .format("csv")
            .schema(self.get_schema())
            .option("delimiter", "|")
            .option("escape", "\"")
            .option("multiline", "true")
            .load(path))

        fixed_predict_df = (predict_df
            .withColumn("CREATED_TIME", f.to_timestamp("CREATED_TIME", timestamp_format))
            .withColumn("LAST_UPDATED_TIME", f.to_timestamp("LAST_UPDATED_TIME", timestamp_format)))

        return fixed_predict_df

    def get_kafka_message(self, df, key):
        """Project ``df`` into the (key, value) shape written to Kafka.

        Args:
            df: DataFrame whose rows become Kafka records.
            key: SQL expression (typically a column name) used as the record key.

        Returns:
            A DataFrame with two columns: ``key`` (the given expression) and
            ``value`` (all columns of the row serialized as one JSON string).
        """
        return df.selectExpr(f"{key} as key", "to_json(struct(*)) as value")

    # NOTE(review): an earlier comment here named the topic
    # "bot_ddvc_hcm_bot_predict"; the topic actually used is self.topic.
    def send_message(self, kafka_df):
        """Start the streaming write of ``kafka_df`` to Kafka.

        Args:
            kafka_df: Streaming DataFrame with ``key`` and ``value`` columns.

        Returns:
            The started streaming query (named "kafka-producer").
        """
        return (kafka_df.writeStream
                    .format("kafka")
                    .queryName("kafka-producer")
                    .option("kafka.bootstrap.servers", self.BOOTSTRAP_SERVER)
                    .option("topic", self.topic)
                    .option("checkpointLocation", self._checkpoint_dir())
                    .outputMode("append")
                    .trigger(processingTime=self.processingTime)
                    .start()
               )

    def clean(self):
        """Delete this producer's checkpoint directory, if it exists.

        The path is derived from ``self.base_dir`` so it always matches the
        ``checkpointLocation`` used by ``send_message``. A missing directory
        (e.g. on the very first run) is not an error.
        """
        import shutil
        try:
            shutil.rmtree(self._checkpoint_dir())
        except FileNotFoundError:
            # Nothing to clean up yet.
            pass

    def process(self):
        """Build the full pipeline and start it.

        Reads from ``<base_dir>/data``, keys each record by the ``ID`` column
        and starts the Kafka write.

        Returns:
            The started streaming query.
        """
        print("Starting Kafka Producer Stream...", '')
        path = f"{self.base_dir}/data"
        df = self.read_chat_data(path)
        # Use the "ID" column as the key and all DataFrame columns as the value.
        kafka_df = self.get_kafka_message(df, "ID")
        sQuery = self.send_message(kafka_df)
        print("Done\n")
        return sQuery

In [None]:
# Build the producer with its built-in settings.
producer = Producer()
# Remove the previous run's checkpoint directory before starting.
producer.clean()
# Start the streaming query that writes the CSV rows to Kafka.
sQuery = producer.process()
# Block this cell until the query terminates (stopped from elsewhere or failed).
sQuery.awaitTermination()

Starting Kafka Producer Stream... 


24/08/27 08:13:58 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Done



                                                                                

In [9]:
import shutil

# Remove the producer's streaming checkpoint directory so the next run starts
# without saved streaming state. Safe to run repeatedly: a directory that is
# already gone is not an error.
try:
    shutil.rmtree("/home/iceberg/notebooks/checkpoints/producer")
except FileNotFoundError:
    # Already removed (or never created).
    pass

In [None]:
# Stop every streaming query that is still active on this Spark session.
for query in spark.streams.active:
    query.stop()