## Read the bot_predict CSV and insert it into an Iceberg table

In [None]:
# Stop the session currently bound to `spark`; the next cell builds one again.
# NOTE(review): no earlier cell in this notebook defines `spark` — this presumably
# relies on a session pre-created by the kernel or a startup script; confirm.
spark.stop()

In [None]:
# Imported in this cell so it also runs in a fresh kernel: no other cell in this
# notebook brings SparkSession into scope.
from pyspark.sql import SparkSession

# Build a session named "CSV to Iceberg" (getOrCreate reuses an active session
# if one exists).
# NOTE(review): no catalog / Iceberg settings are passed here; presumably they
# come from the cluster's default Spark configuration — confirm.
spark = SparkSession.builder \
    .appName("CSV to Iceberg") \
    .getOrCreate()

In [1]:
# Load the bot_predict export: a pipe-delimited CSV with a header row, `"` as
# the escape character, records allowed to span several lines, and column types
# inferred from the data.
predict_df = (
    spark.read.format("csv")
    .options(
        inferSchema="true",
        header="true",
        delimiter="|",
        escape="\"",
        multiline="true",
    )
    .load("data/bot_ddvc_hcm_bot_predict.csv")
)

In [2]:
# Print the column names and inferred types of predict_df.
predict_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- BOT_ID: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- INTENT_CONFIDENCE: double (nullable = true)
 |-- INTENT_NAME: string (nullable = true)
 |-- STEP: integer (nullable = true)
 |-- NLU_THRESHOLD: double (nullable = true)
 |-- SENDER_ID: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- CREATED_TIME: string (nullable = true)
 |-- LAST_UPDATED_TIME: string (nullable = true)
 |-- ID_CHATLOG: string (nullable = true)
 |-- UPDATED_INTENT: string (nullable = true)
 |-- LEN_CARD_DATA: integer (nullable = true)
 |-- STATUS_DELETE: string (nullable = true)
 |-- STATUS_CONFIRM: string (nullable = true)
 |-- INTENT_MAP_CLICK_BUTTON: string (nullable = true)



In [3]:
# Print the number of rows read from the CSV.
print(predict_df.count())

10729


In [None]:
import pyspark.sql.functions as f

# Both time columns were inferred as strings; parse them into timestamps using
# one shared pattern (day-month abbreviation-2-digit year, 12-hour clock with
# '.' separators, 9 fractional digits, AM/PM marker).
ts_pattern = "dd-MMM-yy hh.mm.ss.SSSSSSSSS a"
fixed_predict_df = (
    predict_df
    .withColumn("CREATED_TIME", f.to_timestamp("CREATED_TIME", ts_pattern))
    .withColumn("LAST_UPDATED_TIME", f.to_timestamp("LAST_UPDATED_TIME", ts_pattern))
)

In [None]:
# Preview the first 10 rows of a few columns (long values are truncated) to
# check that the time columns were parsed.
preview_columns = ["ID", "BOT_ID", "TEXT", "CREATED_TIME", "LAST_UPDATED_TIME"]
fixed_predict_df.select(*preview_columns).show(n=10, truncate=True)

In [None]:
# Qualified name (database.table) of the table the predict data is written to.
table_name = "bot_db.bot_ddvc_hcm_bot_predict"

In [None]:
%%sql
-- Create the bot_db database unless it already exists.
CREATE DATABASE IF NOT EXISTS bot_db;

In [None]:
%%sql
-- Target Iceberg table for the bot_predict CSV; created only if it is missing.
-- NOTE(review): predict_df.printSchema() reports INTENT_CONFIDENCE and
-- NLU_THRESHOLD as double, but both are declared STRING here. Confirm STRING is
-- intended; otherwise declare them DOUBLE so numeric comparisons need no cast.
CREATE TABLE IF NOT EXISTS bot_db.bot_ddvc_hcm_bot_predict (
    `ID` STRING,
    `BOT_ID` STRING,
    `TEXT` STRING,
    `INTENT_CONFIDENCE` STRING,
    `INTENT_NAME` STRING,
    `STEP` INT,
    `NLU_THRESHOLD` STRING,
    `SENDER_ID` STRING,
    `SOURCE` STRING,
    `CREATED_TIME` TIMESTAMP,
    `LAST_UPDATED_TIME` TIMESTAMP,
    `ID_CHATLOG` STRING,
    `UPDATED_INTENT` STRING,
    `LEN_CARD_DATA` INT,
    `STATUS_DELETE` STRING,
    `STATUS_CONFIRM` STRING,
    `INTENT_MAP_CLICK_BUTTON` STRING
) USING iceberg
-- Partition by the day of CREATED_TIME (days() partition transform).
PARTITIONED BY (days(`CREATED_TIME`));


In [None]:
# Append every row of fixed_predict_df to the predict table.
# NOTE: this is a plain append, so re-running the cell writes the rows again.
table_name = "bot_db.bot_ddvc_hcm_bot_predict"
predict_writer = fixed_predict_df.writeTo(table_name)
predict_writer.append()

In [None]:
%%sql
-- Read back the contents of the predict table.
select * from bot_db.bot_ddvc_hcm_bot_predict

## Read the bot_hcm CSV and insert it into an Iceberg table

In [None]:
# Load the bot_hcm export with the same reader settings as the predict file:
# pipe-delimited CSV, header row, `"` as escape character, multi-line records,
# inferred column types.
bot_hcm_df = (
    spark.read.format("csv")
    .options(
        inferSchema="true",
        header="true",
        delimiter="|",
        escape="\"",
        multiline="true",
    )
    .load("data/bot_dvc_hcm.csv")
)

In [None]:
# Parse CREATED_TIME into a timestamp. `f` is pyspark.sql.functions, imported
# in an earlier cell of this notebook.
ts_pattern = "dd-MMM-yy hh.mm.ss.SSSSSSSSS a"
fixed_bot_hcm_df = bot_hcm_df.withColumn(
    "CREATED_TIME",
    f.to_timestamp("CREATED_TIME", ts_pattern),
)

In [None]:
# Print the column names and types of fixed_bot_hcm_df after the timestamp conversion.
fixed_bot_hcm_df.printSchema()

In [None]:
# Display, without truncating long values, one row whose id matches the value below.
matching_rows = fixed_bot_hcm_df.where("id = '86d87a80-c031-4836-95f8-50f78cffc952'")
matching_rows.select("*").show(1, truncate=False)


In [None]:
%%sql

-- Target Iceberg table for the bot_hcm CSV; created only if it is missing.
-- NOTE(review): every column except CREATED_TIME is declared STRING, while the
-- DataFrame uses inferSchema. Its inferred types are not shown in this
-- notebook — confirm they match this definition.
CREATE TABLE IF NOT EXISTS bot_db.bot_dvc_hcm (
    `ID` STRING,
    `BOT_ID` STRING,
    `SENDER_ID` STRING,
    `MESSAGE` STRING,
    `INPUT_CHANNEL` STRING,
    `CREATED_TIME` TIMESTAMP,
    `ADD_INFO` STRING,
    `SOURCE` STRING,
    `TEXT` STRING,
    `META_DATA` STRING,
    `SESSION_ID` STRING,
    `LOGSTT_ID` STRING,
    `AUDIO_URL` STRING,
    `MSG_RATED` STRING,
    `PAGE_ID` STRING
) USING iceberg
-- Partition by the day of CREATED_TIME (days() partition transform).
PARTITIONED BY (days(`CREATED_TIME`));


In [None]:
# Append every row of fixed_bot_hcm_df to the bot_dvc_hcm table.
# NOTE: this is a plain append, so re-running the cell writes the rows again.
table_name = "bot_db.bot_dvc_hcm"
bot_hcm_writer = fixed_bot_hcm_df.writeTo(table_name)
bot_hcm_writer.append()

In [None]:
%%sql
-- Fetch the MESSAGE value of a single row from the loaded table.
select message from bot_db.bot_dvc_hcm limit 1

In [None]:
%%sql
-- List the distinct INPUT_CHANNEL values present in the table.
select distinct input_channel
from bot_db.bot_dvc_hcm