# SPARK STREAM

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Maven coordinates handed to spark.jars.packages so the Kafka source is available.
_KAFKA_CONNECTOR_PACKAGE = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0"

# Create the session for this job, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("KafkaStreamingAnalysis")
    .config("spark.jars.packages", _KAFKA_CONNECTOR_PACKAGE)
    .getOrCreate()
)

# Read the "transactions" topic from Kafka as a streaming DataFrame.
_kafka_source_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "subscribe": "transactions",
    "startingOffsets": "earliest",
}
df_kafka = spark.readStream.format("kafka").options(**_kafka_source_options).load()

# Cast the record value to a string and expose it as a single column named "csv".
lines = df_kafka.selectExpr("CAST(value AS STRING) as csv")

# Field names, in the positional order they are read from each CSV payload.
# NOTE(review): the payload is split on every "," — this presumably relies on no
# field containing a quoted comma; confirm against what the producer sends.
columns = [
    "User", "Card", "Year", "Month", "Day", "Time", "Amount", "Use Chip",
    "Merchant Name", "Merchant City", "Merchant State", "Zip", "MCC",
    "Errors?", "Is Fraud?",
]

# Element N of the split array becomes the column named columns[N].
_named_fields = [
    col("data")[position].alias(field_name)
    for position, field_name in enumerate(columns)
]

# Keep a row only when every date/time component is non-null and non-empty.
_has_timestamp_parts = None
for _part in ("Year", "Month", "Day", "Time"):
    for _check in (col(_part).isNotNull(), length(col(_part)) > 0):
        if _has_timestamp_parts is None:
            _has_timestamp_parts = _check
        else:
            _has_timestamp_parts = _has_timestamp_parts & _check

# Assemble "yyyy-MM-dd HH:mm:ss" text: month and day are left-padded to two
# digits and ":00" is appended to Time to supply the seconds the pattern expects.
_event_time_text = concat(
    col("Year"), lit("-"),
    lpad(col("Month"), 2, '0'), lit("-"),
    lpad(col("Day"), 2, '0'), lit(" "),
    col("Time"), lit(":00"),
)

# Weekday labels indexed by Spark's dayofweek numbering (1 = Sunday ... 7 = Saturday).
# No fallback branch: the label is null when no number matches.
_weekday_labels = ["Chủ Nhật", "Thứ 2", "Thứ 3", "Thứ 4", "Thứ 5", "Thứ 6", "Thứ 7"]
_weekday_name = when(dayofweek(col("event_time")) == 1, _weekday_labels[0])
for _number, _label in enumerate(_weekday_labels[1:], start=2):
    _weekday_name = _weekday_name.when(dayofweek(col("event_time")) == _number, _label)

df_parsed = (
    lines.selectExpr("split(csv, ',') as data")
    .select(_named_fields)
    .filter(_has_timestamp_parts)
    # Strip the "$" sign before casting the amount to a number.
    .withColumn("Amount_casted", regexp_replace("Amount", "[$]", "").cast("float"))
    .withColumn("event_time", to_timestamp(_event_time_text, "yyyy-MM-dd HH:mm:ss"))
    .withColumnRenamed("Errors?", "Errors")
    .withColumnRenamed("Is Fraud?", "Is Fraud")
    # Hour is taken from the first two characters of Time.
    .withColumn("Hour", substring(col("Time"), 1, 2).cast("int"))
    .withColumn("DayOfWeek", _weekday_name)
)


# Columns written to the sink, in output order.
_output_columns = [
    "User", "Card", "event_time", "Hour", "Amount_casted", "Use Chip",
    "Merchant Name", "Merchant City", "Merchant State", "Zip", "MCC",
    "Errors", "Is Fraud", "DayOfWeek",
]
df_parsed_selected = df_parsed.select(*_output_columns)

# Start a streaming query that appends the selected columns to HDFS as CSV files
# with a header row; the query handle returned by start() is not kept.
(
    df_parsed_selected.writeStream
    .format("csv")
    .outputMode("append")
    .options(
        header="true",
        path="hdfs://localhost:9000/transactions",
        checkpointLocation="hdfs://localhost:9000/checkpoints_DE",
    )
    .start()
)

# Block until any streaming query terminates, report why, and always stop Spark.
from pyspark.errors import StreamingQueryException

try:
    spark.streams.awaitAnyTermination()
except StreamingQueryException as e:
    # A query terminated with an error: report it as a failure rather than
    # labelling it a graceful stop.
    print("❌ Streaming query failed.")
    print(f"ℹ️ Reason: {e}")
except KeyboardInterrupt:
    # KeyboardInterrupt is not a subclass of Exception, so the generic handler
    # below would never see it.
    print("✅ Streaming stopped gracefully.")
    print("ℹ️ Reason: interrupted by user")
except Exception as e:
    # Original best-effort behaviour, kept as-is: any other error raised while
    # waiting (presumably the gateway error produced by a manual stop — confirm)
    # is reported as a normal shutdown and not re-raised.
    print("✅ Streaming stopped gracefully.")
    print(f"ℹ️ Reason: {e}")
finally:
    # Release the session no matter how the wait ended.
    spark.stop()


25/07/12 13:07:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/07/12 13:07:07 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
ERROR:root:Exception while sending command.                                     
Traceback (most recent call last):
  File "/home/panda/Desktop/SourceCode/venv310/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=70>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/panda/Desktop/SourceCode/venv310/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/panda

✅ Streaming stopped gracefully.
ℹ️ Reason: An error occurred while calling o401.awaitAnyTermination
