In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, pandas_udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
env_bootstrap = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
env_topic     = os.getenv("KAFKA_TOPIC", "youtube-comments")
env_checkpoint= os.getenv("SPARK_CHECKPOINT", "/tmp/spark_ckpt_yt")



In [3]:
# 1. Initialize SparkSession with Kafka support
spark = (SparkSession.builder
    .appName("YouTubeCommentSentimentStreaming")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
    .getOrCreate())

your 131072x1 screen size is bogus. expect trouble
25/05/20 07:19:09 WARN Utils: Your hostname, LAPTOP-9RDPJROS resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/05/20 07:19:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/dtnghia/.ivy2/cache
The jars for the packages stored in: /home/dtnghia/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4577ec07-ae1e-4e0e-a2da-b4c8c361f095;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 474ms :: artifacts dl 22ms
	

In [4]:
# 2. Define schema for incoming JSON
schema = StructType([
    StructField("comment_id", StringType()),
    StructField("author", StringType()),
    StructField("text", StringType()),
    StructField("published_at", StringType()),
    StructField("like_count", IntegerType())
])

# 3. Read Kafka stream
raw_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", env_bootstrap) \
    .option("subscribe", env_topic) \
    .option("startingOffsets", "latest") \
    .load()

# 4. Parse JSON values
json_df = raw_df.selectExpr("CAST(value AS STRING) as json_str") \
    .select(from_json(col("json_str"), schema).alias("c")) \
    .select("c.*")

In [7]:
@pandas_udf(StringType())
def classify_sentiment(text_col):
    import tensorflow_hub as hub
    import numpy as np
    import pandas as pd

    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    texts = text_col.str.slice(0, 512).str.replace("\n", " ")
    embeddings = embed(texts.tolist() + ["good", "bad"])
    comment_embs = embeddings[: len(texts)].numpy()
    good_vec = embeddings[len(texts)].numpy()
    bad_vec  = embeddings[len(texts) + 1].numpy()

    labels = []
    for vec in comment_embs:
        sim_good = np.dot(vec, good_vec)
        sim_bad  = np.dot(vec, bad_vec)
        labels.append("Positive" if sim_good > sim_bad else "Negative")

    return pd.Series(labels)

In [None]:

labeled_df = json_df.withColumn("sentiment", classify_sentiment(col("text")))

In [None]:

query = labeled_df.writeStream \
    .format("console") \
    .outputMode("append") \
    .option("checkpointLocation", env_checkpoint) \
    .trigger(processingTime="1 second") \
    .start()

query.awaitTermination()

25/05/20 07:31:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/20 07:31:17 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+------+----+------------+----------+---------+
|comment_id|author|text|published_at|like_count|sentiment|
+----------+------+----+------------+----------+---------+
+----------+------+----+------------+----------+---------+



25/05/20 07:31:19 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 2327 milliseconds
25/05/20 07:31:47 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
2025-05-20 07:31:53.271623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747726313.288613   50109 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747726313.294021   50109 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747726313.308534   50109 computation_placer.cc:177] computation placer already registered

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+--------+--------------------+--------------------+----------+---------+
|          comment_id|  author|                text|        published_at|like_count|sentiment|
+--------------------+--------+--------------------+--------------------+----------+---------+
|UCsNlU2qzkuORYd-q...|@VuPK274|Dạo này tâm lý họ...|2025-05-18T18:51:09Z|         0| Positive|
+--------------------+--------+--------------------+--------------------+----------+---------+



25/05/20 07:33:02 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 76738 milliseconds
25/05/20 07:33:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:03 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+--------------------+--------------------+--------------------+----------+---------+
|          comment_id|              author|                text|        published_at|like_count|sentiment|
+--------------------+--------------------+--------------------+--------------------+----------+---------+
|UCpIanxWluVQyXtkS...|       @AnhĐức-t7i1z|Jorden phản bội v...|2025-05-14T08:00:13Z|         0| Negative|
|UCq3ADD4snPlolnmj...|        @phamhan7198|Dạ mn có thể sugg...|2025-05-14T01:18:43Z|         0| Positive|
|UC733AKuN6g4D1gqv...|         @Eleven1986| thuyết phân tâm học|2025-05-12T11:24:59Z|         0| Negative|
|UC-Loeoa_W1YDfoTD...|@QuangPhucNguyen-...|OMG LINH VECTERRR...|2025-05-11T12:19:36Z|         0| Positive|
|UCbpmtC9gf9aiK4Ss...| @phannhuquynhho2002|hay tuyet voi luo...|2025-05-11T05:46:20Z|         0| Positive|
|UCKJ6MNc6-SO91yZe...|    @LanNguyen-mg4jz|😂 ch

25/05/20 07:33:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:33:19 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 5971 milliseconds


-------------------------------------------
Batch: 3
-------------------------------------------
+--------------------+-----------------+----------+--------------------+----------+---------+
|          comment_id|           author|      text|        published_at|like_count|sentiment|
+--------------------+-----------------+----------+--------------------+----------+---------+
|UCipJ2CQGCphfKGZS...|         @MrBobb6|Đinh cao ❤|2025-05-03T12:01:44Z|         0| Negative|
|UC8M_qpqx_qNFt03P...|@collaborator6650|      Nice|2025-05-03T12:01:37Z|         1| Positive|
+--------------------+-----------------+----------+--------------------+----------+---------+



25/05/20 07:34:06 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:11 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 5376 milliseconds


-------------------------------------------
Batch: 4
-------------------------------------------
+--------------------+--------+--------------------+--------------------+----------+---------+
|          comment_id|  author|                text|        published_at|like_count|sentiment|
+--------------------+--------+--------------------+--------------------+----------+---------+
|UCsNlU2qzkuORYd-q...|@VuPK274|Dạo này tâm lý họ...|2025-05-18T18:51:09Z|         0| Positive|
+--------------------+--------+--------------------+--------------------+----------+---------+



25/05/20 07:34:11 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:11 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:11 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:11 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:11 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:16 WARN ProcessingTimeExecutor: Current batch is falling behind. 

-------------------------------------------
Batch: 5
-------------------------------------------
+--------------------+--------------------+--------------------+--------------------+----------+---------+
|          comment_id|              author|                text|        published_at|like_count|sentiment|
+--------------------+--------------------+--------------------+--------------------+----------+---------+
|UCpIanxWluVQyXtkS...|       @AnhĐức-t7i1z|Jorden phản bội v...|2025-05-14T08:00:13Z|         0| Negative|
|UCq3ADD4snPlolnmj...|        @phamhan7198|Dạ mn có thể sugg...|2025-05-14T01:18:43Z|         0| Positive|
|UC733AKuN6g4D1gqv...|         @Eleven1986| thuyết phân tâm học|2025-05-12T11:24:59Z|         0| Negative|
|UC-Loeoa_W1YDfoTD...|@QuangPhucNguyen-...|OMG LINH VECTERRR...|2025-05-11T12:19:36Z|         0| Positive|
|UCbpmtC9gf9aiK4Ss...| @phannhuquynhho2002|hay tuyet voi luo...|2025-05-11T05:46:20Z|         0| Positive|
+--------------------+--------------------+----

25/05/20 07:34:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:16 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:21 WARN ProcessingTimeExecutor: Current batch is falling behind. 

-------------------------------------------
Batch: 6
-------------------------------------------
+--------------------+-----------------+--------------------+--------------------+----------+---------+
|          comment_id|           author|                text|        published_at|like_count|sentiment|
+--------------------+-----------------+--------------------+--------------------+----------+---------+
|UCKJ6MNc6-SO91yZe...| @LanNguyen-mg4jz|😂 chưa đâu bạn n...|2025-05-10T07:28:36Z|         0| Positive|
|UCpjGJpZxoeBV6iY0...|@phamongphong4267|Ông Linh qua đây r à|2025-05-10T05:00:07Z|         0| Negative|
|UC3Celv6HTGjBB5ag...|    @hungphan3583|Hình như nói về i...|2025-05-10T02:25:52Z|         0| Positive|
|UCXpMQjiEYqBP6grR...|         @hoan194|tài trợ cho phim ...|2025-05-09T15:40:22Z|         0| Positive|
|UCeIXZpHG0YnLFRAa...|  @NhanTran-gh9dk|Nhờ Leo mà khối ô...|2025-05-09T11:34:29Z|         1| Positive|
+--------------------+-----------------+--------------------+-----------

25/05/20 07:34:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/05/20 07:34:22 ERROR Utils: Aborting task                        (0 + 1) / 1]

KeyboardInterrupt: 