In [1]:
import os
def ensure_hadoop_env(hadoop_home=r"C:\hadoop"):
    """Export HADOOP_HOME and make sure its ``bin`` folder is on PATH.

    Safe to call repeatedly: the ``bin`` entry is appended only when it is not
    already one of the ``;``-separated PATH entries, so re-running the cell
    does not keep growing PATH. A missing PATH variable is treated as empty.

    Args:
        hadoop_home: Root directory of the Hadoop installation.
    """
    hadoop_bin = hadoop_home + r"\bin"
    os.environ['HADOOP_HOME'] = hadoop_home
    current_path = os.environ.get('PATH', '')
    if hadoop_bin not in current_path.split(";"):
        os.environ['PATH'] = current_path + ";" + hadoop_bin


ensure_hadoop_env()

In [2]:
import findspark
# Raw string: "\s" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on recent Python versions.
# Must run before `import pyspark` so the Spark install is on sys.path.
findspark.init(r"C:\spark-3.5.5-bin-hadoop3")
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizerModel
from pyspark.ml.clustering import LDAModel

# Maven coordinates for the Kafka source; the connector coordinate is
# parameterised by the Scala and Spark versions below.
scala_version = "2.12"
spark_version = "3.5.5"
packages = [
    "org.apache.spark:spark-sql-kafka-0-10_" + scala_version + ":" + spark_version,
    "org.apache.kafka:kafka-clients:3.6.0",
]

# Single local session for the whole notebook. The local-filesystem overrides
# (spark.hadoop.fs.file.impl / spark.hadoop.fs.AbstractFileSystem.file.impl)
# are left disabled.
spark = (
    SparkSession.builder
    .appName("RedditTopicModeling")
    .master("local")
    .config("spark.jars.packages", ",".join(packages))
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
    .config("spark.hadoop.io.nativeio.enabled", "false")
    .config("spark.sql.broadcastTimeout", "600")
    .getOrCreate()
)
    

In [3]:
from pyspark.sql.functions import from_json, col, concat_ws, from_unixtime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, TimestampType
import os

# Fields extracted from each JSON message read from Kafka.
schema = StructType([
    StructField("id", StringType()),
    StructField("title", StringType()),
    StructField("selftext", StringType()),
    StructField("created_utc", LongType()),
])

# Streaming source: subscribe to the `reddit_posts` topic on the local broker.
raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "reddit_posts")
    .load()
)

# Stage 1: decode the Kafka value as a string and parse it with `schema`.
_parsed = (
    raw.selectExpr("CAST(value AS STRING) AS json")
    .select(from_json(col("json"), schema).alias("data"))
)

# Stage 2: flatten the parsed struct into top-level columns.
_flat = _parsed.select("data.id", "data.title", "data.selftext", "data.created_utc")

# Stage 3: derive `text` (title and body joined by a space) and a
# timestamp column built from `created_utc`.
_text_col = concat_ws(" ", col("title"), col("selftext"))
_timestamp_col = from_unixtime(col("created_utc")).cast(TimestampType())
posts = (
    _flat
    .withColumn("text", _text_col)
    .withColumn("timestamp", _timestamp_col)
)

In [4]:
from bertopic import BERTopic
# Load the saved BERTopic model and build a lookup from each value in the
# "Topic" column of its topic-info table to the matching "Name" value.
topic_model = BERTopic.load("modeling/bertopic")
info = topic_model.get_topic_info()
label_map = {
    topic_id: topic_name
    for topic_id, topic_name in zip(info["Topic"], info["Name"])
}

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pyspark.sql.streaming import StreamingQuery
import pandas as pd
import numpy as np
from pyspark.sql.functions import (
    from_json, col, concat_ws, from_unixtime, pandas_udf
)
from pyspark.sql.functions import split, regexp_replace, col
from pyspark.sql.types import StringType
import os

MODEL_DIR = "./modeling/bertopic"
_betopic = None  # filled in by _get_model() on first call


def _get_model():
    """Return the BERTopic model, loading it from MODEL_DIR only once.

    The loaded model is cached in the module-level `_betopic` so later calls
    reuse it instead of reloading.
    """
    global _betopic
    if _betopic is not None:
        return _betopic
    _betopic = BERTopic.load(MODEL_DIR)
    return _betopic

@pandas_udf(StringType())
def predict_label(texts: pd.Series) -> pd.Series:
    """Assign a topic label to every text in the batch.

    Runs the cached BERTopic model over `texts`, looks each predicted topic id
    up in `label_map`, and replaces underscores in the label with spaces
    (e.g. "1_top_words_here" becomes "1 top words here").

    Args:
        texts: One batch of post texts.

    Returns:
        A Series of the same length holding the label for each text, or None
        where the model returned no topic or the topic id has no entry in
        `label_map`.
    """
    topics, _ = _get_model().transform(texts.tolist())
    labels = []
    for topic_id in topics:
        name = None if topic_id is None else label_map.get(int(topic_id))
        # A topic id missing from label_map must yield a null label rather
        # than crash the whole batch with AttributeError on None.replace.
        labels.append(None if name is None else name.replace("_", " "))
    return pd.Series(labels, dtype=object)

# Add the predicted topic label next to every post.
_topic_col = predict_label(col("text"))
annotated = posts.withColumn("topic", _topic_col)

# Output locations: a single CSV that accumulates results, and the
# checkpoint directory passed to the streaming query.
csv_path = "output/all_results.csv"
checkpoint_path = "output/checkpoints/bertopic_fbb"

# Start from a clean CSV: make sure the folder exists, then drop any
# results file left over from an earlier run.
_csv_dir = os.path.dirname(csv_path)
os.makedirs(_csv_dir, exist_ok=True)
_stale_csv_present = os.path.exists(csv_path)
if _stale_csv_present:
    os.remove(csv_path)
    
def save_batch(df, epoch_id, path=None):
    """Append one micro-batch's `text` and `topic` columns to the results CSV.

    The header row is written only when the target file does not exist yet or
    is empty. Basing this on the file rather than on `epoch_id == 0` keeps the
    header when batch 0 is empty, or when the query resumes from an existing
    checkpoint and therefore starts at a later epoch.

    NOTE(review): a recorded run failed inside `to_csv` with PermissionError
    on this path; presumably another process had the file open — confirm.

    Args:
        df: The micro-batch DataFrame handed over by `foreachBatch`.
        epoch_id: Batch id supplied by `foreachBatch` (kept for the callback
            signature; not used to decide on the header).
        path: Optional target CSV path; defaults to the module-level
            `csv_path`.
    """
    target = csv_path if path is None else path
    pdf = df.select("text", "topic").toPandas()
    if pdf.empty:
        return
    write_header = (not os.path.exists(target)) or os.path.getsize(target) == 0
    pdf.to_csv(target, mode="a", index=False, header=write_header)

if posts.isStreaming:
    print("We are streaming!")

    # Only the columns that save_batch writes out.
    cleaned = annotated.select(col("text"), col("topic"))

    query = (
        cleaned.writeStream
        .foreachBatch(save_batch)                       # append each micro-batch to the CSV
        .option("checkpointLocation", checkpoint_path)  # streaming progress is stored here
        .trigger(processingTime="5 seconds")
        .start()
    )

    # Block until the query finishes. Always stop it on the way out so that an
    # interrupted or failed cell does not leave a query running in the
    # background that still uses the same CSV and checkpoint when the cell is
    # run again.
    try:
        query.awaitTermination()
    finally:
        query.stop()

We are streaming!


StreamingQueryException: [STREAM_FAILED] Query [id = 6e4d6a76-aac1-4e2d-99ef-1d56e4b7a868, runId = d6359008-c770-4477-96ee-05e0e3fa62ce] terminated with exception: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "C:\spark-3.5.5-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "C:\spark-3.5.5-bin-hadoop3\python\pyspark\sql\utils.py", line 120, in call
    raise e
  File "C:\spark-3.5.5-bin-hadoop3\python\pyspark\sql\utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "C:\Users\tuanq\AppData\Local\Temp\ipykernel_25832\2239294911.py", line 48, in save_batch
    pdf.to_csv(csv_path, mode="a", index=False, header=(epoch_id==0))
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py", line 333, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 3986, in to_csv
    return DataFrameRenderer(formatter).to_csv(
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\formats\format.py", line 1014, in to_csv
    csv_formatter.save()
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\formats\csvs.py", line 251, in save
    with get_handle(
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\common.py", line 873, in get_handle
    handle = open(
PermissionError: [Errno 13] Permission denied: 'output/all_results.csv'
