In [1]:
import os
os.environ['HADOOP_HOME'] = r"C:\hadoop"
os.environ['PATH'] += r";C:\hadoop\bin"

In [2]:
import findspark
findspark.init("C:\spark-3.5.5-bin-hadoop3")
import pyspark
from pyspark.sql import SparkSession

scala_version = "2.12"
spark_version = "3.5.5"
packages = [
    f"org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}",
    "org.apache.kafka:kafka-clients:3.6.0"
]

# spark = SparkSession.builder \
#     .master("local") \
#     .appName("kafka-example") \ 
#     .config("spark.jars.packages", ",".join(packages)) \
#     .getOrCreate()
    
spark = SparkSession.builder \
    .appName("RedditTopicModeling") \
    .master("local") \
    .config("spark.jars.packages", ",".join(packages)) \
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true") \
    .config("spark.hadoop.io.nativeio.enabled", "false") \
    .config("spark.sql.broadcastTimeout", "600") \
    .getOrCreate()
    
hadoop_version = spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()
print("Hadoop version:", hadoop_version)    

Hadoop version: 3.3.4


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

spark = SparkSession.builder.master("local[*]").appName("LDAExample").getOrCreate()

# 1. Đọc CSV
df = spark.read.csv("modeling/data/reddit_posts_clean.csv", header=True, inferSchema=True)

# 2. Lọc bỏ dòng chưa có clean_text và ép kiểu
df = (
    df.filter(col("clean_text").isNotNull())
      .withColumn("clean_text", col("clean_text").cast("string"))
)

# 3. Tokenize
tokenizer = RegexTokenizer(
    inputCol="clean_text", 
    outputCol="tokens", 
    pattern="\\W+", 
    minTokenLength=2
)
df_tok = tokenizer.transform(df)

# 4. Remove stopwords
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
df_fil = remover.transform(df_tok)

# 5. Loại bỏ documents không còn token nào
df_ok = df_fil.filter(col("filtered").isNotNull()) \
              .filter(size(col("filtered")) > 0)

print("Documents ready for vectorization:", df_ok.count())

# 6. Vectorize
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=5000,
    minDF=5      # hoặc giảm xuống nếu còn ít docs
)
model_cv = cv.fit(df_ok)
df_featurized = model_cv.transform(df_ok)

df_featurized.select("features").show(5, truncate=False)


  import scipy.sparse


Documents ready for vectorization: 12063
+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|(5000,[26,346,639,649,1774,3251],[1.0,1.0,1.0,1.0,1.0,1.0])                                                                                          |
|(5000,[23,38,78,338,1627,1746,1862,2078,2214,2486],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                                        |
|(5000,[3,23,50,72,168,234,373,804,927,1163,1183,1586,1733,2067,2907,3588,4326],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0])|
|(5000,[23,287,315,341,1339,1348,1812,2653,2657

In [4]:
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline, PipelineModel

lda = LDA(k=15,        # tương đương num_topics=10
          maxIter=15,
          featuresCol="features")
ldaModel = lda.fit(df_featurized)

topics = ldaModel.describeTopics(5)  # top 5 terms
vocab  = model_cv.vocabulary
def termsUDF(termIndices):
    return [vocab[i] for i in termIndices]
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

udfTerms = udf(termsUDF, ArrayType(StringType()))
topics.select(
    "topic", 
    udfTerms("termIndices").alias("keywords"), 
    "termWeights"
).show(truncate=False)

pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])
pipelineModel = pipeline.fit(df)          # df_ok: DataFrame tĩnh đã clean/tokenize

cv_model  = pipelineModel.stages[2]
lda_model = pipelineModel.stages[3]

vocab = cv_model.vocabulary
topic_terms = {
    row.topic: [vocab[i] for i in row.termIndices]
    for row in lda_model.describeTopics(5).collect()
}

+-----+------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
|topic|keywords                                        |termWeights                                                                                                        |
+-----+------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
|0    |[games, died, weeks, left, told]                |[0.00221603755765598, 0.0020612517617784125, 0.002055028273079264, 0.001655495702353431, 0.001507592896189887]     |
|1    |[court, supreme, trump, citizenship, birthright]|[0.009282702062019522, 0.00831119293345803, 0.0060545753613106025, 0.004093753677551748, 0.0036899866467082093]    |
|2    |[homemade, trump, anime, ate, chicken]          |[0.013647862521525467, 0.011382129513244313, 0.005096628168955756, 0.0042524484

In [5]:
from pyspark.sql.functions import from_json, col, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

schema = StructType() \
    .add("id", StringType()) \
    .add("title", StringType()) \
    .add("selftext", StringType()) \
    .add("created_utc", LongType())

raw = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "reddit_posts") \
    .load()

posts = raw.selectExpr("CAST(value AS STRING) as json") \
    .select(from_json(col("json"), schema).alias("data")) \
    .select("data.id", "data.title", "data.selftext")
    
posts = posts.withColumn("clean_text", concat_ws(" ", col("title"), col("selftext")))

In [8]:
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import StringType, IntegerType
import pandas as pd
import numpy as np
import re

# 2.1 Hàm tokenize + remove stopwords thuần
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
token_pattern = re.compile(r"\b[a-zA-Z]{2,}\b")

def preprocess_text(text):
    # lowercase, tách token, loại stopword
    tokens = token_pattern.findall(text.lower())
    return [t for t in tokens if t not in stop_words]

# 2.2 Load dictionary và LDA thuần (gensim) — hoặc bạn có thể build BOW theo vocab Spark
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# Nếu bạn đã lưu gensim model:
dictionary = Dictionary.load("modeling/reddit.dict")
lda = LdaModel.load("modeling/reddit_lda.model")

# 2.3 pandas_udf để infer topic và keywords
@pandas_udf("topic int, keywords string", functionType="pandas_udf")
def infer_topics(texts: pd.Series) -> pd.DataFrame:
    topics, keywords = [], []
    for doc in texts:
        toks = preprocess_text(doc)
        bow = dictionary.doc2bow(toks)
        # lấy topic mạnh nhất
        topic_probs = lda.get_document_topics(bow, minimum_probability=0.0)
        top_t, _ = max(topic_probs, key=lambda x: x[1])
        topics.append(int(top_t))
        # lấy top 5 terms
        terms = lda.show_topic(top_t, topn=5)
        keywords.append(", ".join([w for w,_ in terms]))
    return pd.DataFrame({
        "topic":   topics,
        "keywords": keywords
    })

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tuanq\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [16]:
from pyspark.sql.streaming import StreamingQuery

if posts_pred.isStreaming:
    print("We are streaming!")
    # Creating a DataSreamWriter and StreamingQuery
    # ===
    # Calling .writeStream on a DataFrame returns an instance of DataStreamWriter
    query = (
        posts_pred.writeStream
        # DataStream queries need to be named
        .queryName("posts")
        .format("memory")
        .outputMode("append")
        .trigger(processingTime="5 seconds")
        .option("checkpointLocation", r"C:\tmp\spark-checkpoint\posts_append_v11")
        .start()
    )
    
    import time
    while query.isActive:
        time.sleep(5)  # phải dài hơn trigger interval
        print("=== Latest batch snapshot ===")
        spark.table("posts").show(truncate=False)
    
    # Chờ query hoàn thành
    query.awaitTermination()

We are streaming!
=== Latest batch snapshot ===
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+
|id |title|selftext|clean_text|tokens|filtered|features|topicDistribution|predictedTopic|
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+

=== Latest batch snapshot ===
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+
|id |title|selftext|clean_text|tokens|filtered|features|topicDistribution|predictedTopic|
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+
+---+-----+--------+----------+------+--------+--------+-----------------+--------------+



StreamingQueryException: [STREAM_FAILED] Query [id = cad01fea-b58b-4d8f-a9ba-f79003fdc22b, runId = def499ea-efbe-4fc1-ba43-21be81a43d4f] terminated with exception: Job aborted due to stage failure: Task 0 in stage 85.0 failed 1 times, most recent failure: Lost task 0.0 in stage 85.0 (TID 82) (DESKTOP-5CKD8IE executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\tuanq\AppData\Local\Temp\ipykernel_14388\1313041508.py", line 16, in top_topic
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py", line 10468, in map
    return self.apply(infer).__finalize__(self, "map")
           ^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py", line 10374, in apply
    return op.apply().__finalize__(self, method="apply")
           ^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py", line 916, in apply
    return self.apply_standard()
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py", line 1063, in apply_standard
    results, res_index = self.apply_series_generator()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py", line 1081, in apply_series_generator
    results[i] = self.func(v, *self.args, **self.kwargs)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py", line 10466, in infer
    return x._map_values(func, na_action=na_action)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\base.py", line 921, in _map_values
    return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tuanq\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\algorithms.py", line 1743, in map_array
    return lib.map_infer(values, mapper, convert=convert)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "lib.pyx", line 2972, in pandas._libs.lib.map_infer
  File "C:\Users\tuanq\AppData\Local\Temp\ipykernel_14388\1313041508.py", line 16, in <lambda>
AttributeError: 'int' object has no attribute 'toArray'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:118)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$5(WriteToDataSourceV2Exec.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:491)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace: