In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# 1. T·∫°o SparkSession
spark = SparkSession.builder.appName("SentimentClassification").getOrCreate()

# 2. ƒê·ªçc d·ªØ li·ªáu
data = spark.read.csv("data/sentiments.csv", header=True, inferSchema=True)

# 3. Lo·∫°i b·ªè h√†ng b·ªã NULL ·ªü text ho·∫∑c sentiment (c·∫©n th·∫≠n)
data = data.na.drop(subset=["text", "sentiment"])

# 4. C√°c b∆∞·ªõc x·ª≠ l√Ω
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")

# ‚ö†Ô∏è th√™m handleInvalid ƒë·ªÉ tr√°nh l·ªói
indexer = StringIndexer(inputCol="sentiment", outputCol="label", handleInvalid="keep")

# 5. M√¥ h√¨nh Logistic Regression
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# 6. Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, indexer, lr])

# 7. T√°ch train/test
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# 8. Train
model = pipeline.fit(train_data)

# 9. D·ª± ƒëo√°n
predictions = model.transform(test_data)

# 10. Xem k·∫øt qu·∫£
predictions.select("text", "label", "prediction").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------+
|text                                                                                                                                               |label|prediction|
+---------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------+
|  ISG An update to our Feb 20th video review..if it closes below 495 much lower to come soon                                                       |1.0  |1.0       |
|  The rodeo clown sent BK screaming into the SI weekly red zone...time to peel away before it turns...                                             |1.0  |1.0       |
| , ES,SPY, Ground Hog Week, distribution at highs..                                                                                                |1.0  |0.0       

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# ƒê√°nh gi√°
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator_prec = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
evaluator_rec = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

accuracy = evaluator_acc.evaluate(predictions)
precision = evaluator_prec.evaluate(predictions)
recall = evaluator_rec.evaluate(predictions)
f1 = evaluator_f1.evaluate(predictions)

print("üìä Evaluation Results:")
print(f"  ‚û§ Accuracy  = {accuracy:.4f}")
print(f"  ‚û§ Precision = {precision:.4f}")
print(f"  ‚û§ Recall    = {recall:.4f}")
print(f"  ‚û§ F1 Score  = {f1:.4f}")

print("Confusion Matrix:")
predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

# D·ª´ng Spark
spark.stop()

üìä Evaluation Results:
  ‚û§ Accuracy  = 0.7665
  ‚û§ Precision = 0.7658
  ‚û§ Recall    = 0.7665
  ‚û§ F1 Score  = 0.7660
Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|  570|
|  0.0|       1.0|  122|
|  0.0|       2.0|    1|
|  1.0|       0.0|  136|
|  1.0|       1.0|  280|
+-----+----------+-----+

