# In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create (or reuse) the Spark session for this job. The PostgreSQL JDBC driver
# jar is registered through "spark.jars" so the JDBC reads/writes can use it.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .config("spark.jars", "/usr/local/spark/jars/postgresql-42.6.0.jar")
    .getOrCreate()
)

# Read the data: load the "mastodon_stream" table over JDBC from the
# PostgreSQL database "5SPAR" at db:5432.
# NOTE(review): the database credentials are hard-coded in this file; consider
# supplying them through configuration instead.
df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://db:5432/5SPAR",
        dbtable="mastodon_stream",
        user="myuser",
        password="mysecretpassword",
    )
    .load()
)

# Option 1: basic keyword sentiment. A row is labelled "positive" when its
# content contains any word from positive_words, "negative" when it contains
# any word from negative_words, and "neutral" otherwise.
from pyspark.sql.functions import *

positive_words = ["love", "great", "good", "happy", "excellent", "amazing"]
negative_words = ["hate", "bad", "terrible", "sad", "awful", "horrible"]

# Lower-case the text and split on runs of non-word characters, so that
# punctuation, markup or repeated whitespace next to a keyword
# ("great!", "love,") does not prevent a match.
content_tokens = split(lower(col("content")), r"\W+")

# Turn each word list into a literal array column so that every listed word
# takes part in the match, not just a hand-picked subset.
positive_lexicon = array(*[lit(word) for word in positive_words])
negative_lexicon = array(*[lit(word) for word in negative_words])

df_sentiment = df.withColumn(
    "sentiment",
    # The positive test comes first, so content holding both positive and
    # negative words is labelled "positive".
    when(arrays_overlap(content_tokens, positive_lexicon), "positive")
    .when(arrays_overlap(content_tokens, negative_lexicon), "negative")
    .otherwise("neutral"),
)

# Save the labelled rows into a new table, "mastodon_sentiment", in the same
# PostgreSQL database. The "overwrite" save mode replaces existing contents.
# NOTE(review): the database credentials are hard-coded in this file; consider
# supplying them through configuration instead.
(
    df_sentiment.write
    .format("jdbc")
    .options(
        url="jdbc:postgresql://db:5432/5SPAR",
        dbtable="mastodon_sentiment",
        user="myuser",
        password="mysecretpassword",
    )
    .mode("overwrite")
    .save()
)