In [24]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, length, avg, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Build (or reuse) the Spark session. The Kafka source connector is requested
# through spark.jars.packages.
# NOTE(review): only the Kafka connector is listed here, while the JDBC sink
# further down uses "org.postgresql.Driver" -- confirm that the PostgreSQL
# driver jar reaches the classpath some other way.
spark = (
    SparkSession.builder
    .appName("Kafka Stream Example")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0",
    )
    .getOrCreate()
)


# Schema of the JSON documents received from Kafka. Every field is nullable;
# "timestamp" is declared as a string here.
schema = (
    StructType()
    .add("user", StringType(), True)
    .add("timestamp", StringType(), True)
    .add("content", StringType(), True)
    .add("hashtags", ArrayType(StringType()), True)
    .add("favourites_count", IntegerType(), True)
    .add("reblogs_count", IntegerType(), True)
)

In [25]:
# Kafka settings: the topic to subscribe to and the bootstrap server address.
kafka_topic = "message_kafka"
kafka_bootstrap_servers = "localhost:9092"

# Open a streaming read on the Kafka topic. All source options are passed in a
# single call; "kafka.request.timeout.ms" is forwarded as "120000".
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": kafka_bootstrap_servers,
        "subscribe": kafka_topic,
        "kafka.request.timeout.ms": "120000",
    })
    .load()
)

In [26]:
# Decode the JSON payload: cast the record value to a string, parse it with
# `schema`, then promote the parsed struct's fields to top-level columns.
structured_df = (
    df
    .select(col("value").cast("string").alias("json_value"))
    .select(from_json("json_value", schema).alias("data"))
    .select("data.*")
)

In [27]:
# Transformation 2: group the toots into 1-hour time windows.
# The string "timestamp" column is cast to a real timestamp first so it can
# drive the window; each window then gets a row count and the mean length of
# the "content" column.
# NOTE(review): no watermark is defined, so the aggregation state of every
# window is kept for the lifetime of the query -- consider withWatermark() if
# late data can be bounded.
windowed_df = (
    structured_df
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
    .groupBy(window("timestamp", "1 hour"))
    .agg(
        count("*").alias("toot_count"),
        avg(length("content")).alias("avg_toot_length"),
    )
)

In [28]:
# Action 1: count the toots per time window.
# Action 2: compute the average toot length per time window.

# PostgreSQL connection settings used by the JDBC sink.
postgres_url = "jdbc:postgresql://localhost:5432/mastodon_data"
db_properties = {
    # Credentials are taken from the environment so that real secrets never
    # have to be committed with the notebook. The fallbacks are the original
    # development placeholders, so behaviour is unchanged when the variables
    # are not set.
    "user": os.environ.get("POSTGRES_USER", "user"),
    "password": os.environ.get("POSTGRES_PASSWORD", "password"),
    "driver": "org.postgresql.Driver",
}

In [None]:
# Save the windowed statistics to PostgreSQL.
def write_batch_to_postgres(batch_df, batch_id):
    """Append one micro-batch of window statistics to the toot_statistics table.

    Parameters
    ----------
    batch_df : pyspark.sql.DataFrame
        Rows emitted by the streaming aggregation for this micro-batch, with
        columns ``window`` (struct with ``start`` and ``end``), ``toot_count``
        and ``avg_toot_length``.
    batch_id : int
        Micro-batch identifier supplied by Spark (unused here).
    """
    # The JDBC writer cannot map a struct column to a SQL type, so the
    # `window` struct is flattened into two plain timestamp columns.
    # NOTE(review): assumes toot_statistics has (or will be created with)
    # window_start / window_end columns -- confirm against the table.
    flattened = batch_df.select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("toot_count"),
        col("avg_toot_length"),
    )
    (
        flattened.write
        .format("jdbc")
        .option("url", postgres_url)
        .option("dbtable", "toot_statistics")
        .options(**db_properties)
        .mode("append")
        .save()
    )


# NOTE: in "update" output mode a window is re-emitted each time its aggregate
# changes, and the sink appends. The table therefore receives one row per
# update of a window rather than one row per window; readers should keep the
# most recent row for each window.
query = (
    windowed_df.writeStream
    .outputMode("update")
    .foreachBatch(write_batch_to_postgres)
    .option("checkpointLocation", "/tmp/spark_checkpoints")
    .start()
)


# Block until the streaming query terminates.
query.awaitTermination()