# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json
from pyspark.sql.types import LongType, StringType, StructType

# Reuse an existing SparkSession or create one, registered under this
# application name.
spark = (
    SparkSession.builder
    .appName("Wordle score streaming")
    .getOrCreate()
)


# Streaming DataFrame fed by the socket source at localhost:9008.
# Downstream code reads the payload from its `value` column.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9008)
    .load()
)

# Schema used to parse each incoming JSON document. Only the fields listed
# here are extracted:
#   id          - long
#   created_at  - string
#   user.id_str - string, nested inside the `user` struct
#   text        - string
# Every field is declared non-nullable (third argument False).
# NOTE(review): from_json may treat parsed fields as nullable regardless of
# this flag -- confirm against the printSchema() output.
schema = (
    StructType()
    .add('id', LongType(), False)
    .add('created_at', StringType(), False)
    .add('user', StructType().add("id_str", StringType(), False), False)
    .add('text', StringType(), False)
)


# Cast the raw `value` column to a string, parse it as JSON against `schema`
# into a single struct column named `tweet_data`, then pull the four fields
# of interest out of that struct.
filtered_data = (
    lines
    .selectExpr('CAST(value AS STRING)')
    .select(from_json('value', schema).alias('tweet_data'))
    .select(
        'tweet_data.id',
        'tweet_data.created_at',
        'tweet_data.user.id_str',
        'tweet_data.text',
    )
)

# Show the column layout of the projected stream.
filtered_data.printSchema()

# Start running the query that prints tweet data to the console.
# Output mode is "append" and the trigger is a 5-second processing-time
# interval.
query = (
    filtered_data.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)

try:
    # Block until the query terminates or the wait itself is interrupted.
    query.awaitTermination()
finally:
    # If the wait is interrupted (e.g. Ctrl-C or a stopped notebook cell) or
    # raises, make sure the streaming query does not keep running in the
    # background.
    query.stop()