In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, udf, col
from pyspark.sql.types import ArrayType, IntegerType, LongType, StructType, StringType
from tweet_parser import TweetParser

# Obtain the Spark session for this job, creating one only if none exists yet.
spark = (
    SparkSession.builder
    .appName("Wordle score streaming")
    .getOrCreate()
)

def getResults(text):
    """Run the tweet parser's Wordle-result check on a single tweet body.

    Args:
        text: The tweet text. May be ``None`` when the incoming line could
            not be parsed as JSON, because ``from_json`` yields null fields
            for unparseable input.

    Returns:
        Whatever ``TweetParser(text).wordle_result_exist()`` returns, or
        ``None`` when ``text`` is ``None``.

    NOTE(review): the exact return type of ``wordle_result_exist`` is not
    visible from this file; the UDF wrapping this function declares a string
    result -- confirm against ``tweet_parser``.
    """
    # A null tweet body has nothing to parse. Returning None makes the
    # derived column null instead of handing None to TweetParser, so one
    # malformed input line does not reach the parser at all.
    if text is None:
        return None
    return TweetParser(text).wordle_result_exist()

# UDF declaration: exposes getResults as a column function whose declared
# Spark return type is string.
attempts_fn = udf(
    f=lambda tweet_text: getResults(tweet_text),
    returnType=StringType(),
)

# Streaming source: text read from a TCP socket on localhost:9008.
# The downstream pipeline reads the incoming data from the `value` column.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9008)
    .load()
)

# Schema applied when parsing each incoming tweet JSON document: the tweet id,
# its creation timestamp (kept as a string), the author's id nested under
# `user`, and the tweet text. Every field is declared non-nullable.
schema = (
    StructType()
    .add('id', LongType(), nullable=False)
    .add('created_at', StringType(), nullable=False)
    .add(
        'user',
        StructType().add("id_str", StringType(), nullable=False),
        nullable=False,
    )
    .add('text', StringType(), nullable=False)
)


# Pipeline: cast each socket line to a string, parse it as tweet JSON using
# `schema`, flatten the fields of interest into top-level columns, attach the
# UDF output as `results`, and keep only rows whose `results` is not the
# string "false".
filtered_data = (
    lines
    .selectExpr('CAST(value AS STRING)')
    .select(from_json('value', schema).alias('tweet_data'))
    .select(
        'tweet_data.id',
        'tweet_data.created_at',
        'tweet_data.user.id_str',
        'tweet_data.text',
    )
    .withColumn('results', attempts_fn(col('text')))
    .where(col('results') != "false")
)


# Print the column layout of the filtered stream as a quick sanity check.
filtered_data.printSchema()

# Launch the streaming query: rows are written to the console sink in append
# mode, with a processing-time trigger of 5 seconds.
query = (
    filtered_data.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)

# Block here until the streaming query terminates.
query.awaitTermination()