In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, udf, col
from pyspark.sql.types import ArrayType, IntegerType, LongType, StructType, StringType
import sys
# sys.path.append('../src')
from src.tweet_parser import TweetParser


# Create (or reuse) the Spark session used by every step below.
spark = (
    SparkSession.builder
    .appName("Wordle score streaming")
    .getOrCreate()
)

def getResults(text):
    """Run ``TweetParser`` on a tweet's text and return its Wordle result.

    Parameters
    ----------
    text : str or None
        The tweet text. When this function is used as a Spark UDF, ``None``
        is passed for rows whose ``text`` column is null (for example when
        the incoming line could not be parsed as JSON).

    Returns
    -------
    Whatever ``TweetParser(text).wordle_result_exist()`` returns -- presumably
    a JSON string, since it is declared as ``StringType`` and later parsed
    with ``from_json`` (TODO confirm against ``TweetParser``). For a ``None``
    input the string ``"{}"`` is returned, which is the "no result" value the
    pipeline filters out.
    """
    # Guard against null input so one unparseable line cannot raise inside
    # the UDF and take the whole streaming query down with it.
    if text is None:
        return "{}"
    return TweetParser(text).wordle_result_exist()

## UDF declaration
# Expose getResults to Spark as a column function with a string return type.
attempts_fn = udf(getResults, returnType=StringType())

# Streaming DataFrame fed by a TCP socket on localhost:9009; the pipeline
# below reads its `value` column.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9009)
    .load()
)

# Subset of the tweet JSON that the pipeline extracts: the tweet id, its
# creation timestamp (kept as a string), the author's id_str and the text.
schema = (
    StructType()
    .add('id', LongType(), False)
    .add('created_at', StringType(), False)
    .add('user', StructType().add("id_str", StringType(), False), False)
    .add('text', StringType(), False)
)


# Parse every socket line as tweet JSON, flatten the fields of interest,
# attach the UDF output as `results`, and keep only the rows whose result
# is not the empty JSON object "{}".
filtered_data = (
    lines
    .selectExpr('CAST(value AS STRING)')
    .select(from_json('value', schema).alias('tweet_data'))
    .select('tweet_data.id', 'tweet_data.created_at', 'tweet_data.user.id_str', 'tweet_data.text')
    .withColumn('results', attempts_fn(col('text')))
    .filter(col('results') != "{}")
)

# Layout of the JSON string stored in the `results` column: the Wordle id,
# the number of attempts, and the attempts themselves (left as a raw string).
results_schema = (
    StructType()
    .add('wordle_id', StringType(), False)
    .add('attempts_count', IntegerType(), False)
    .add('attempts', StringType(), False)
)


# Replace the raw `results` JSON string with its parsed struct, then promote
# the struct's fields to top-level columns next to the tweet columns.
filtered_data = (
    filtered_data
    .withColumn('results', from_json('results', results_schema))
    .select('id', 'created_at', 'id_str', 'text', 'results.wordle_id', 'results.attempts_count', 'results.attempts')
)

# Show the final column layout before the stream starts.
filtered_data.printSchema()

# Start running the query that prints tweet data to the console.
# New rows are appended to the console sink in micro-batches triggered
# every 5 seconds.
query = (
    filtered_data.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)

# Block until the query ends. Whatever ends the wait (a kernel interrupt,
# a failed query, a gateway error), stop the query explicitly; otherwise it
# keeps running in the background and continues to print batches after the
# cell has already failed.
try:
    query.awaitTermination()
finally:
    query.stop()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/04 17:35:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/04 17:35:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/04 17:35:37 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


root
 |-- id: long (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id_str: string (nullable = true)
 |-- text: string (nullable = true)
 |-- wordle_id: string (nullable = true)
 |-- attempts_count: integer (nullable = true)
 |-- attempts: string (nullable = true)



22/04/04 17:35:40 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/_p/fncz_94x46n3hpt2_kbtc8540000gn/T/temporary-99f771c7-d649-4b4a-88b6-bbf65ef6fae2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/04/04 17:35:41 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+----------+------+----+---------+--------------+--------+
| id|created_at|id_str|text|wordle_id|attempts_count|attempts|
+---+----------+------+----+---------+--------------+--------+
+---+----------+------+----+---------+--------------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+--------------------+---------+--------------------+---------+--------------+--------------------+
|                 id|          created_at|   id_str|                text|wordle_id|attempts_count|            attempts|
+-------------------+--------------------+---------+--------------------+---------+--------------+--------------------+
|1511004571363065865|Mon Apr 04 15:35:...|159946682|😖I agree ⁦@helen...|      289|             6|{"1":["0","0","2"...|
|1511004577247940610|Mon Apr 04 15:35:...| 12909092|Wordle 290 3/6\n\...| 

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=79>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/py4j/clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.

-------------------------------------------
Batch: 83
-------------------------------------------
+-------------------+--------------------+-------------------+--------------------+---------+--------------+--------------------+
|                 id|          created_at|             id_str|                text|wordle_id|attempts_count|            attempts|
+-------------------+--------------------+-------------------+--------------------+---------+--------------+--------------------+
|1511006298573533184|Mon Apr 04 15:42:...|         2883232406|Wordle 289 3/6\n\...|      289|             3|{"1":["0","1","2"...|
|1511006287953534977|Mon Apr 04 15:42:...|          111588782|Wordle 290 6/6\n\...|      290|             6|{"1":["0","0","0"...|
|1511006293674315785|Mon Apr 04 15:42:...| 815648894256349185|Wordle 289 4/6\n\...|      289|             4|{"1":["1","0","0"...|
|1511006299953065984|Mon Apr 04 15:42:...|         1969568455|Wordle 289 5/6\n\...|      289|             5|{"1":["0","1",

22/04/04 17:42:35 WARN TextSocketMicroBatchStream: Stream closed by localhost:9009


Py4JError: An error occurred while calling o84.awaitTermination

-------------------------------------------
Batch: 84
-------------------------------------------
+-------------------+--------------------+--------+--------------------+---------+--------------+--------------------+
|                 id|          created_at|  id_str|                text|wordle_id|attempts_count|            attempts|
+-------------------+--------------------+--------+--------------------+---------+--------------+--------------------+
|1511006301404516359|Mon Apr 04 15:42:...|74783302|Wordle 289 4/6\n\...|      289|             4|{"1":["0","0","0"...|
|1511006302746537989|Mon Apr 04 15:42:...|65588911|Wordle 289 5/6\n\...|      289|             5|{"1":["0","1","0"...|
+-------------------+--------------------+--------+--------------------+---------+--------------+--------------------+

