In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, from_json, to_date, col, to_utc_timestamp, explode, split
from pyspark.sql.types import LongType, StructType, StringType
from datetime import datetime
import os
import pytz
import sys
import yaml
sys.path.append('../')

from tweet_parser import TweetParser

spark = SparkSession \
    .builder \
    .appName('Wordle score streaming') \
    .getOrCreate()

lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9008) \
    .load()

## Converting date string format
def getDate(x):
    if x is not None:
        return str(datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

def getResults(text):
    return TweetParser(text).wordle_result_exist()

## UDF declaration
date_fn = udf(getDate, StringType())
attempts_fn = udf(lambda  x: getResults(x), StringType())

schema = StructType(). \
    add('id', LongType(), False). \
    add('created_at', StringType(), False) .\
    add('user', StructType().add("id_str",StringType(), False), False). \
    add('text', StringType(), False)

filtered_data = lines \
    .selectExpr('CAST(value AS STRING)') \
    .select(from_json('value', schema).alias('tweet_data')) \
    .selectExpr('tweet_data.id', 'tweet_data.created_at', 'tweet_data.user.id_str AS user_id', 'tweet_data.text AS message') \
    .withColumn("created_at", to_utc_timestamp(date_fn("created_at"),"UTC")) \
    .withColumn('results', attempts_fn(col('message')))

filtered_data = filtered_data.filter(col('results') != "false")
filtered_data.printSchema()

    
def postgres_sink(df, batch_id):
    config = ""
    with open('secrets.yml', 'r') as file:
        config = yaml.safe_load(file)
        
    dbname = config['dbname']
    dbtable = 'tweets'
    dbuser = config['dbuser']
    dbpass = config['dbpass']
    dbhost = config['dbhost']
    dbport = config['dbport']

    url = "jdbc:postgresql://"+dbhost+":"+str(dbport)+"/"+dbname
    properties = {
        "driver": "org.postgresql.Driver",
        "user": dbuser,
        "password": dbpass,
        "stringtype":"unspecified"
    }
    df.write.jdbc(url=url, 
      table=dbtable, 
      mode="append",
      properties=properties)
    
# Write to Postgres
query = filtered_data \
    .writeStream \
    .trigger(processingTime='5 seconds') \
    .outputMode("append") \
    .foreachBatch(postgres_sink) \
    .start()


    
query.awaitTermination()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/01 17:37:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/01 17:37:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/01 17:37:11 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


root
 |-- id: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- message: string (nullable = true)
 |-- results: string (nullable = true)



22/03/01 17:37:14 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/_p/fncz_94x46n3hpt2_kbtc8540000gn/T/temporary-244d0298-d6b1-42dc-9bcd-159e72d0b648. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/03/01 17:37:14 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/03/01 17:37:16 ERROR MicroBatchExecution: Query [id = 07b0a771-c2c0-456d-b10d-67bf05a67470, runId = ed59bfe7-8b2c-46a3-9228-a7cea01cac69] terminated with error
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/py4j/clientserver.py", line 581, in _call_proxy
    re

StreamingQueryException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/py4j/clientserver.py", line 581, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/pyspark/sql/utils.py", line 196, in call
    raise e
  File "/Users/tcufer/delo/wordle_pulse/wordle_pulse_venv/lib/python3.8/site-packages/pyspark/sql/utils.py", line 193, in call
    self.func(DataFrame(jdf, self.sql_ctx), batch_id)
  File "/var/folders/_p/fncz_94x46n3hpt2_kbtc8540000gn/T/ipykernel_64516/315727131.py", line 58, in postgres_sink
    with open('secrets.yml', 'r') as file:
FileNotFoundError: [Errno 2] No such file or directory: 'secrets.yml'

=== Streaming Query ===
Identifier: [id = 07b0a771-c2c0-456d-b10d-67bf05a67470, runId = ed59bfe7-8b2c-46a3-9228-a7cea01cac69]
Current Committed Offsets: {}
Current Available Offsets: {TextSocketV2[host: localhost, port: 9008]: -1}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Filter NOT (results#23 = false)
+- Project [id#8L, created_at#17, user_id#6, message#7, <lambda>(message#7) AS results#23]
   +- Project [id#8L, to_utc_timestamp(cast(getDate(created_at#9) as timestamp), UTC) AS created_at#17, user_id#6, message#7]
      +- Project [tweet_data#4.id AS id#8L, tweet_data#4.created_at AS created_at#9, tweet_data#4.user.id_str AS user_id#6, tweet_data#4.text AS message#7]
         +- Project [from_json(StructField(id,LongType,false), StructField(created_at,StringType,false), StructField(user,StructType(StructField(id_str,StringType,false)),false), StructField(text,StringType,false), value#2, Some(Europe/Ljubljana)) AS tweet_data#4]
            +- Project [cast(value#0 as string) AS value#2]
               +- StreamingDataSourceV2Relation [value#0], org.apache.spark.sql.execution.streaming.sources.TextSocketTable$$anon$1@11ba245f, TextSocketV2[host: localhost, port: 9008]
