# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, from_json, to_date, col, to_utc_timestamp, explode, split
from pyspark.sql.types import LongType, StructType, StringType
from datetime import datetime
# from tweet_parser import TweetParser
import yaml
import configparser
import os
import pytz
import re, json

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, from_json, to_date, col, to_utc_timestamp, explode, split, current_timestamp
from pyspark.sql.types import LongType, StructType, StringType
from datetime import datetime
# from tweet_parser import TweetParser
import yaml
import configparser
import os
import pytz
import json, re
from time import gmtime, strftime

# Regex fragments describing a shared Wordle result: a header line such as
# "Wordle 250 3/6", a blank line, then one row of five squares per attempt.
PUZZLE_ID = r"\d{1,3}"
ATTEMPTS  = r"(?:[1-6]\/6|X\/6)"
HEADER    = rf"Wordle\s{PUZZLE_ID}\s{ATTEMPTS}"

WHITE     = r"⬜"
DARK      = r"⬛"
EMPTY     = rf"({WHITE}|{DARK})"

YELLOW    = r"🟨"
GREEN     = r"🟩"
SQUARE    = rf"({EMPTY}|{YELLOW}|{GREEN})"
ROW       = rf"{SQUARE}{SQUARE}{SQUARE}{SQUARE}{SQUARE}"

class TweetParser():
    """Extract a Wordle score grid from the text of a tweet.

    The parser is constructed with the tweet text as its first positional
    argument and exposes :meth:`wordle_result_exist` as its entry point.
    """

    def __init__(self, *args):
        # Only the first positional argument (the tweet text) is used.
        self.text = args[0]

    def _number_of_attempts(self, text):
        """Return the number of grid rows announced by a matched header.

        ``text`` is a header such as ``"Wordle 250 3/6"``. A failed game
        (``X/6``) still shows six rows, so ``X`` maps to 6.
        """
        m = re.findall(r"(.)/6", text)[0]
        return 6 if m == 'X' else int(m)

    def _parse_attempts(self, grid):
        """Turn an emoji grid into a nested list of digit strings.

        Empty squares become ``'0'``, yellow ``'1'`` and green ``'2'``, e.g.
        ``[['0', '1', '0', '0', '0'], ['0', '0', '2', '0', '0'], ...]``.
        """
        for pattern, digit in ((EMPTY, '0'), (YELLOW, '1'), (GREEN, '2')):
            grid = re.sub(pattern, digit, grid)

        return [list(row) for row in grid.split()]

    def _to_json(self, result):
        """Serialise the rows as a JSON object keyed by 1-based attempt number."""
        return json.dumps({i: row for i, row in enumerate(result, start=1)})

    def wordle_result_exist(self):
        """Return the tweet's Wordle grid as a JSON string, or ``False``.

        A result is recognised when a header is followed by a blank line and
        then by as many rows of squares as the header announces. Each header
        in the text is tried in order and the first one that is followed by
        its own grid wins.

        Returns:
            A JSON string such as ``'{"1": ["0", "1", "0", "0", "0"], ...}'``
            when a result is found, otherwise the boolean ``False`` (also for
            a missing/non-string text, instead of raising ``TypeError``).
        """
        # A null tweet text cannot contain a result.
        if not isinstance(self.text, str):
            return False

        for header in re.finditer(HEADER, self.text):
            attempts = self._number_of_attempts(header[0])
            grid = r"\n".join([ROW] * attempts)
            # Anchor the grid directly after this header so the rows that are
            # parsed are the ones belonging to it, not a stray grid elsewhere.
            anchored = re.compile(rf"\n\n(?P<grid>{grid})")
            match = anchored.match(self.text, header.end())
            if match is not None:
                return self._to_json(self._parse_attempts(match.group("grid")))

        return False


# Obtain (or reuse) the Spark session for this streaming job.
spark = (
    SparkSession.builder
    .appName("Wordle score streaming")
    .getOrCreate()
)

# Project settings are read from a YAML file one directory up.
config = ""
with open('../secrets.yml', 'r') as file:
    config = yaml.safe_load(file)

# AWS keys are read from the current user's credentials file.
config_path = os.path.join(os.path.expanduser('~'), '.aws/credentials')
s3_conf = configparser.ConfigParser()
s3_conf.read(config_path)

def getResults(text):
    """Run the Wordle parser on ``text`` and return whatever it reports."""
    parser = TweetParser(text)
    return parser.wordle_result_exist()

## Converting date string format
def getDate(x):
    """Reformat a ``created_at`` string as ``YYYY-MM-DD HH:MM:SS``.

    ``x`` is expected in the form ``'Wed Feb 09 12:34:56 +0000 2022'`` (the
    ``+0000`` offset is matched literally). ``None`` is passed through.
    """
    if x is None:
        return None

    parsed = datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y')
    return str(parsed)

def getSystemTimeZone():
    """Return the ``%z`` offset string formatted from ``gmtime()``.

    NOTE(review): the struct comes from ``gmtime()``, so this likely yields
    the UTC offset rather than the host's local offset; confirm intent.
    """
    utc_struct = gmtime()
    return strftime("%z", utc_struct)

## UDF declaration
# Both UDFs are declared with a string return type.
date_fn = udf(getDate, StringType())
attempts_fn = udf(lambda  x: getResults(x), StringType())

# Streaming source: the Kinesis stream, read from its latest position.
lines = (
    spark.readStream
    .format("kinesis")
    .option("streamName", "twitter_wordle_stream")
    .option("endpointUrl", "https://kinesis.eu-central-1.amazonaws.com")
    .option("awsAccessKeyId", s3_conf.get('development', 'aws_access_key_id'))
    .option("awsSecretKey", s3_conf.get('development', 'aws_secret_access_key'))
    .option("startingposition", "latest")
    .load()
)

# Only the tweet fields that are used below are declared in the schema.
schema = (
    StructType()
    .add('id', LongType(), False)
    .add('created_at', StringType(), False)
    .add('user', StructType().add("id_str", StringType(), False), False)
    .add('text', StringType(), False)
)

# Decode the JSON payload, flatten it, add timestamps and the parsed grid,
# then drop rows whose parser result is the string "false"
# (presumably how the parser's boolean False surfaces in a string column; confirm).
filtered_data = (
    lines
    .selectExpr('CAST(data AS STRING)')
    .select(from_json('data', schema).alias('tweet_data'))
    .selectExpr(
        'tweet_data.id',
        'tweet_data.created_at',
        'tweet_data.user.id_str AS user_id',
        'tweet_data.text AS message',
    )
    .withColumn("created_at", to_utc_timestamp(date_fn("created_at"), "UTC"))
    .withColumn('processed_at', to_utc_timestamp(current_timestamp(), getSystemTimeZone()))
    .withColumn('results', attempts_fn(col('message')))
    .filter(col('results') != "false")
)


filtered_data.printSchema()

# Start the query that prints tweet data to the console every 5 seconds.
query = (
    filtered_data.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)

# Block until the streaming query stops.
query.awaitTermination()


# Obtain (or reuse) the Spark session for this streaming job.
spark = (
    SparkSession.builder
    .appName("Wordle score streaming")
    .getOrCreate()
)

# Project settings (including the stream name used below) come from YAML.
config = ""
with open('../secrets.yml', 'r') as file:
    config = yaml.safe_load(file)

# AWS keys are read from the current user's credentials file.
config_path = os.path.join(os.path.expanduser('~'), '.aws/credentials')
s3_conf = configparser.ConfigParser()
s3_conf.read(config_path)

def getResults(text):
    """Run the Wordle parser on ``text`` and return whatever it reports."""
    parser = TweetParser(text)
    return parser.wordle_result_exist()

## Converting date string format
def getDate(x):
    """Reformat a ``created_at`` string as ``YYYY-MM-DD HH:MM:SS``.

    ``x`` is expected in the form ``'Wed Feb 09 12:34:56 +0000 2022'`` (the
    ``+0000`` offset is matched literally). ``None`` is passed through.

    The output format contains no offset directive, so attaching a UTC
    tzinfo before formatting had no effect on the result; it is omitted,
    which also keeps this UDF free of third-party imports.
    """
    if x is None:
        return None

    parsed = datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.strftime("%Y-%m-%d %H:%M:%S")

## UDF declaration
# Both UDFs are declared with a string return type.
date_fn = udf(getDate, StringType())
attempts_fn = udf(lambda  x: getResults(x), StringType())

# Streaming source: the Kinesis stream named in the loaded config,
# read from its latest position.
lines = (
    spark.readStream
    .format("kinesis")
    .option("streamName", config['streamName'])
    .option("endpointUrl", "https://kinesis.eu-central-1.amazonaws.com")
    .option("awsAccessKeyId", s3_conf.get('development', 'aws_access_key_id'))
    .option("awsSecretKey", s3_conf.get('development', 'aws_secret_access_key'))
    .option("startingposition", "latest")
    .load()
)

# Only the tweet fields that are used below are declared in the schema.
schema = (
    StructType()
    .add('id', LongType(), False)
    .add('created_at', StringType(), False)
    .add('user', StructType().add("id_str", StringType(), False), False)
    .add('text', StringType(), False)
)

# Decode the JSON payload, flatten it, normalise the timestamp, add the
# parsed grid, then drop rows whose parser result is the string "false"
# (presumably how the parser's boolean False surfaces in a string column; confirm).
filtered_data = (
    lines
    .selectExpr('CAST(data AS STRING)')
    .select(from_json('data', schema).alias('tweet_data'))
    .selectExpr(
        'tweet_data.id',
        'tweet_data.created_at',
        'tweet_data.user.id_str AS user_id',
        'tweet_data.text AS message',
    )
    .withColumn("created_at", to_utc_timestamp(date_fn("created_at"), "UTC"))
    .withColumn('results', attempts_fn(col('message')))
    .filter(col('results') != "false")
)


filtered_data.printSchema()

# Start the query that prints tweet data to the console every 5 seconds.
query = (
    filtered_data.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .start()
)

# Block until the streaming query stops.
query.awaitTermination()