# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark import SparkContext, SparkConf
import os
import yaml
import configparser


# Ask PySpark to pull in the hadoop-aws package when it launches the JVM;
# PYSPARK_SUBMIT_ARGS is read at launch, so it is set before any Spark object
# is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-aws:3.3.1 pyspark-shell'

# S3 target settings, parsed from a YAML file that lives outside this directory.
# The encoding is given explicitly so parsing does not depend on the platform
# default.
with open('../secrets.yml', 'r', encoding='utf-8') as file:
    s3_conf = yaml.safe_load(file)

# AWS credentials are taken from the current user's ~/.aws/credentials file.
config = configparser.ConfigParser()
config_path = os.path.join(os.path.expanduser('~'), '.aws/credentials')
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the credentials were not loaded.
# Fail here with a clear message instead of a NoSectionError further down.
if not config.read(config_path):
    raise FileNotFoundError(
        f"AWS credentials file not found or unreadable: {config_path}"
    )

# Spark configuration: pass the com.amazonaws.services.s3.enableV4 JVM flag to
# both executor and driver, name the application and use the local[*] master.
enable_v4_java_opt = '-Dcom.amazonaws.services.s3.enableV4=true'
conf = SparkConf()
conf.set('spark.executor.extraJavaOptions', enable_v4_java_opt)
conf.set('spark.driver.extraJavaOptions', enable_v4_java_opt)
conf.setAppName('Wordle score streaming')
conf.setMaster('local[*]')

sc = SparkContext(conf=conf)
sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
sc.setLogLevel("ERROR")  # only log errors

# Configure the s3a filesystem: credentials come from the 'development'
# section of the parsed AWS credentials file.
hadoopConf = sc._jsc.hadoopConfiguration()
for hadoop_key, credential_option in (
    ('fs.s3a.access.key', 'aws_access_key_id'),
    ('fs.s3a.secret.key', 'aws_secret_access_key'),
):
    hadoopConf.set(hadoop_key, config.get('development', credential_option))
hadoopConf.set('fs.s3a.endpoint', 's3-eu-central-1.amazonaws.com')
hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

# Wrap the existing context in a SparkSession for the DataFrame/streaming API.
spark = SparkSession(sc)


# Streaming DataFrame fed by Spark's "socket" source at localhost:9009.
socket_reader = spark.readStream.format("socket")
socket_reader = socket_reader.option("host", "localhost")
socket_reader = socket_reader.option("port", 9009)
lines = socket_reader.load()


# Write to s3
# Bucket and folder come from the parsed S3 settings; build the URI once so the
# two options below cannot drift apart.
output_uri = f"s3a://{s3_conf['bucket']}/{s3_conf['folder']}/"

# NOTE(review): the checkpoint shares the prefix the parquet files are written
# to, so checkpoint files end up next to the data. A dedicated checkpoint
# prefix is usually preferable, but moving it changes where existing state is
# looked up -- confirm before relocating.
query = (
    lines.writeStream
    .format("parquet")
    .option("path", output_uri)
    .option("checkpointLocation", output_uri)
    .start()
)

try:
    # Block until the streaming query ends.
    query.awaitTermination()
finally:
    # Stop the query even when the wait is interrupted (e.g. Ctrl-C or a
    # notebook kernel interrupt) or raises, so it does not keep running in the
    # JVM.
    query.stop()