In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StructField, StringType, BinaryType, IntegerType, DoubleType, TimestampType, DateType, LongType
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException
from pyspark.storagelevel import StorageLevel
from typing import Union, Optional
from pyspark.sql.functions import input_file_name

# --- AWS credentials ---
# Read from the standard AWS environment variables rather than embedding
# secrets in the notebook. Both fall back to "" (the previous placeholder
# value) when the variable is not set.
import os

accessKeyId = os.environ.get("AWS_ACCESS_KEY_ID", "")
secretAccessKey = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

# --- Sessão Spark ---
def create_spark_session() -> SparkSession:
    """Build (or reuse) the Delta-enabled Spark session and configure S3A access.

    Spark-level options (extra class path, driver/executor memory) are passed
    to the session builder. They were previously written into the Hadoop
    configuration object, where Spark never reads them. They only take effect
    when this call actually creates the session; an already-running session
    returned by ``getOrCreate`` keeps its existing values.

    S3A options are written to the Hadoop configuration under their plain
    ``fs.s3a.*`` names. The ``spark.hadoop.`` prefix is only stripped from
    SparkConf entries, so a prefixed key set directly on the Hadoop
    configuration is ignored by S3A.

    Returns:
        SparkSession: the active session, with Hive support, the Delta SQL
        extension/catalog, and the S3A settings applied.
    """
    spark = (
        SparkSession
        .builder
        .appName("Silver Zone Streaming")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # NOTE(review): these values are now actually applied on a fresh
        # session - confirm the cluster can satisfy 8g driver / 16g executors.
        .config("spark.driver.extraClassPath", "/usr/local/spark/jars/*")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "16g")
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")

    conf = spark.sparkContext._jsc.hadoopConfiguration()
    conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    conf.set("fs.s3a.fast.upload", "true")
    conf.set("fs.s3a.bucket.all.committer.magic.enabled", "true")
    conf.set("fs.s3a.directory.marker.retention", "keep")

    # Only pin static credentials when both values are present; otherwise leave
    # S3A on its default provider chain instead of overwriting it with blanks.
    if accessKeyId and secretAccessKey:
        # Only an access key and a secret key are supplied (no session token),
        # which is the input SimpleAWSCredentialsProvider expects.
        # TemporaryAWSCredentialsProvider would also need fs.s3a.session.token.
        conf.set(
            "fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
        )
        conf.set("fs.s3a.access.key", accessKeyId)
        conf.set("fs.s3a.secret.key", secretAccessKey)

    return spark

# Create the session once; later cells use this module-level `spark`
# (temp-view registration, the SQL query, and the Delta upsert).
spark = create_spark_session()

In [20]:
# Storage locations: bronze is the streaming input, silver is the output.
bronze_path = "s3a://dev-lab-02-us-east-2-bronze/spotify/"
silver_path = "s3a://dev-lab-02-us-east-2-silver/"
silver_table = "fato_streamings"
silver_table_path = silver_path + silver_table

# Expose each bronze Delta table as a streaming temp view named after the
# table, so the SQL cell can reference it directly.
tables = ["usuarios", "musicas", "streamings"]
for t in tables:
    (
        spark.readStream
        .format("delta")
        .load(bronze_path + t)
        .createOrReplaceTempView(t)
    )

In [21]:
# Join each play event with its user and derive the silver-layer columns:
# a masked e-mail, artist/track split out of `musica`, and a "featuring" flag.
#
# The mask is only applied when '@' exists and is not the first character.
# Without that guard INSTR returns 0 (or 1), REPEAT receives a negative count
# and SUBSTRING starts at the beginning, so the full address leaks through.
# Malformed addresses are now replaced entirely by '*'; NULL stays NULL.
#
# NOTE(review): `streamings` and `usuarios` are both registered from
# readStream sources in this notebook, which makes this a stream-stream join
# with no watermark - presumably join state grows without bound; confirm
# whether `usuarios` should be a static read instead.
df_result = spark.sql("""
select 
  s.id,
  u.id as id_usuario,
  CASE
    WHEN INSTR(email, '@') > 1 THEN CONCAT(
      SUBSTRING(email, 1, 1),                          -- first letter
      REPEAT('*', INSTR(email, '@') - 2),              -- '*' for the rest of the local part
      SUBSTRING(email, INSTR(email, '@'), LENGTH(email))  -- from '@' to the end
    )
    ELSE REPEAT('*', LENGTH(email))                    -- no usable '@': mask everything
  END AS masked_email,
  split(s.musica,'-')[0] as artista,
  split(s.musica,'-')[1] as musica,
  instr(split(s.musica,'-')[1], 'w/') > 0 AS flg_feat,
  s.timestamp,
  s.origem_arquivo
from streamings as s
inner join usuarios as u on (
    u.nome = s.nome
) 
""")

In [270]:
def upsert_to_delta(microbatch_df, batch_id):
    """Upsert one streaming micro-batch into the silver Delta table, keyed by ``id``.

    The batch is first reduced to a single row per ``id``, keeping the row with
    the most recent ``timestamp``. The previous ``dropDuplicates(["id"])`` kept
    an arbitrary row, so a stale record could overwrite a newer one during the
    merge. Rows sharing the same ``id`` and ``timestamp`` are still tie-broken
    arbitrarily.

    If the silver path already holds a Delta table, the batch is merged into it
    (update on match, insert otherwise); if not, the batch creates the table.

    Args:
        microbatch_df: DataFrame passed by ``foreachBatch`` for this trigger.
        batch_id: Micro-batch identifier passed by ``foreachBatch``; not used.

    Returns:
        None. The only effect is the write to ``silver_table_path``.
    """
    if microbatch_df.rdd.isEmpty():
        return

    # Newest event first within each id; rows without a timestamp sort last so
    # they are only kept when nothing better exists for that id.
    newest_first = Window.partitionBy("id").orderBy(F.col("timestamp").desc_nulls_last())

    deduped_df = (
        microbatch_df
        .withColumn("timestamp", F.col("timestamp").cast("timestamp"))
        .withColumn("_row_rank", F.row_number().over(newest_first))
        .filter(F.col("_row_rank") == 1)
        .drop("_row_rank")
    )

    if DeltaTable.isDeltaTable(spark, silver_table_path):
        delta_table = DeltaTable.forPath(spark, silver_table_path)

        (
            delta_table.alias("target")
            .merge(deduped_df.alias("source"), "target.id = source.id")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    else:
        # First batch: nothing to merge into yet, so this write creates the table.
        deduped_df.write.format("delta").mode("overwrite").save(silver_table_path)

In [275]:
# Start the streaming upsert into the silver table.
# - No `.format(...)`: `foreachBatch` installs its own sink, so a format set
#   before it is discarded; the Delta write happens inside `upsert_to_delta`.
# - `silver_path` already ends with "/", so it is stripped before joining to
#   avoid "//" in the checkpoint URI.
#   NOTE(review): Hadoop path handling is expected to collapse duplicate
#   slashes, so this should resolve to the same checkpoint directory as
#   before - confirm if an existing checkpoint must be reused.
query_silver = (
    df_result.writeStream
    .foreachBatch(upsert_to_delta)
    .outputMode("append")
    .option("checkpointLocation", f"{silver_path.rstrip('/')}/checkpoints/{silver_table}")
    .start()
)

In [274]:
# Block until the stream ends. If the cell is interrupted or the query fails,
# stop the query on the way out so it does not keep running in the background
# and hold the checkpoint. The original exception still propagates.
try:
    query_silver.awaitTermination()
finally:
    query_silver.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 