In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StringType, BinaryType, IntegerType, DoubleType, TimestampType, DateType
from delta.tables import DeltaTable
from pyspark.storagelevel import StorageLevel

# --- AWS credentials ---
# These two module-level values are handed to the S3A connector as
# fs.s3a.access.key / fs.s3a.secret.key inside create_spark_session().
# They are blank here.
# NOTE(review): never save real keys in the notebook; consider loading them
# from the environment or a secrets manager before sharing/committing.
accessKeyId = ""
secretAccessKey = ""

# --- Spark session ---
def create_spark_session() -> SparkSession:
    """Build (or reuse) the Spark session used by this notebook.

    The session is configured with the Delta Lake SQL extension and catalog,
    Hive support, and the S3A connector options needed to reach the buckets.
    The S3A access/secret keys are taken from the module-level
    ``accessKeyId`` / ``secretAccessKey`` variables.

    Returns:
        The active ``SparkSession``. Because ``getOrCreate()`` is used, an
        already-running session is returned as-is; builder options that Spark
        only reads at start-up (class path, memory) do not apply in that case.
    """
    spark = (
        SparkSession
        .builder
        .appName("Bronze Zone")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # Spark properties belong on the builder. They used to be written to
        # the Hadoop Configuration after start-up, where Spark never reads
        # them, so they silently had no effect.
        .config("spark.driver.extraClassPath", "/usr/local/spark/jars/*")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "16g")
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")

    # Filesystem options go on the Hadoop Configuration WITHOUT the
    # "spark.hadoop." prefix (that prefix is only meaningful on SparkConf).
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("fs.s3a.fast.upload", "true")
    hadoop_conf.set("fs.s3a.bucket.all.committer.magic.enabled", "true")
    hadoop_conf.set("fs.s3a.directory.marker.retention", "keep")

    # Credentials. The previous version also stored a credentials-provider
    # class under the key "spark.hadoop.fs.s3a.aws.credentials.provider";
    # S3A does not read that key, so the connector's default provider chain
    # was what actually ran. It is intentionally not re-added under the real
    # key: TemporaryAWSCredentialsProvider requires fs.s3a.session.token,
    # which this notebook never sets.
    hadoop_conf.set("fs.s3a.access.key", accessKeyId)
    hadoop_conf.set("fs.s3a.secret.key", secretAccessKey)

    return spark

# Session shared by every cell and helper below (helpers read this global).
spark = create_spark_session()

# --- Paths ---
# S3 (s3a://) prefixes for db_barbearia: `landing` is read as parquet by the
# pipeline cell, `bronze` is where the Delta table is loaded from and written to.
landing = "s3a://dev-lab-02-us-east-2-landing/db_barbearia/"
bronze = "s3a://dev-lab-02-us-east-2-bronze/db_barbearia/"

# --- Configured tables ---
# Table name -> settings; "pk" holds the key column name.
# NOTE(review): this mapping is not referenced by the cells visible here;
# the pipeline cell spells out 'cliente_id' directly.
tables_config = {
    "cliente": {"pk": "cliente_id"},
    # Add other tables as needed
}

# --- Helper functions ---
def filter_by_max_ingestion(df: DataFrame, pk_column: str) -> DataFrame:
    """Keep one row per key: the one with the greatest ``ingestion_time``.

    Args:
        df: Input DataFrame.
        pk_column: Column whose values identify a record.

    Returns:
        A DataFrame with a single row per ``pk_column`` value (the first row
        when ordered by ``ingestion_time`` descending). If ``df`` has no
        ``ingestion_time`` column a warning is printed and ``df`` is returned
        unchanged.
    """
    has_ingestion_time = "ingestion_time" in df.columns
    if has_ingestion_time:
        # Rank the rows of each key from newest to oldest and keep rank 1.
        newest_first = Window.partitionBy(pk_column).orderBy(F.desc("ingestion_time"))
        ranked = df.withColumn("row_num", F.row_number().over(newest_first))
        latest_per_key = ranked.where(F.col("row_num") == 1)
        return latest_per_key.drop("row_num")

    print("Aviso: Coluna 'ingestion_time' não encontrada. Retornando DataFrame original.")
    return df

def get_latest_ingestion(df: DataFrame) -> DataFrame:
    """Return only the rows belonging to the most recent ingestion.

    Args:
        df: Input DataFrame.

    Returns:
        The rows whose ``ingestion_time`` equals the maximum
        ``ingestion_time`` found in ``df``. If the column is missing a warning
        is printed and an empty DataFrame (``df.limit(0)``) is returned.
    """
    if "ingestion_time" in df.columns:
        # One aggregation job to find the newest timestamp, then a plain filter.
        newest = df.agg(F.max("ingestion_time")).first()[0]
        return df.where(F.col("ingestion_time") == newest)

    print("Aviso: Coluna 'ingestion_time' não encontrada. Retornando DataFrame vazio.")
    return df.limit(0)

def ensure_column_exists(spark: SparkSession, delta_path: str, column_name: str, column_type: str):
    """Add ``column_name`` to the Delta table at ``delta_path`` if it is missing.

    The table schema is read straight from ``DataFrame.columns``. The earlier
    implementation registered a temp view whose name was derived from the
    path; for ``s3a://bucket-name/...`` paths that name contains ``:`` and
    ``-``, which made the check fail before a column could ever be added.

    Args:
        spark: Active Spark session.
        delta_path: Location of the Delta table.
        column_name: Column that must exist.
        column_type: SQL type used when the column has to be created.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        AnalysisException: if loading the table fails for a reason other than
            the path not existing. A failure of the ``ALTER TABLE`` itself is
            printed rather than raised, as before.
    """
    try:
        existing_columns = spark.read.format("delta").load(delta_path).columns
    except AnalysisException as e:
        if "Path does not exist" in str(e):
            print(f"A tabela Delta não existe no caminho: {delta_path}.")
            return
        raise

    # Delta keeps the case of column names but treats them case-insensitively,
    # so compare that way to avoid attempting to add a duplicate column.
    wanted = column_name.lower()
    if any(existing.lower() == wanted for existing in existing_columns):
        print(f"A coluna '{column_name}' já existe.")
        return

    print(f"A coluna '{column_name}' não existe em {delta_path}. Adicionando...")
    try:
        spark.sql(f"ALTER TABLE delta.`{delta_path}` ADD COLUMNS ({column_name} {column_type})")
    except AnalysisException as e:
        print(f"Erro ao verificar/adicionar coluna: {e}")
        return
    print("Coluna adicionada com sucesso.")

def upsert_with_delete_track(source_df: DataFrame, delta_path: str, pk_column: str, ingestion_time_column: str = "ingestion_time"):
    """Upsert ``source_df`` into a Delta table and soft-delete missing keys.

    Two writes are performed, in this order:

    1. Every target row with ``ativo = true`` whose key does not appear in
       ``source_df`` is updated: ``ativo`` becomes false and both
       ``deletion_time`` and ``ingestion_time`` are set to the current
       timestamp.
    2. A MERGE on ``pk_column``: matched rows are updated when the source
       ingestion time is newer than the target's (only columns present on
       both sides, excluding the key); unmatched source rows are inserted.

    The module-level ``spark`` session is used to open the table.

    Args:
        source_df: Incoming rows.
        delta_path: Location of the target Delta table.
        pk_column: Key column present in both source and target.
        ingestion_time_column: Column compared in the MERGE update condition.

    Raises:
        ValueError: if ``source_df`` lacks ``pk_column`` or
            ``ingestion_time_column``. This is checked before any write so a
            bad input can no longer leave the table with step 1 committed and
            step 2 failed.
    """
    # Validate first: the MERGE would fail on these anyway, but only after the
    # deactivation UPDATE had already been committed.
    source_columns = {name.lower() for name in source_df.columns}
    missing = [
        name
        for name in (pk_column, ingestion_time_column)
        if name.lower() not in source_columns
    ]
    if missing:
        raise ValueError(f"source_df is missing required column(s): {missing}")

    delta_table = DeltaTable.forPath(spark, delta_path)
    target_df = delta_table.toDF()

    # NOTE(review): show() prints real rows into the notebook output; clear
    # the outputs before sharing if the table holds personal data.
    print("Schema da tabela Delta de destino:")
    target_df.printSchema()
    target_df.show(5, truncate=False)

    print("Schema do DataFrame de origem:")
    source_df.printSchema()
    source_df.show(5, truncate=False)

    # Active target rows whose key is absent from the source. An anti-join on
    # the DataFrames replaces the former SQL over the fixed-name temp views
    # "source_data"/"target_data", so nothing is left registered in the
    # session and the key name is no longer spliced into a SQL string.
    # NOTE(review): an empty source_df selects every active row here; confirm
    # that mass deactivation is the intended outcome in that case.
    records_to_deactivate = (
        target_df
        .filter(F.col("ativo") == True)  # Column comparison: `==` is required
        .join(source_df.select(pk_column), on=pk_column, how="left_anti")
        .select(pk_column)
    )

    # Collect once; the count is derived from the same list instead of
    # re-running the query for count(), show() and collect().
    ids_to_deactivate = [row[0] for row in records_to_deactivate.collect()]
    print(f"Registros a desativar: {len(ids_to_deactivate)}")

    if ids_to_deactivate:
        print(f"Encontrados {len(ids_to_deactivate)} registros para desativar")

        delta_table.update(
            condition=F.col(pk_column).isin(ids_to_deactivate) & (F.col("ativo") == True),
            set={
                "ativo": F.lit(False),
                "deletion_time": F.current_timestamp(),
                "ingestion_time": F.current_timestamp()
            }
        )
        print("Registros desativados.")
    else:
        print("Nenhum registro para desativar encontrado.")

    # UPSERT: update only columns that exist on both sides (never the key),
    # and only when the source row is newer than the stored one.
    shared_columns = {
        name: f"source.{name}"
        for name in source_df.columns
        if name != pk_column and name in target_df.columns
    }
    delta_table.alias("target").merge(
        source_df.alias("source"),
        f"target.{pk_column} = source.{pk_column}"
    ).whenMatchedUpdate(
        condition=f"source.{ingestion_time_column} > target.{ingestion_time_column}",
        set=shared_columns
    ).whenNotMatchedInsertAll().execute()

    print("UPSERT concluído.")


In [4]:
# --- Execution pipeline ---
# Each stage gets its own name so the cell can be re-read (and re-run) without
# guessing which version of the frame a variable currently holds.
clientes_landing = spark.read.parquet(f"{landing}cliente/")

# The landing data gets a typed, all-null deletion_time column before the upsert.
clientes_with_deletion = clientes_landing.withColumn(
    "deletion_time", F.lit(None).cast(TimestampType())
)

# Newest row per cliente_id, then only the rows of the most recent ingestion.
clientes_latest_per_key = filter_by_max_ingestion(clientes_with_deletion, 'cliente_id')
source_clientes = get_latest_ingestion(clientes_latest_per_key)

upsert_with_delete_track(source_clientes, f"{bronze}cliente/", "cliente_id", "ingestion_time")

Schema da tabela Delta de destino:
root
 |-- cliente_id: integer (nullable = true)
 |-- nome: string (nullable = true)
 |-- telefone: string (nullable = true)
 |-- email: string (nullable = true)
 |-- data_nascimento: date (nullable = true)
 |-- data_cadastro: timestamp (nullable = true)
 |-- observacoes: string (nullable = true)
 |-- ativo: boolean (nullable = true)
 |-- ingestion_time: timestamp (nullable = true)
 |-- origem: string (nullable = true)
 |-- deletion_time: timestamp (nullable = true)

+----------+----------------------------+-------------+-----------------------------------+---------------+--------------------------+-----------+-----+--------------------------+----------+-------------+
|cliente_id|nome                        |telefone     |email                              |data_nascimento|data_cadastro             |observacoes|ativo|ingestion_time            |origem    |deletion_time|
+----------+----------------------------+-------------+-----------------------------

In [6]:
# Register the bronze `cliente` Delta table as temp view "cliente" for the SQL
# checks in the following cells. The path is built from the shared `bronze`
# prefix (same value as the literal previously repeated here) so it cannot
# drift from the location the pipeline writes to.
spark.read.format("delta").load(f"{bronze}cliente/").createOrReplaceTempView("cliente")

In [7]:
# Total number of rows in the `cliente` temp view.
spark.sql("select count(1) from cliente").show()

+--------+
|count(1)|
+--------+
|     501|
+--------+



In [22]:
# Original: name and `ativo` flag for cliente_id 1008 and 2.
# NOTE(review): presumably the baseline taken before the insert/delete tests
# in the next cells; the execution counts are out of order, so confirm by
# re-running the notebook top to bottom.
spark.sql("select nome,ativo from cliente where cliente_id in (1008,2)").show(truncate= False)

+------------------+-----+
|nome              |ativo|
+------------------+-----+
|Dr. Gabriel Pastor|false|
+------------------+-----+



In [25]:
# Insert: same two ids, now also showing deletion_time.
# NOTE(review): presumably run after new/changed rows reached the landing data
# and the pipeline cell was executed again — confirm.
spark.sql("select nome,ativo,deletion_time from cliente where cliente_id in (1008,2)").show(truncate= False)

+---------------------+-----+-------------+
|nome                 |ativo|deletion_time|
+---------------------+-----+-------------+
|Amaurir              |true |null         |
|Dr. Gabriel Pastor Dr|false|null         |
+---------------------+-----+-------------+



In [8]:
# Delete: same query as the insert check.
# NOTE(review): presumably run after a record was removed from the source and
# the pipeline cell was executed again; a soft-deleted row is expected to show
# ativo = false with deletion_time filled in — confirm.
spark.sql("select nome,ativo,deletion_time from cliente where cliente_id in (1008,2)").show(truncate= False)

+-----------+-----+--------------------------+
|nome       |ativo|deletion_time             |
+-----------+-----+--------------------------+
|Amaurir    |false|2025-05-04 22:39:26.023927|
|Dr. Gabriel|false|null                      |
+-----------+-----+--------------------------+

