In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StringType, BinaryType, IntegerType, DoubleType, TimestampType, DateType
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException
from pyspark.storagelevel import StorageLevel
from typing import Union, Optional

# --- AWS credentials ---
# NOTE(review): both values are empty in this file; presumably they are filled
# in at run time. Avoid committing real secrets here.
accessKeyId, secretAccessKey = "", ""

# --- Spark session ---
def create_spark_session() -> SparkSession:
    """Build (or reuse) the Delta-enabled Spark session used for bronze ingestion.

    Spark-level properties (``spark.*``) are handed to the session builder,
    which is where Spark reads them. S3A filesystem options are applied to the
    Hadoop configuration of the running context.

    Note:
        ``getOrCreate()`` returns an already-running session when one exists;
        in that case launch-time properties such as the driver memory keep
        the values the existing session was started with.

    Returns:
        The SparkSession, with Hive support, the Delta SQL extension and the
        Delta catalog configured.
    """
    spark = (
        SparkSession
        .builder
        .appName("Bronze Zone")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # These three used to be written to the Hadoop configuration, where
        # Spark never looks them up, so they were silently ignored.
        .config("spark.driver.extraClassPath", "/usr/local/spark/jars/*")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "16g")
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")

    conf = spark.sparkContext._jsc.hadoopConfiguration()
    # NOTE(review): the "spark.hadoop." prefix is only stripped when a property
    # goes through SparkConf. Written directly to the Hadoop configuration the
    # key keeps its prefix, and S3A reads "fs.s3a.aws.credentials.provider",
    # so this line has no effect. It is left unchanged on purpose: activating
    # TemporaryAWSCredentialsProvider also requires a session token
    # (fs.s3a.session.token), which is not configured here.
    conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    conf.set("fs.s3a.fast.upload", "true")
    conf.set("fs.s3a.bucket.all.committer.magic.enabled", "true")
    conf.set("fs.s3a.directory.marker.retention", "keep")
    # Static key pair taken from the module-level credential variables.
    conf.set("fs.s3a.access.key", accessKeyId)
    conf.set("fs.s3a.secret.key", secretAccessKey)

    return spark

# --- Helper functions ---
def filter_by_max_max_date(spark: SparkSession, df: DataFrame, pk_column: str, time_column: str = 'ingestion_time') -> DataFrame:
    """Keep, for every ``pk_column`` value, only the row with the greatest ``time_column``.

    Args:
        spark: SparkSession (not used by this function).
        df: DataFrame to deduplicate.
        pk_column: Column whose values identify a record.
        time_column: Column used to rank the rows of each key, newest first.

    Returns:
        ``df`` itself when ``time_column`` is not one of its columns (a warning
        is printed); otherwise a DataFrame with one row per ``pk_column`` value.
    """
    if time_column in df.columns:
        # Rank the rows of each key from newest to oldest and keep rank 1.
        newest_first = Window.partitionBy(pk_column).orderBy(F.col(time_column).desc())
        ranked = df.withColumn("row_num", F.row_number().over(newest_first))
        return ranked.filter(F.col("row_num") == 1).drop("row_num")

    print(f"Aviso: Coluna '{time_column}' não encontrada. Retornando DataFrame original.")
    return df

def get_latest_ingestion(spark: SparkSession, df: DataFrame) -> DataFrame:
    """Return only the rows whose ``ingestion_time`` equals the maximum found in ``df``.

    Args:
        spark: SparkSession (not used by this function).
        df: DataFrame to filter.

    Returns:
        An empty DataFrame with the same schema (``df.limit(0)``) when the
        ``ingestion_time`` column is missing; otherwise the rows of the most
        recent ingestion.
    """
    if "ingestion_time" in df.columns:
        # The maximum is fetched to the driver and then used as a literal filter.
        newest_row = df.select(F.max("ingestion_time")).first()
        belongs_to_latest = F.col("ingestion_time") == newest_row[0]
        return df.filter(belongs_to_latest)

    print("Aviso: Coluna 'ingestion_time' não encontrada. Retornando DataFrame vazio.")
    return df.limit(0)

def ensure_column_exists(
    spark: SparkSession,
    df: DataFrame,
    column_name: str,
    default_value: Union[str, int, float, bool, None] = None,
    column_type: Optional[str] = None
) -> DataFrame:
    """
    Guarantee that ``column_name`` is present in ``df``, adding it when missing.

    Args:
        spark: SparkSession (not used by this function)
        df: PySpark DataFrame
        column_name: Name of the column to check/create
        default_value: Literal stored in the new column (None by default)
        column_type: Spark type name the new column is cast to; when omitted it
            is inferred from ``default_value`` (bool -> boolean, int -> integer,
            float -> double, anything else -> string)

    Returns:
        ``df`` unchanged when the column already exists, otherwise a new
        DataFrame carrying the extra column

    Example:
        >>> df = ensure_column_exists(spark, df, "data_criacao", default_value=None, column_type="timestamp")
    """
    if column_name in df.columns:
        return df

    print(f"Coluna '{column_name}' não encontrada. Criando com valor padrão...")

    if column_type is None:
        # bool is listed first because it is a subclass of int.
        type_names = (
            (bool, "boolean"),
            (int, "integer"),
            (float, "double"),
        )
        column_type = next(
            (spark_name for py_type, spark_name in type_names if isinstance(default_value, py_type)),
            "string",
        )

    # A None default and a concrete default go through the same lit(...).cast(...).
    return df.withColumn(column_name, F.lit(default_value).cast(column_type))

def delta_table_exists(spark, path):
    """
    Tell whether a Delta table can be opened at ``path``.

    Args:
        spark: SparkSession
        path: Location of the Delta table (S3, HDFS or local path)

    Returns:
        bool: True when ``DeltaTable.forPath`` succeeds; False when it fails
        with one of the "missing table / missing path" messages checked below

    Raises:
        Any other exception raised by ``DeltaTable.forPath`` is propagated.
    """
    try:
        DeltaTable.forPath(spark, path)
    except AnalysisException as analysis_error:
        message = str(analysis_error)
        missing_markers = ('is not a Delta table', 'Path does not exist')
        if any(marker in message for marker in missing_markers):
            return False
        raise
    except Exception as other_error:
        # Non-analysis errors are only treated as "missing" on this message.
        if 'does not exist' not in str(other_error):
            raise
        return False
    else:
        return True

        
        
def upsert_with_delete_track(spark: SparkSession, source_df: DataFrame, delta_path: str, pk_column: str, ingestion_time_column: str = "ingestion_time", table_name: Optional[str] = None):
    """Soft-delete records missing from the source, then merge the source into a Delta table.

    Steps:
      1. Target rows that are active (``ativo = true``) but whose key does not
         appear in ``source_df`` are updated to ``ativo = false``; their
         ``deletion_time`` and ``ingestion_time_column`` are set to the current
         timestamp.
      2. ``source_df`` is merged on ``pk_column``: matched rows are overwritten
         only when the source ``ingestion_time_column`` is greater than the
         target one, and unmatched rows are inserted.

    Args:
        spark: Active SparkSession.
        source_df: Incoming snapshot to merge.
        delta_path: Location of the existing target Delta table.
        pk_column: Key column present in both source and target.
        ingestion_time_column: Timestamp column used to decide whether a
            matched row is newer; also refreshed on soft-deleted rows.
        table_name: Currently unused.

    Side effects:
        Registers (or replaces) the temp views ``source_data`` and
        ``target_data`` and prints progress messages.
    """
    delta_table = DeltaTable.forPath(spark, delta_path)
    target_df = delta_table.toDF()

    source_df.createOrReplaceTempView("source_data")
    target_df.createOrReplaceTempView("target_data")

    # Active target rows with no counterpart in the source.
    records_to_deactivate = spark.sql(f"""
        SELECT t.{pk_column}
        FROM target_data t
        LEFT JOIN source_data s ON t.{pk_column} = s.{pk_column}
        WHERE s.{pk_column} IS NULL AND t.ativo = true
    """)

    # Collect the keys once and derive the count from the result instead of
    # running a separate count() job followed by an RDD collect.
    # NOTE: the keys are brought to the driver and inlined in an IN list, so
    # this assumes the number of deactivated records per run stays small.
    ids_to_deactivate = [row[0] for row in records_to_deactivate.collect()]
    print(f"Registros a desativar: {len(ids_to_deactivate)}")
    records_to_deactivate.show(truncate=False)

    if ids_to_deactivate:
        print(f"Encontrados {len(ids_to_deactivate)} registros para desativar")

        delta_table.update(
            # "== True" is a Spark column comparison, not a Python identity test.
            condition=F.col(pk_column).isin(ids_to_deactivate) & (F.col("ativo") == True),
            set={
                "ativo": F.lit(False),
                "deletion_time": F.current_timestamp(),
                # Honour the configured column rather than a hard-coded name, so
                # the merge condition below compares against the refreshed value.
                ingestion_time_column: F.current_timestamp()
            }
        )
        print("Registros desativados.")
    else:
        print("Nenhum registro para desativar encontrado.")

    # UPSERT: only columns that exist on both sides are updated; the key itself
    # is never rewritten.
    target_columns = set(target_df.columns)
    delta_table.alias("target").merge(
        source_df.alias("source"),
        f"target.{pk_column} = source.{pk_column}"
    ).whenMatchedUpdate(
        condition=f"source.{ingestion_time_column} > target.{ingestion_time_column}",
        set={
            name: f"source.{name}"
            for name in source_df.columns
            if name != pk_column and name in target_columns
        }
    ).whenNotMatchedInsertAll().execute()

    print("UPSERT concluído!\n")


In [19]:
%%time
spark = create_spark_session()

# --- Storage locations (source parquet files and target Delta tables) ---
landing = "s3a://dev-lab-02-us-east-2-landing/db_barbearia/"
bronze = "s3a://dev-lab-02-us-east-2-bronze/db_barbearia/"

# --- Configured tables: table name -> settings, where "pk" is the key column ---
tables_configs = {
    table: {"pk": key_column}
    for table, key_column in (
        ("cliente", "cliente_id"),
        ("profissional", "profissional_id"),
        ("servico", "servico_id"),
        ("agendamento", "agendamento_id"),
        ("pagamento", "pagamento_id"),
        ("horario_profissional", "horario_id"),
        ("promocao", "promocao_id"),
        ("servico_promocao", "servico_promocao_id"),
    )
}

CPU times: user 2.05 ms, sys: 1.49 ms, total: 3.53 ms
Wall time: 4.86 ms


In [None]:
%%time
# Ingest every configured table from the landing zone into the bronze zone.
for contador, (table_name, table_config) in enumerate(tables_configs.items(), start=1):
    print(f"{contador} / {len(tables_configs)} - Inicando ingestão na bronze para a tabela de {table_name}")
    bronze_path = f"{bronze}{table_name}/"

    # --- Pre-processing ---
    df_landing = spark.read.parquet(f"{landing}{table_name}/")
    # Soft-delete bookkeeping columns: "ativo" is added only when missing,
    # "deletion_time" is always reset to NULL.
    df_landing = ensure_column_exists(spark, df_landing, "ativo", default_value=True, column_type="boolean")
    df_landing = df_landing.withColumn("deletion_time", F.lit(None).cast(TimestampType()))
    # One row per key (the newest), then only the rows of the latest ingestion.
    df_landing = filter_by_max_max_date(spark, df_landing, table_config['pk'])
    df_landing = get_latest_ingestion(spark, df_landing)

    # --- Ingestion ---
    if not delta_table_exists(spark, bronze_path):
        # First load: create the Delta table from the prepared snapshot.
        df_landing.write.format("delta").save(bronze_path)
        continue

    upsert_with_delete_track(
        spark,
        source_df=df_landing,
        delta_path=bronze_path,
        pk_column=table_config['pk'],
        ingestion_time_column="ingestion_time",
        table_name=table_name
    )
        

1 / 8 - Inicando ingestão na bronze para a tabela de cliente
Registros a desativar: 0
+----------+
|cliente_id|
+----------+
+----------+

Nenhum registro para desativar encontrado.
UPSERT concluído.
2 / 8 - Inicando ingestão na bronze para a tabela de profissional
Registros a desativar: 0
+---------------+
|profissional_id|
+---------------+
+---------------+

Nenhum registro para desativar encontrado.
UPSERT concluído.
3 / 8 - Inicando ingestão na bronze para a tabela de servico
Registros a desativar: 0
+----------+
|servico_id|
+----------+
+----------+

Nenhum registro para desativar encontrado.
UPSERT concluído.
4 / 8 - Inicando ingestão na bronze para a tabela de agendamento
Coluna 'ativo' não encontrada. Criando com valor padrão...
Registros a desativar: 0
+--------------+
|agendamento_id|
+--------------+
+--------------+

Nenhum registro para desativar encontrado.
UPSERT concluído.
5 / 8 - Inicando ingestão na bronze para a tabela de pagamento
Coluna 'ativo' não encontrada. Cr

In [6]:
# Load the bronze "cliente" Delta table and expose it to SQL as the temp view "cliente".
spark.read.format("delta").load("s3a://dev-lab-02-us-east-2-bronze/db_barbearia/cliente/").createOrReplaceTempView("cliente")

In [7]:
# Row count of the "cliente" view.
spark.sql("select count(1) from cliente").show()

+--------+
|count(1)|
+--------+
|     501|
+--------+



In [22]:
# "Original" check: name and active flag of client ids 1008 and 2.
# NOTE(review): presumably the baseline taken before the insert/delete tests — confirm.
spark.sql("select nome,ativo from cliente where cliente_id in (1008,2)").show(truncate= False)

+------------------+-----+
|nome              |ativo|
+------------------+-----+
|Dr. Gabriel Pastor|false|
+------------------+-----+



In [25]:
# "Insert" check: same two client ids, now also showing deletion_time.
# NOTE(review): presumably run after ingesting a batch with new/changed rows — confirm.
spark.sql("select nome,ativo,deletion_time from cliente where cliente_id in (1008,2)").show(truncate= False)

+---------------------+-----+-------------+
|nome                 |ativo|deletion_time|
+---------------------+-----+-------------+
|Amaurir              |true |null         |
|Dr. Gabriel Pastor Dr|false|null         |
+---------------------+-----+-------------+



In [8]:
# "Delete" check: same two client ids, showing the active flag and deletion_time.
# NOTE(review): presumably run after ingesting a batch where a row was removed — confirm.
spark.sql("select nome,ativo,deletion_time from cliente where cliente_id in (1008,2)").show(truncate= False)

+-----------+-----+--------------------------+
|nome       |ativo|deletion_time             |
+-----------+-----+--------------------------+
|Amaurir    |false|2025-05-04 22:39:26.023927|
|Dr. Gabriel|false|null                      |
+-----------+-----+--------------------------+



0 / 1 - Inicando ingestão na bronze para a tabela de servico_promocao
