In [1]:
# Importar bibliotecas necessárias para Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, when, trim, coalesce, create_map, array, lower,
    year, month, dayofmonth, dayofweek, weekofyear, quarter, date_format
)
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType, DecimalType, BooleanType

# Importar a biblioteca holidays para gerar os feriados da Turquia
import holidays
import pandas as pd
from datetime import date as dt_date # Para uso com a biblioteca holidays

# --- 1. Lakehouse and schema configuration ---
# (Keep as already defined and corrected.)
# In every layer the lakehouse and its schema carry the same name, so each
# pair of constants is bound to a single literal.
bronze_lakehouse_name = bronze_schema_name = "Projeto_II_Bronze_"

silver_lakehouse_name = silver_schema_name = "Projeto_II_Silver_"

gold_lakehouse_name = gold_schema_name = "Projeto_II_Gold_"


# --- Load the Silver tables that feed the Gold layer ---
print("\n--- Carregando tabelas Silver para processamento Gold ---")
try:
    # All three sources live under the same "<lakehouse>.<schema>" prefix.
    silver_namespace = f"{silver_lakehouse_name}.{silver_schema_name}"
    df_cities_silver = spark.read.format("delta").table(silver_namespace + ".cities_silver")
    df_product_silver = spark.read.format("delta").table(silver_namespace + ".product_silver")
    df_sales_silver = spark.read.format("delta").table(silver_namespace + ".sales_silver")
    print("Tabelas Silver carregadas com sucesso.")
except Exception as load_error:
    print(f"ERRO: Falha ao carregar tabelas Silver. Detalhes do erro: {load_error}")
    # Re-raise: nothing below can run without the Silver inputs.
    raise

# Start-of-run banner showing where the Gold tables will be written.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("--- Processamento da Camada Silver para Gold Iniciado. ---")
print(f"Escrevendo no Lakehouse Gold: '{gold_lakehouse_name}', Schema: '{gold_schema_name}'")
print(banner_rule)

# --- Gold layer tables ---
#
# --- 1. dim_stores ---
print("\n--- Criando dim_stores na Camada Gold ---")
try:
    # Store attributes taken from the Silver cities table, one row per store_id.
    store_columns = ["store_id", "storetype_id", "store_size", "city_name"]
    dim_stores = df_cities_silver.select(*store_columns).dropDuplicates(["store_id"])

    # Overwrite the Gold table; the Delta retention settings (30 days each)
    # are passed as writer options.
    dim_stores_target = f"{gold_lakehouse_name}.{gold_schema_name}.dim_stores"
    (
        dim_stores.write
        .format("delta")
        .mode("overwrite")
        .option("delta.logRetentionDuration", "interval 30 days")
        .option("delta.deletedFileRetentionDuration", "interval 30 days")
        .saveAsTable(dim_stores_target)
    )

    print(f"Tabela 'dim_stores' criada com sucesso no Lakehouse '{gold_lakehouse_name}'.")
    dim_stores.printSchema()
    dim_stores.show(5)

except Exception as stores_error:
    # Report the failure and continue with the remaining tables.
    print(f"ERRO: Falha ao criar 'dim_stores'. Detalhes do erro: {stores_error}")

# --- 2. dim_product ---
print("\n--- Criando dim_product na Camada Gold ---")
try:
    # Product dimensions, cluster and the five hierarchy levels from the
    # Silver product table, one row per product_id.
    product_columns = [
        "product_id",
        "product_length",
        "product_depth",
        "product_width",
        "cluster_id",
        "hierarchy1_id",
        "hierarchy2_id",
        "hierarchy3_id",
        "hierarchy4_id",
        "hierarchy5_id",
    ]
    dim_product = df_product_silver.select(*product_columns).dropDuplicates(["product_id"])

    # Overwrite the Gold table; the Delta retention settings (30 days each)
    # are passed as writer options.
    dim_product_target = f"{gold_lakehouse_name}.{gold_schema_name}.dim_product"
    (
        dim_product.write
        .format("delta")
        .mode("overwrite")
        .option("delta.logRetentionDuration", "interval 30 days")
        .option("delta.deletedFileRetentionDuration", "interval 30 days")
        .saveAsTable(dim_product_target)
    )

    print(f"Tabela 'dim_product' criada com sucesso no Lakehouse '{gold_lakehouse_name}'.")
    dim_product.printSchema()
    dim_product.show(5)

except Exception as product_error:
    # Report the failure and continue with the remaining tables.
    print(f"ERRO: Falha ao criar 'dim_product'. Detalhes do erro: {product_error}")


# --- 3. dim_date (with is_holiday) ---
# Builds one row per distinct sale date, enriched with calendar attributes and
# a flag marking Turkish public holidays, and overwrites the Gold dim_date table.
print("\n--- Criando dim_date na Camada Gold (com is_holiday) ---")
try:
    # 1. The distinct, non-null sale dates are the rows of the dimension.
    df_distinct_dates = df_sales_silver.select(col("date")).distinct().filter(col("date").isNotNull())

    # Fetch both bounds in a single Spark job. A global aggregate always
    # returns exactly one row; its values are NULL when there are no dates.
    date_bounds = df_distinct_dates.selectExpr(
        "min(`date`) AS min_date",
        "max(`date`) AS max_date",
    ).first()
    min_date_val = date_bounds["min_date"]
    max_date_val = date_bounds["max_date"]
    if min_date_val is None or max_date_val is None:
        # Fail with a clear message instead of an AttributeError on `None.year`.
        raise ValueError(
            "Nenhuma data válida encontrada em sales_silver; não é possível gerar a dim_date."
        )

    # Holidays are generated for every calendar year touched by the sales data.
    start_year = min_date_val.year
    end_year = max_date_val.year

    # 2. Generate the Turkish holidays with the `holidays` library
    # ('TR' is the ISO country code for Turkey).
    turkey_holidays = holidays.CountryHoliday('TR', years=range(start_year, end_year + 1))

    # One (date, True) tuple per holiday.
    holiday_data = [(date_obj, True) for date_obj in turkey_holidays.keys()]

    # Go through pandas and hand Spark an explicit schema, so the column types
    # do not depend on inference (which cannot infer from an empty list) and
    # no corrective cast is needed afterwards.
    df_holidays_pandas = pd.DataFrame(holiday_data, columns=['full_date', 'is_holiday_temp'])
    df_holidays_spark = spark.createDataFrame(
        df_holidays_pandas,
        schema="full_date date, is_holiday_temp boolean",
    )

    # 3. Base dimension: calendar attributes derived from each date.
    dim_date_base = df_distinct_dates.select(
        # Integer surrogate key in yyyyMMdd form.
        date_format(col("date"), "yyyyMMdd").cast(IntegerType()).alias("date_id"),
        col("date").alias("full_date"),
        year(col("date")).alias("year"),
        month(col("date")).alias("month"),
        dayofmonth(col("date")).alias("day"),
        dayofweek(col("date")).alias("day_of_week_num"),
        date_format(col("date"), "E").alias("day_of_week_short"),
        date_format(col("date"), "EEEE").alias("day_of_week_long"),
        weekofyear(col("date")).alias("week_of_year"),
        quarter(col("date")).alias("quarter"),
        date_format(col("date"), "MMMM").alias("month_name_long"),
        date_format(col("date"), "MMM").alias("month_name_short"),
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
        when(dayofweek(col("date")).isin([1, 7]), True).otherwise(False).alias("is_weekend")
    ).dropDuplicates(["date_id"])

    # 4. Left-join the holidays so every date is kept; dates without a match
    # get is_holiday = False.
    dim_date_final = dim_date_base.alias("d") \
        .join(df_holidays_spark.alias("h"), col("d.full_date") == col("h.full_date"), "left_outer") \
        .select(
            col("d.date_id"),
            col("d.full_date"),
            col("d.year"),
            col("d.month"),
            col("d.day"),
            col("d.day_of_week_num"),
            col("d.day_of_week_short"),
            col("d.day_of_week_long"),
            col("d.week_of_year"),
            col("d.quarter"),
            col("d.month_name_long"),
            col("d.month_name_short"),
            col("d.is_weekend"),
            coalesce(col("h.is_holiday_temp"), lit(False)).cast(BooleanType()).alias("is_holiday")
        )

    # Overwrite the Gold table; the Delta retention settings (30 days each)
    # are passed as writer options.
    dim_date_final.write \
        .format("delta") \
        .mode("overwrite") \
        .option("delta.logRetentionDuration", "interval 30 days") \
        .option("delta.deletedFileRetentionDuration", "interval 30 days") \
        .saveAsTable(f"{gold_lakehouse_name}.{gold_schema_name}.dim_date")

    print(f"Tabela 'dim_date' criada com sucesso no Lakehouse '{gold_lakehouse_name}'.")
    dim_date_final.printSchema()
    dim_date_final.show(5)

except Exception as e:
    # Report the failure and continue with the remaining tables.
    print(f"ERRO: Falha ao criar 'dim_date'. Detalhes do erro: {e}")

# --- 4. fact_sales ---
print("\n--- Criando fact_sales na Camada Gold ---")
try:
    # Keys, measures and promotion attributes carried over unchanged from the
    # Silver sales table.
    sales_passthrough = [
        "store_id",
        "product_id",
        "sales",
        "revenue",
        "stock",
        "price",
        "promo_type_1",
        "promo_bin_1",
        "promo_type_2",
        "promo_bin_2",
        "promo_discount_2",
        "promo_discount_type_2",
    ]

    # NOTE(review): dim_date_final is created inside the dim_date step's try
    # block; if that step failed before defining it, the NameError raised here
    # is caught and reported by the except clause below.
    sales_src = df_sales_silver.alias("s")
    date_dim = dim_date_final.alias("dd")

    # The inner join swaps the raw sale date for the dimension's date_id.
    fact_sales = sales_src.join(
        date_dim, col("s.date") == col("dd.full_date"), "inner"
    ).select(
        col("dd.date_id"),
        *[col(f"s.{column_name}") for column_name in sales_passthrough],
    )

    # Overwrite the Gold table; the Delta retention settings (30 days each)
    # are passed as writer options.
    fact_sales_target = f"{gold_lakehouse_name}.{gold_schema_name}.fact_sales"
    (
        fact_sales.write
        .format("delta")
        .mode("overwrite")
        .option("delta.logRetentionDuration", "interval 30 days")
        .option("delta.deletedFileRetentionDuration", "interval 30 days")
        .saveAsTable(fact_sales_target)
    )

    print(f"Tabela 'fact_sales' criada com sucesso no Lakehouse '{gold_lakehouse_name}'.")
    fact_sales.printSchema()
    fact_sales.show(5)

except Exception as fact_error:
    # Report the failure and fall through to the closing banner.
    print(f"ERRO: Falha ao criar 'fact_sales'. Detalhes do erro: {fact_error}")

# End-of-run banner; printed whether or not the steps above succeeded.
closing_rule = "=" * 80
print("\n" + closing_rule)
print("--- Processamento da Camada Silver para Gold Concluído! ---")
print(f"As tabelas do Star Schema foram criadas no Lakehouse '{gold_lakehouse_name}', Schema '{gold_schema_name}'.")
print(closing_rule)

StatementMeta(, 98114a7f-1dfb-4ec0-a4f3-669ce03f6fb9, 3, Finished, Available, Finished)


--- Carregando tabelas Silver para processamento Gold ---
Tabelas Silver carregadas com sucesso.

--- Processamento da Camada Silver para Gold Iniciado. ---
Escrevendo no Lakehouse Gold: 'Projeto_II_Gold_', Schema: 'Projeto_II_Gold_'

--- Criando dim_stores na Camada Gold ---
Tabela 'dim_stores' criada com sucesso no Lakehouse 'Projeto_II_Gold_'.
root
 |-- store_id: string (nullable = true)
 |-- storetype_id: string (nullable = true)
 |-- store_size: integer (nullable = true)
 |-- city_name: string (nullable = true)

+--------+------------+----------+---------+
|store_id|storetype_id|store_size|city_name|
+--------+------------+----------+---------+
|   S0002|        ST04|        39|    Adana|
|   S0003|        ST03|        17| Istanbul|
|   S0005|        ST04|        19|  Denizli|
|   S0007|        ST03|        16| Istanbul|
|   S0010|        ST04|        17| Istanbul|
+--------+------------+----------+---------+
only showing top 5 rows


--- Criando dim_product na Camada Gold ---
Ta