In [0]:
# Load the Bronze "orders" Delta table and print its schema for inspection.
orders = spark.read.load(
    "/Volumes/workspace/default/ifood-files/bronze/orders",
    format="delta",
)
orders.printSchema()

In [0]:
# Load the Bronze "deliveries" Delta table and print its schema for inspection.
deliveries = spark.read.load(
    "/Volumes/workspace/default/ifood-files/bronze/deliveries",
    format="delta",
)
deliveries.printSchema()

In [0]:
# Preview a handful of delivery rows instead of rendering the whole table;
# this matches the `.limit(10)` previews used for the Silver output below.
display(deliveries.limit(10))

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat_ws

# Base location of the Bronze inputs and full path of the Silver output table.
bronze_base = "/Volumes/workspace/default/ifood-files/bronze/"
silver_base = "/Volumes/workspace/default/ifood-files/silver/delivery_center_enriched"

# Load every Bronze Delta table. The targets on the left are unpacked in the
# same order as the table names listed on the right.
channels, deliveries, drivers, hubs, orders, payments, stores = (
    spark.read.format("delta").load(f"{bronze_base}{table_name}")
    for table_name in (
        "channels",
        "deliveries",
        "drivers",
        "hubs",
        "orders",
        "payments",
        "stores",
    )
)

# Rebuild the order timestamp from the separate year / month / day columns.
order_date_text = concat_ws(
    "-", "order_created_year", "order_created_month", "order_created_day"
)
orders = orders.withColumn("order_purchase_timestamp", to_timestamp(order_date_text))

# Left-join the dimension tables onto orders; the aliases keep same-named
# columns (store_id, channel_id, ...) unambiguous in the select below.
df = (
    orders.alias("o")
    .join(payments.alias("p"), on=col("o.order_id") == col("p.payment_order_id"), how="left")
    .join(stores.alias("s"), on=col("o.store_id") == col("s.store_id"), how="left")
    .join(channels.alias("c"), on=col("o.channel_id") == col("c.channel_id"), how="left")
    .join(deliveries.alias("d"), on=col("o.order_id") == col("d.delivery_order_id"), how="left")
)

# Output columns, renamed so the Silver table carries no alias prefixes.
silver_columns = [
    col("o.order_id").alias("order_id"),
    col("o.order_status").alias("order_status"),
    col("order_purchase_timestamp"),
    col("p.payment_method").alias("payment_method"),
    col("p.payment_amount").alias("payment_amount"),
    col("s.store_id").alias("store_id"),
    col("s.store_name").alias("store_name"),
    col("s.hub_id").alias("hub_id"),
    col("c.channel_id").alias("channel_id"),
    col("c.channel_type").alias("channel_type"),
    col("d.delivery_id").alias("delivery_id"),
    col("d.delivery_distance_meters").alias("delivery_distance_meters"),
]

# Project first, then keep a single row per order using the clean column name.
df = df.select(*silver_columns).dropDuplicates(["order_id"])

# Write the result to the Silver layer, replacing any existing data.
df.write.save(silver_base, format="delta", mode="overwrite")

# Read the table back and preview a few rows to confirm the write.
df_check = spark.read.load(silver_base, format="delta")
display(df_check.limit(10))

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat_ws

# Base location of the Bronze inputs and full path of the Silver output table.
bronze_base = "/Volumes/workspace/default/ifood-files/bronze/"
silver_base = "/Volumes/workspace/default/ifood-files/silver/delivery_center_enriched"

# Load the Bronze Delta tables.
channels   = spark.read.format("delta").load(f"{bronze_base}channels")
deliveries = spark.read.format("delta").load(f"{bronze_base}deliveries")
drivers    = spark.read.format("delta").load(f"{bronze_base}drivers")
hubs       = spark.read.format("delta").load(f"{bronze_base}hubs")
orders     = spark.read.format("delta").load(f"{bronze_base}orders")
payments   = spark.read.format("delta").load(f"{bronze_base}payments")
stores     = spark.read.format("delta").load(f"{bronze_base}stores")

# Rebuild the order timestamp from the year / month / day columns.
# Only those three parts are used, so the value carries no time-of-day.
orders = orders.withColumn(
    "order_purchase_timestamp",
    to_timestamp(concat_ws("-", "order_created_year", "order_created_month", "order_created_day"))
)

# Left-join every dimension onto orders. The aliases keep same-named columns
# (store_id, channel_id, hub_id, driver_id) unambiguous in the select below.
df = (
    orders.alias("o")
    .join(payments.alias("p"), col("o.order_id") == col("p.payment_order_id"), "left")
    .join(stores.alias("s"), col("o.store_id") == col("s.store_id"), "left")
    .join(channels.alias("c"), col("o.channel_id") == col("c.channel_id"), "left")
    .join(deliveries.alias("d"), col("o.order_id") == col("d.delivery_order_id"), "left")
    .join(hubs.alias("h"), col("s.hub_id") == col("h.hub_id"), "left")
    .join(drivers.alias("dr"), col("d.driver_id") == col("dr.driver_id"), "left")
)

# Select the output columns with clean names (no alias prefixes).
df = df.select(
    col("o.order_id").alias("order_id"),
    col("o.order_status").alias("order_status"),
    col("order_purchase_timestamp"),
    col("p.payment_method").alias("payment_method"),
    col("p.payment_amount").alias("payment_amount"),
    col("s.store_id").alias("store_id"),
    col("s.store_name").alias("store_name"),
    col("s.hub_id").alias("hub_id"),
    col("c.channel_id").alias("channel_id"),
    col("c.channel_type").alias("channel_type"),
    col("d.delivery_id").alias("delivery_id"),
    col("d.delivery_distance_meters").alias("delivery_distance_meters"),
    col("h.hub_city").alias("hub_city"),
    col("h.hub_state").alias("hub_state"),
    col("dr.driver_id").alias("driver_id"),
    col("dr.driver_name").alias("driver_name")
)

# Keep one row per order_id.
# NOTE(review): if an order can match several payments or deliveries, the row
# kept here is arbitrary -- confirm whether an aggregation is wanted instead.
df = df.dropDuplicates(["order_id"])

# Write to the Silver layer, replacing data AND schema. The same path may
# already hold a table written with fewer columns (without hub_city, hub_state,
# driver_id, driver_name); without overwriteSchema, Delta's schema enforcement
# rejects the overwrite because of the extra columns.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_base)
)

# Read the table back and preview a few rows to confirm the write.
df_check = spark.read.format("delta").load(silver_base)
display(df_check.limit(10))
