In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

In [68]:
# Build (or reuse) the Spark session against a local[*] master, with the
# legacy time-parser policy enabled.
# NOTE(review): "spark.executor.instances" presumably has no effect under a
# local master — confirm before relying on it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.instances", "5")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [69]:
# Location of the bronze-layer supermarket sales dataset.
diretorio_parquet = "spark-warehouse/bronze/supermarket_sales"

# Load the Parquet files found at that location into a DataFrame.
dados = spark.read.format("parquet").load(diretorio_parquet)

In [70]:
# Preview the bronze data (first 20 rows, long strings truncated).
dados.show(n=20, truncate=True)

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+-----------+------+-----------------------+------------+------+----+-----+---+----+------+
| invoice_id|branch|     city|customer_type|gender|        product_line|unit_price|quantity|tax_5_percent|   total|    payment|  cogs|gross_margin_percentage|gross_income|rating|year|month|day|hour|minute|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------------+--------+-----------+------+-----------------------+------------+------+----+-----+---+----+------+
|308-81-0538|     A|   Yangon|       Normal|Female| Fashion accessories|     73.05|       4|        14.61|  306.81|Credit card| 292.2|            4.761904762|       14.61|   4.9|2019|    2| 25|  17|    16|
|834-83-1826|     B| Mandalay|       Member|Female|  Home and lifestyle|     82.04|       5|        20.51|  430.71|Credit card| 410.2|            4.761904762|       20.51|   7.

In [74]:
# Register the bronze DataFrame under the view name the queries below read
# from, so this cell does not rely on a view left over from earlier kernel
# state (no other visible cell creates "transacoes").
dados.createOrReplaceTempView("transacoes")

# Build the silver-layer DataFrames from SQL queries over the "transacoes" view:
# one row per distinct customer profile, one row per distinct product/price
# pair, and the full transaction set ordered by total.
df_silver_dim_cliente = spark.sql("SELECT DISTINCT branch, city, customer_type, gender FROM transacoes")
df_silver_dim_produto = spark.sql("SELECT DISTINCT product_line, unit_price FROM transacoes")
df_silver_fato_transacao = spark.sql("SELECT * FROM transacoes order by total")

In [75]:
# Natural-key columns that identify a row in each dimension; the same columns
# are used to order the ranking windows and to join back onto the fact table.
colunas_cliente = ["branch", "city", "customer_type", "gender"]
colunas_produto = ["product_line", "unit_price"]

# Customer IDs: dense rank over the ordered customer natural key.
windowSpecCliente = Window.orderBy(*colunas_cliente)
df_silver_dim_cliente = df_silver_dim_cliente.withColumn(
    "customer_id", dense_rank().over(windowSpecCliente)
)

# Product IDs: dense rank over the ordered product natural key.
windowSpecProduto = Window.orderBy(*colunas_produto)
df_silver_dim_produto = df_silver_dim_produto.withColumn(
    "product_id", dense_rank().over(windowSpecProduto)
)

# Attach both IDs to the transactions with left joins on the natural keys,
# then keep only the columns the fact table needs.
df_silver_fato_transacao = (
    df_silver_fato_transacao
    .join(df_silver_dim_cliente.select("customer_id", *colunas_cliente), colunas_cliente, "left")
    .join(df_silver_dim_produto.select("product_id", *colunas_produto), colunas_produto, "left")
    .select(
        "invoice_id", "product_id", "customer_id", "quantity", "tax_5_percent",
        "total", "payment", "year", "month", "day", "hour", "minute",
    )
)

# Persist the two dimension tables first, then the fact table, overwriting
# whatever is already stored at each destination.
for _tabela, _destino in (
    (df_silver_dim_cliente, "spark-warehouse/silver/dim_cliente"),
    (df_silver_dim_produto, "spark-warehouse/silver/dim_produto"),
    (df_silver_fato_transacao, "spark-warehouse/silver/fato_transacao"),
):
    _tabela.write.mode("overwrite").parquet(_destino)

In [73]:
# Preview the fact table (first 20 rows, long strings truncated).
# NOTE(review): the captured output for this cell (execution count 73) lists
# branch/city/product_line/unit_price, which the current fact-table select
# does not keep — re-run this cell to refresh it.
df_silver_fato_transacao.show(n=20, truncate=True)

+-----------+----------+-----------+------+---------+--------------------+--------+-------------+----------+--------+-----------+----+-----+---+----+------+
| invoice_id|product_id|customer_id|branch|     city|        product_line|quantity|tax_5_percent|unit_price|   total|    payment|year|month|day|hour|minute|
+-----------+----------+-----------+------+---------+--------------------+--------+-------------+----------+--------+-----------+----+-----+---+----+------+
|308-81-0538|       286|          3|     A|   Yangon| Fashion accessories|       4|        14.61|     73.05|  306.81|Credit card|2019|    2| 25|  17|    16|
|834-83-1826|       797|          5|     B| Mandalay|  Home and lifestyle|       5|        20.51|     82.04|  430.71|Credit card|2019|    2| 25|  17|    16|
|873-95-4984|       633|          5|     B| Mandalay|   Health and beauty|       7|       26.915|      76.9| 565.215|       Cash|2019|    2| 15|  20|    21|
|400-80-4065|       616|         10|     C|Naypyitaw|   He