In [24]:
from pyspark.sql.functions import window, col

# Directory containing the Parquet files that feed the stream
inputPath = "/home/jovyan/work/parquet_output"

# Step 1: take the schema from a one-off static (batch) read of the directory
inferredSchema = spark.read.parquet(inputPath).schema

# Step 2: open the same directory as a stream, reusing that schema and
# consuming at most one file per trigger
streamingInputDF = (
    spark.readStream
    .schema(inferredSchema)
    .option("maxFilesPerTrigger", 1)
    .parquet(inputPath)
)

# Count events per department within 5-minute tumbling windows on createdAt.
# createdAt is cast to a timestamp first so it can be used as the window's
# time column (adjust the column name if your field is called something else).
eventsWithTimestamp = streamingInputDF.withColumn(
    "createdAt",
    col("createdAt").cast("timestamp"),
)
streamingCountsDF = (
    eventsWithTimestamp
    .groupBy("department", window("createdAt", "5 minutes"))
    .count()
)



In [31]:
# In-memory query for interactive analysis: the aggregated counts are kept in
# the driver and exposed as the temporary table "contagem".
#
# A query name can only be active once per Spark session, so starting this
# cell a second time would fail while the previous run is still alive. Stop
# any earlier query with the same name first so the cell is safe to re-run.
for running_query in spark.streams.active:
    if running_query.name == "contagem":
        running_query.stop()

query = (
    streamingCountsDF
      .writeStream
      .format("memory")
      .queryName("contagem")
      # "complete" re-emits the full aggregated result table on every trigger
      .outputMode("complete")
      .start()
)


In [32]:
# Show the streaming queries that are currently active in this Spark session
spark.streams.active

[<pyspark.sql.streaming.query.StreamingQuery at 0x7fe18f065490>]

In [33]:
# List the catalog's tables; the memory sink's temporary table "contagem" should appear here
spark.catalog.listTables()


[Table(name='contagem', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [37]:
# Per-department counts for each window, most recent window first.
# The sort uses the real window.end timestamp rather than the formatted
# "HH:mm" label: the label drops the date, so ordering by it would interleave
# windows from different days (e.g. 23:55 yesterday above 00:05 today).
spark.sql("""
SELECT department,
       date_format(window.end, "HH:mm") AS fim_janela,
       count
FROM contagem
ORDER BY window.end DESC, count DESC
""").show(truncate=False)


+-----------+----------+-----+
|department |fim_janela|count|
+-----------+----------+-----+
|Music      |01:30     |424  |
|Books      |01:30     |417  |
|Sports     |01:30     |411  |
|Beauty     |01:30     |394  |
|Home       |01:30     |393  |
|Industrial |01:30     |393  |
|Movies     |01:30     |392  |
|Garden     |01:30     |390  |
|Tools      |01:30     |386  |
|Games      |01:30     |384  |
|Jewelery   |01:30     |384  |
|Computers  |01:30     |383  |
|Health     |01:30     |380  |
|Clothing   |01:30     |380  |
|Shoes      |01:30     |376  |
|Electronics|01:30     |372  |
|Kids       |01:30     |370  |
|Baby       |01:30     |368  |
|Grocery    |01:30     |366  |
|Automotive |01:30     |359  |
+-----------+----------+-----+
only showing top 20 rows



In [38]:
# Total number of events counted so far, summed over every department and window
total_events_df = spark.sql("""
SELECT sum(count)
FROM contagem 
""")
total_events_df.show(truncate=False)

+----------+
|sum(count)|
+----------+
|56700     |
+----------+



In [39]:
# Same total again: re-running it while the stream is active shows the sum
# growing as more files are ingested
total_events_df = spark.sql("""
SELECT sum(count)
FROM contagem 
""")
total_events_df.show(truncate=False)

+----------+
|sum(count)|
+----------+
|63600     |
+----------+



In [40]:
# Re-check the running total of counted events in the in-memory table
total_events_df = spark.sql("""
SELECT sum(count)
FROM contagem 
""")
total_events_df.show(truncate=False)

+----------+
|sum(count)|
+----------+
|82100     |
+----------+

