In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window
from pyspark.sql.types import StructType, StringType, TimestampType


In [4]:
# Obtain the Spark session for this notebook (reusing an existing one if present),
# registered under the application name "ParquetAggregation".
spark = (
    SparkSession.builder
    .appName("ParquetAggregation")
    .getOrCreate()
)


In [5]:


# Directory containing the parquet files to be consumed as a stream.
inputPath = "/home/jovyan/work/parquet_output"

# Step 1: do a one-off batch read of the same directory purely to obtain its schema.
inferredSchema = spark.read.format("parquet").load(inputPath).schema

# Step 2: open the directory as a streaming source, handing it the schema found above.
# maxFilesPerTrigger=1 caps each micro-batch at a single new file.
streamingInputDF = (
    spark.readStream
    .format("parquet")
    .schema(inferredSchema)
    .option("maxFilesPerTrigger", 1)
    .load(inputPath)
)

# Width of the event-time window used for the aggregation below.
WINDOW_DURATION = "5 minutes"

# Count rows per department within each window over `createdAt`.
# No slide interval is passed to window(), so consecutive windows do not overlap.
streamingCountsDF = (
    streamingInputDF
      # Cast so window() receives a timestamp column; adjust the name if your field differs.
      .withColumn("createdAt", col("createdAt").cast("timestamp"))
      .groupBy(
          "department",
          window(col("createdAt"), WINDOW_DURATION)
      )
      .count()
)



In [6]:
# Start the streaming query, writing its results to an in-memory table
# so they can be explored interactively with SQL.
query = (
    streamingCountsDF
      .writeStream
      .outputMode("complete")   # every trigger rewrites the full aggregated result
      .queryName("contagem")    # the in-memory table is exposed under this name
      .format("memory")
      .start()
)


In [7]:
# Display the streaming queries currently active in this Spark session.
spark.streams.active

[<pyspark.sql.streaming.query.StreamingQuery at 0x7fce5c206710>]

In [8]:
# Display the tables registered in the catalog; the memory-sink query named
# "contagem" is expected to appear here as a temporary table.
spark.catalog.listTables()


[Table(name='contagem', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [14]:
# Show per-department counts, most recent window first and, within a window,
# largest count first.
# Ordering uses the real `window.end` timestamp rather than the formatted label:
# the 'HH:mm' text only carries the time of day, so sorting on it would mis-order
# windows as soon as the data spans more than one day.
spark.sql("""
SELECT department,
       date_format(window.end, 'HH:mm') AS fim_janela,
       count
FROM contagem
ORDER BY window.end DESC, count DESC
""").show(truncate=False)


+-----------+----------+-----+
|department |fim_janela|count|
+-----------+----------+-----+
|Toys       |00:50     |233  |
|Shoes      |00:50     |221  |
|Clothing   |00:50     |220  |
|Industrial |00:50     |213  |
|Kids       |00:50     |211  |
|Jewelery   |00:50     |211  |
|Home       |00:50     |211  |
|Baby       |00:50     |210  |
|Outdoors   |00:50     |210  |
|Movies     |00:50     |207  |
|Health     |00:50     |206  |
|Computers  |00:50     |206  |
|Grocery    |00:50     |202  |
|Automotive |00:50     |201  |
|Garden     |00:50     |200  |
|Beauty     |00:50     |198  |
|Games      |00:50     |194  |
|Electronics|00:50     |194  |
|Tools      |00:50     |194  |
|Books      |00:50     |192  |
+-----------+----------+-----+
only showing top 20 rows



In [11]:
# Grand total of the per-department/window counts currently held in the in-memory table.
# Re-running this while the stream is still ingesting files should show the total growing.
spark.table("contagem").selectExpr("sum(count)").show(truncate=False)

+----------+
|sum(count)|
+----------+
|1299      |
+----------+



In [12]:
# Grand total of the per-department/window counts currently held in the in-memory table.
# Re-running this while the stream is still ingesting files should show the total growing.
spark.table("contagem").selectExpr("sum(count)").show(truncate=False)

+----------+
|sum(count)|
+----------+
|2299      |
+----------+



In [13]:
# Grand total of the per-department/window counts currently held in the in-memory table.
# Re-running this while the stream is still ingesting files should show the total growing.
spark.table("contagem").selectExpr("sum(count)").show(truncate=False)

+----------+
|sum(count)|
+----------+
|2898      |
+----------+

