In [87]:
import requests
import json
import pyspark
from delta import *

In [88]:
builder = (
    pyspark.sql.SparkSession.builder
    .appName("delta")
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.defaultFS", "file:///")  # Define local como padrão
    .config("spark.hadoop.fs.s3a.access.key", "projeto_final")
    .config("spark.hadoop.fs.s3a.secret.key", "projeto_final")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()





In [89]:

df = (
    spark.read
    .option("multiLine", "true")
    .json("s3a://raw/sptrans/position/")
)


In [90]:
from pyspark.sql.types import *

schema_l = StructType([
    StructField("c", StringType()),
    StructField("cl", LongType()),
    StructField("sl", IntegerType()),
    StructField("lt0", StringType()),
    StructField("lt1", StringType()),
    StructField("qv", IntegerType()),
    StructField("vs", ArrayType(
        StructType([
            StructField("p", LongType()),
            StructField("a", BooleanType()),
            StructField("ta", TimestampType()),
            StructField("py", DoubleType()),
            StructField("px", DoubleType()),
            StructField("sv", StringType()),
            StructField("is", StringType())
        ])
    ))
])


In [91]:
df_vs = df_l.withColumn("veiculo", explode("linha.vs"))


In [92]:
df.show()

+-----+--------------------+----+---+---+
|   hr|                   l| ano|mes|dia|
+-----+--------------------+----+---+---+
|19:27|[{2770-10, 33011,...|2026|  1|  9|
|19:30|[{745M-21, 34752,...|2026|  1|  9|
|19:34|[{745M-21, 34752,...|2026|  1|  9|
|19:37|[{745M-21, 34752,...|2026|  1|  9|
|19:40|[{745M-21, 34752,...|2026|  1|  9|
|19:44|[{2770-10, 243, M...|2026|  1|  9|
|19:47|[{745M-21, 34752,...|2026|  1|  9|
|19:51|[{407L-10, 34967,...|2026|  1|  9|
|19:54|[{745M-21, 34752,...|2026|  1|  9|
|19:57|[{407L-10, 34967,...|2026|  1|  9|
|20:00|[{2770-10, 243, M...|2026|  1|  9|
|20:03|[{2770-10, 243, M...|2026|  1|  9|
|20:06|[{2770-10, 243, M...|2026|  1|  9|
|20:09|[{2770-10, 243, M...|2026|  1|  9|
|20:12|[{675N-10, 33130,...|2026|  1|  9|
|20:15|[{675N-10, 33130,...|2026|  1|  9|
|20:19|[{2770-10, 243, M...|2026|  1|  9|
|20:22|[{2770-10, 243, M...|2026|  1|  9|
|20:25|[{407L-10, 34967,...|2026|  1|  9|
|20:28|[{407L-10, 34967,...|2026|  1|  9|
+-----+--------------------+----+-

In [93]:
from pyspark.sql.functions import col, to_timestamp

df_flat = df_vs.select(
    col("linha.c").alias("codigo_linha"),
    col("linha.cl").alias("codigo_linha_id"),
    col("linha.sl").alias("sentido"),
    col("linha.lt0").alias("origem"),
    col("linha.lt1").alias("destino"),
    col("linha.qv").alias("qtd_veiculos"),

    col("veiculo.p").alias("prefixo"),
    col("veiculo.a").alias("ativo"),
    col("veiculo.px").alias("longitude"),
    col("veiculo.py").alias("latitude"),
    col("veiculo.sv").alias("velocidade"),
    col("veiculo.is").alias("situacao"),
    to_timestamp(col("veiculo.ta")).alias("timestamp_posicao")
)


In [123]:
df_flat.to_pandas_on_spark()

  series = series.astype(t, copy=False)


Unnamed: 0,codigo_linha,codigo_linha_id,sentido,origem,destino,qtd_veiculos,prefixo,ativo,longitude,latitude,velocidade,situacao,timestamp_posicao
0,2770-10,33011,2,METRÔ VL. MATILDE,CPTM JOSÉ BONIFÁCIO,4,31007,True,-46.431186,-23.538865,,,2026-01-09 22:27:34
1,2770-10,33011,2,METRÔ VL. MATILDE,CPTM JOSÉ BONIFÁCIO,4,31667,True,-46.431331,-23.539111,,,2026-01-09 22:27:13
2,2770-10,33011,2,METRÔ VL. MATILDE,CPTM JOSÉ BONIFÁCIO,4,31625,True,-46.471188,-23.502336,,,2026-01-09 22:27:18
3,2770-10,33011,2,METRÔ VL. MATILDE,CPTM JOSÉ BONIFÁCIO,4,31664,True,-46.414,-23.523166,,,2026-01-09 22:27:18
4,6003-21,2576,1,EST. MENDES/VL. NATAL,TERM. VARGINHA,4,61764,True,-46.709183,-23.754978,,,2026-01-09 22:27:25
5,6003-21,2576,1,EST. MENDES/VL. NATAL,TERM. VARGINHA,4,61645,True,-46.71643,-23.766613,,,2026-01-09 22:27:34
6,6003-21,2576,1,EST. MENDES/VL. NATAL,TERM. VARGINHA,4,61318,True,-46.71643,-23.766613,,,2026-01-09 22:27:20
7,6003-21,2576,1,EST. MENDES/VL. NATAL,TERM. VARGINHA,4,61640,True,-46.71643,-23.766613,,,2026-01-09 22:27:22
8,3725-10,1022,1,VL. DALILA,METRÔ VL. MATILDE,3,45479,True,-46.5236,-23.54169,,,2026-01-09 22:27:05
9,3725-10,1022,1,VL. DALILA,METRÔ VL. MATILDE,3,45393,True,-46.532461,-23.533489,,,2026-01-09 22:27:33


In [95]:
from pyspark.sql.functions import (
    col, to_timestamp, to_date, hour, minute
)

df_final = df_flat.withColumn(
    "data", to_date(col("timestamp_posicao"))
).withColumn(
    "hora", hour(col("timestamp_posicao"))
).withColumn(
    "minuto", minute(col("timestamp_posicao"))
)


In [96]:
df_final =  df_final.where('ativo != False')

In [100]:
df_final.columns

['codigo_linha',
 'codigo_linha_id',
 'sentido',
 'origem',
 'destino',
 'qtd_veiculos',
 'prefixo',
 'ativo',
 'longitude',
 'latitude',
 'velocidade',
 'situacao',
 'timestamp_posicao',
 'data',
 'hora',
 'minuto']

In [None]:
df_final.where('codigo_linha_id = 1059 and prefixo = 47821 and timestamp_posicao = "2026-01-09 23:05:54"').to_pandas_on_spark()

  series = series.astype(t, copy=False)


Unnamed: 0,codigo_linha,codigo_linha_id,sentido,origem,destino,qtd_veiculos,prefixo,ativo,longitude,latitude,velocidade,situacao,timestamp_posicao,data,hora,minuto
0,3773-10,1059,1,METRÔ CARRÃO,RES. STA. BÁRBARA,6,47821,True,-46.496814,-23.578528,,,2026-01-09 23:05:54,2026-01-09,23,5
1,3773-10,1059,1,METRÔ CARRÃO,RES. STA. BÁRBARA,6,47821,True,-46.496814,-23.578528,,,2026-01-09 23:05:54,2026-01-09,23,5


In [113]:
from pyspark.sql import functions as F

df_final.groupBy('codigo_linha_id', 'prefixo','timestamp_posicao') \
        .agg(F.countDistinct('timestamp_posicao').alias('qtd_posicoes_distintas')) \
        .sort(F.desc('qtd_posicoes_distintas')) \
        .show()


+---------------+-------+-------------------+----------------------+
|codigo_linha_id|prefixo|  timestamp_posicao|qtd_posicoes_distintas|
+---------------+-------+-------------------+----------------------+
|          34977|  52833|2026-01-09 22:50:46|                     1|
|          33859|  55130|2026-01-09 22:51:02|                     1|
|          34920|  73214|2026-01-09 22:50:41|                     1|
|          33933|  66597|2026-01-09 22:50:54|                     1|
|          33550|  85583|2026-01-09 22:50:40|                     1|
|          32769|  71756|2026-01-09 22:50:58|                     1|
|          33393|  22732|2026-01-09 22:50:41|                     1|
|          33708|  35946|2026-01-09 22:50:57|                     1|
|           2209|  52731|2026-01-09 22:50:33|                     1|
|          34744|  68766|2026-01-09 22:50:31|                     1|
|          33571|  16515|2026-01-09 22:50:28|                     1|
|            921|  36217|2026-01-0

In [114]:
from pyspark.sql import functions as F

df_final.groupBy("codigo_linha_id", "prefixo", "timestamp_posicao") \
        .count() \
        .filter(F.col("count") > 1) \
        .show()


+---------------+-------+-------------------+-----+
|codigo_linha_id|prefixo|  timestamp_posicao|count|
+---------------+-------+-------------------+-----+
|           1671|  55445|2026-01-09 22:26:19|    2|
|          34507|  66903|2026-01-09 22:42:32|    2|
|          34448|  55367|2026-01-09 22:25:32|    2|
|           1333|  82446|2026-01-09 22:44:07|    2|
|           2633|  73424|2026-01-09 22:47:05|    2|
|           1371|  81195|2026-01-09 22:31:51|    2|
|           1680|  55367|2026-01-09 22:40:15|    2|
|           1921|  45196|2026-01-09 22:36:08|    2|
|          33043|  64723|2026-01-09 22:43:55|    2|
|           1116|  68504|2026-01-09 22:24:41|    2|
|           1186|  66100|2026-01-09 22:31:00|    2|
|          35271|  75250|2026-01-09 22:36:21|    2|
|            408|  10430|2026-01-09 22:42:14|    2|
|          34660|  80314|2026-01-09 22:37:00|    2|
|            376|  31066|2026-01-09 22:34:44|    2|
|           1042|  47174|2026-01-09 22:45:34|    2|
|          3

In [115]:
df_final = df_final.dropDuplicates(["codigo_linha_id", "prefixo", "timestamp_posicao"])


In [116]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Criar uma janela por veículo e linha, ordenando pelo timestamp decrescente
window = Window.partitionBy("codigo_linha_id", "prefixo").orderBy(F.col("timestamp_posicao").desc())

# Adicionar número de linha dentro da janela
df_final = df_final.withColumn("rank", F.row_number().over(window)) \
                   .filter(F.col("rank") == 1) \
                   .drop("rank")


In [117]:
df_final.count()

15762

In [118]:
from pyspark.sql import functions as F

df_final.groupBy("codigo_linha_id", "prefixo", "timestamp_posicao") \
        .count() \
        .filter(F.col("count") > 1) \
        .show()


+---------------+-------+-----------------+-----+
|codigo_linha_id|prefixo|timestamp_posicao|count|
+---------------+-------+-----------------+-----+
+---------------+-------+-----------------+-----+



In [119]:
from delta.tables import DeltaTable
delta_path = "s3a://trusted/sptrans/position"
if DeltaTable.isDeltaTable(spark, delta_path):
    delta_table = DeltaTable.forPath(spark, delta_path)
else:
    df_final.write.format("delta") \
        .mode("overwrite") \
        .partitionBy("data") \
        .save(delta_path)

    delta_table = DeltaTable.forPath(spark, delta_path)


In [120]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3a://trusted/sptrans/position")

(
    delta_table.alias("t")
    .merge(
        df_final.alias("s"),
        """
        t.codigo_linha_id = s.codigo_linha_id
        AND t.prefixo = s.prefixo
        """
    )
    .whenMatchedUpdateAll()  # Atualiza a última posição
    .whenNotMatchedInsertAll()  # Insere se não existir
    .execute()
)


In [121]:
spark.read.delta()

AttributeError: 'DeltaTable' object has no attribute 'show'