In [1]:
import requests
import json
import pyspark
from delta import *

In [2]:
builder = (
    pyspark.sql.SparkSession.builder
    .appName("delta")
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.defaultFS", "file:///")  # Define local como padrão
    .config("spark.hadoop.fs.s3a.access.key", "projeto_final")
    .config("spark.hadoop.fs.s3a.secret.key", "projeto_final")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()





In [5]:

df = (
    spark.read
    .option("multiLine", "true")
    .json("s3a://raw/sptrans/position/")
)


In [16]:
from pyspark.sql.types import *

schema_l = StructType([
    StructField("c", StringType()),
    StructField("cl", LongType()),
    StructField("sl", IntegerType()),
    StructField("lt0", StringType()),
    StructField("lt1", StringType()),
    StructField("qv", IntegerType()),
    StructField("vs", ArrayType(
        StructType([
            StructField("p", LongType()),
            StructField("a", BooleanType()),
            StructField("ta", TimestampType()),
            StructField("py", DoubleType()),
            StructField("px", DoubleType()),
            StructField("sv", StringType()),
            StructField("is", StringType())
        ])
    ))
])


In [17]:
from pyspark.sql.functions import from_json, col

df_parsed = df.withColumn("l", from_json(col("l"), schema_l))


AnalysisException: cannot resolve 'from_json(l)' due to data type mismatch: argument 1 requires string type, however, 'l' is of array<struct<c:string,cl:bigint,lt0:string,lt1:string,qv:bigint,sl:bigint,vs:array<struct<a:boolean,is:string,p:bigint,px:double,py:double,sv:string,ta:string>>>> type.;
'Project [hr#12, from_json(StructField(c,StringType,true), StructField(cl,LongType,true), StructField(sl,IntegerType,true), StructField(lt0,StringType,true), StructField(lt1,StringType,true), StructField(qv,IntegerType,true), StructField(vs,ArrayType(StructType(StructField(p,LongType,true),StructField(a,BooleanType,true),StructField(ta,TimestampType,true),StructField(py,DoubleType,true),StructField(px,DoubleType,true),StructField(sv,StringType,true),StructField(is,StringType,true)),true),true), l#13, Some(Etc/UTC)) AS l#77, ano#14, mes#15, dia#16]
+- Relation [hr#12,l#13,ano#14,mes#15,dia#16] json


In [19]:
df_vs = df_l.withColumn("veiculo", explode("linha.vs"))


In [35]:
df.show()

+-----+--------------------+----+---+---+
|   hr|                   l| ano|mes|dia|
+-----+--------------------+----+---+---+
|19:27|[{2770-10, 33011,...|2026|  1|  9|
|19:30|[{745M-21, 34752,...|2026|  1|  9|
|19:34|[{745M-21, 34752,...|2026|  1|  9|
|19:37|[{745M-21, 34752,...|2026|  1|  9|
|19:40|[{745M-21, 34752,...|2026|  1|  9|
|19:44|[{2770-10, 243, M...|2026|  1|  9|
|19:47|[{745M-21, 34752,...|2026|  1|  9|
|19:51|[{407L-10, 34967,...|2026|  1|  9|
|19:54|[{745M-21, 34752,...|2026|  1|  9|
|19:57|[{407L-10, 34967,...|2026|  1|  9|
|20:00|[{2770-10, 243, M...|2026|  1|  9|
|20:03|[{2770-10, 243, M...|2026|  1|  9|
|20:06|[{2770-10, 243, M...|2026|  1|  9|
|20:09|[{2770-10, 243, M...|2026|  1|  9|
|20:12|[{675N-10, 33130,...|2026|  1|  9|
+-----+--------------------+----+---+---+



In [24]:
from pyspark.sql.functions import col, to_timestamp

df_flat = df_vs.select(
    col("linha.c").alias("codigo_linha"),
    col("linha.cl").alias("codigo_linha_id"),
    col("linha.sl").alias("sentido"),
    col("linha.lt0").alias("origem"),
    col("linha.lt1").alias("destino"),
    col("linha.qv").alias("qtd_veiculos"),

    col("veiculo.p").alias("prefixo"),
    col("veiculo.a").alias("ativo"),
    col("veiculo.px").alias("longitude"),
    col("veiculo.py").alias("latitude"),
    col("veiculo.sv").alias("velocidade"),
    col("veiculo.is").alias("situacao"),
    to_timestamp(col("veiculo.ta")).alias("timestamp_posicao")
)


In [44]:
df_flat.show()

+------------+---------------+-------+--------------------+-------------------+------------+-------+-----+-------------------+-------------------+----------+--------+-------------------+
|codigo_linha|codigo_linha_id|sentido|              origem|            destino|qtd_veiculos|prefixo|ativo|          longitude|           latitude|velocidade|situacao|  timestamp_posicao|
+------------+---------------+-------+--------------------+-------------------+------------+-------+-----+-------------------+-------------------+----------+--------+-------------------+
|     2770-10|          33011|      2|   METRÔ VL. MATILDE|CPTM JOSÉ BONIFÁCIO|           4|  31007| true|-46.431186499999995|         -23.538865|      null|    null|2026-01-09 22:27:34|
|     2770-10|          33011|      2|   METRÔ VL. MATILDE|CPTM JOSÉ BONIFÁCIO|           4|  31667| true|        -46.4313315|         -23.539111|      null|    null|2026-01-09 22:27:13|
|     2770-10|          33011|      2|   METRÔ VL. MATILDE|CPTM J

In [30]:
from pyspark.sql.functions import (
    col, to_timestamp, to_date, hour, minute
)

df_final = df_flat.withColumn(
    "data", to_date(col("timestamp_posicao"))
).withColumn(
    "hora", hour(col("timestamp_posicao"))
).withColumn(
    "minuto", minute(col("timestamp_posicao"))
)


In [48]:
df_final =  df_final.where('ativo != False')

In [49]:
import pyspark.pandas as ps

psdf = df_final.limit(100).to_pandas_on_spark()
display(psdf)


  series = series.astype(t, copy=False)


Unnamed: 0,codigo_linha,codigo_linha_id,sentido,origem,destino,qtd_veiculos,prefixo,ativo,longitude,latitude,velocidade,situacao,timestamp_posicao,data,hora,minuto
0,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48277,True,-46.426711,-23.538772,,,2026-01-09 22:50:52,2026-01-09,22,50
1,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48628,True,-46.495247,-23.531027,,,2026-01-09 22:50:48,2026-01-09,22,50
2,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48002,True,-46.465231,-23.536119,,,2026-01-09 22:50:49,2026-01-09,22,50
3,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48879,True,-46.394528,-23.591108,,,2026-01-09 22:51:03,2026-01-09,22,51
4,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48454,True,-46.425599,-23.541319,,,2026-01-09 22:50:55,2026-01-09,22,50
5,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48968,True,-46.392881,-23.582254,,,2026-01-09 22:50:45,2026-01-09,22,50
6,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48624,True,-46.394862,-23.590416,,,2026-01-09 22:50:32,2026-01-09,22,50
7,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48629,True,-46.415624,-23.542699,,,2026-01-09 22:50:27,2026-01-09,22,50
8,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48631,True,-46.391979,-23.582608,,,2026-01-09 22:50:25,2026-01-09,22,50
9,407L-10,34967,2,METRÔ GUILHERMINA/ESPERANÇA,BARRO BRANCO,17,48817,True,-46.39492,-23.590318,,,2026-01-09 22:50:45,2026-01-09,22,50


In [63]:
from pyspark.sql import functions as F

df_final.groupBy('codigo_linha_id', 'prefixo','timestamp_posicao') \
        .agg(F.countDistinct('timestamp_posicao').alias('qtd_posicoes_distintas')) \
        .sort(F.desc('qtd_posicoes_distintas')) \
        .show()


+---------------+-------+-------------------+----------------------+
|codigo_linha_id|prefixo|  timestamp_posicao|qtd_posicoes_distintas|
+---------------+-------+-------------------+----------------------+
|          34977|  52833|2026-01-09 22:50:46|                     1|
|          33859|  55130|2026-01-09 22:51:02|                     1|
|          34920|  73214|2026-01-09 22:50:41|                     1|
|          33933|  66597|2026-01-09 22:50:54|                     1|
|          33550|  85583|2026-01-09 22:50:40|                     1|
|          32769|  71756|2026-01-09 22:50:58|                     1|
|          33393|  22732|2026-01-09 22:50:41|                     1|
|          33708|  35946|2026-01-09 22:50:57|                     1|
|           2209|  52731|2026-01-09 22:50:33|                     1|
|          34744|  68766|2026-01-09 22:50:31|                     1|
|          33571|  16515|2026-01-09 22:50:28|                     1|
|            921|  36217|2026-01-0

In [64]:
from pyspark.sql import functions as F

df_final.groupBy("codigo_linha_id", "prefixo", "timestamp_posicao") \
        .count() \
        .filter(F.col("count") > 1) \
        .show()


+---------------+-------+-------------------+-----+
|codigo_linha_id|prefixo|  timestamp_posicao|count|
+---------------+-------+-------------------+-----+
|           1059|  47821|2026-01-09 23:05:54|    2|
|           1059|  47231|2026-01-09 23:08:52|    2|
|           1041|  47274|2026-01-09 23:06:00|    2|
|          33229|  11511|2026-01-09 22:51:02|    3|
|           2633|  73424|2026-01-09 22:47:05|    2|
|           1042|  47603|2026-01-09 22:52:49|    2|
|          32858|  66133|2026-01-09 22:58:47|    2|
|           1156|  66062|2026-01-09 22:59:48|    2|
|           1316|  77862|2026-01-09 22:50:43|    2|
|           2182|  48220|2026-01-09 22:50:05|    2|
|          33699|  45245|2026-01-09 23:00:16|    2|
|          33841|  36621|2026-01-09 22:55:44|    2|
|            973|  45356|2026-01-09 23:03:16|    2|
|          35271|  75432|2026-01-09 22:52:38|    2|
|           1038|  47402|2026-01-09 22:53:40|    2|
|           1253|  10659|2026-01-09 23:02:50|    2|
|           

In [82]:
df_final = df_final.dropDuplicates(["codigo_linha_id", "prefixo", "timestamp_posicao"])


In [83]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Criar uma janela por veículo e linha, ordenando pelo timestamp decrescente
window = Window.partitionBy("codigo_linha_id", "prefixo").orderBy(F.col("timestamp_posicao").desc())

# Adicionar número de linha dentro da janela
df_final = df_final.withColumn("rank", F.row_number().over(window)) \
                   .filter(F.col("rank") == 1) \
                   .drop("rank")


In [84]:
df_final.count()

15762

In [85]:
from pyspark.sql import functions as F

df_final.groupBy("codigo_linha_id", "prefixo", "timestamp_posicao") \
        .count() \
        .filter(F.col("count") > 1) \
        .show()


+---------------+-------+-----------------+-----+
|codigo_linha_id|prefixo|timestamp_posicao|count|
+---------------+-------+-----------------+-----+
+---------------+-------+-----------------+-----+



In [86]:
from delta.tables import DeltaTable
delta_path = "s3a://trusted/sptrans/position"
if DeltaTable.isDeltaTable(spark, delta_path):
    delta_table = DeltaTable.forPath(spark, delta_path)
else:
    df_final.write.format("delta") \
        .mode("overwrite") \
        .partitionBy("data") \
        .save(delta_path)

    delta_table = DeltaTable.forPath(spark, delta_path)


In [80]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3a://trusted/sptrans/position")

(
    delta_table.alias("t")
    .merge(
        df_final.alias("s"),
        """
        t.codigo_linha_id = s.codigo_linha_id
        AND t.prefixo = s.prefixo
        """
    )
    .whenMatchedUpdateAll()  # Atualiza a última posição
    .whenNotMatchedInsertAll()  # Insere se não existir
    .execute()
)


In [122]:
df_delta = spark.read.format("delta").load("s3a://trusted/sptrans/position")

df_delta.to_pandas_on_spark()


  series = series.astype(t, copy=False)


Unnamed: 0,codigo_linha,codigo_linha_id,sentido,origem,destino,qtd_veiculos,prefixo,ativo,longitude,latitude,velocidade,situacao,timestamp_posicao,data,hora,minuto
0,5119-10,1,1,LGO. SÃO FRANCISCO,TERM. CAPELINHA,6,71166,True,-46.643634,-23.560762,,,2026-01-09 23:12:34,2026-01-09,23,12
1,5119-10,1,1,LGO. SÃO FRANCISCO,TERM. CAPELINHA,6,71173,True,-46.680768,-23.584486,,,2026-01-09 23:12:54,2026-01-09,23,12
2,5119-10,1,1,LGO. SÃO FRANCISCO,TERM. CAPELINHA,6,71692,True,-46.637157,-23.552389,,,2026-01-09 22:57:01,2026-01-09,22,57
3,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,7,75401,True,-46.710583,-23.654157,,,2026-01-09 23:12:41,2026-01-09,23,12
4,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,8,75402,True,-46.710583,-23.654157,,,2026-01-09 23:02:59,2026-01-09,23,2
5,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,7,75405,True,-46.6575,-23.575655,,,2026-01-09 23:12:58,2026-01-09,23,12
6,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,9,75504,True,-46.6434,-23.53723,,,2026-01-09 22:57:30,2026-01-09,22,57
7,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,7,75505,True,-46.710583,-23.654157,,,2026-01-09 23:12:26,2026-01-09,23,12
8,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,7,75507,True,-46.710583,-23.654157,,,2026-01-09 23:12:48,2026-01-09,23,12
9,669A-10,4,1,TERM. PRINC. ISABEL,TERM. STO. AMARO,8,77853,True,-46.645591,-23.538772,,,2026-01-09 22:36:46,2026-01-09,22,36
