In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, ArrayType, BooleanType, TimestampType, DoubleType
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from delta import *
# Kept after the wildcard imports (original order) so name resolution is unchanged.
from datetime import datetime

In [None]:
# Session builder for the "Trusted" job: enables the Delta Lake SQL extension and
# catalog, and configures the S3A connector for the object store.
#
# The S3A credentials and endpoint are read from the environment when set
# (MINIO_ACCESS_KEY, MINIO_SECRET_KEY, MINIO_ENDPOINT). The literals passed as
# defaults are the values that used to be hard-coded here; they are kept only as
# fallbacks so existing environments keep working without any new configuration.
builder = (
    SparkSession.builder
    .appName("Trusted")
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.defaultFS", "file:///")  # Local filesystem is the default
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "projeto_final"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "projeto_final"))
    .config("spark.hadoop.fs.s3a.endpoint", os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
)

In [None]:
# Let the delta-spark helper wrap the builder, then create (or reuse) the session.
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

# Eager evaluation: a DataFrame left as the last expression of a cell is rendered.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
# Read every JSON document found under the raw position prefix (sub-folders
# included); multiLine lets a single JSON document span several lines.
df = (
    spark.read
    .options(multiLine="true", recursiveFileLookup="true")
    .json("s3a://raw/sptrans/position/")
)

In [None]:
# One output row per element of the top-level array "l", exposed as column "linha".
df_linhas = df.select(explode("l").alias("linha"))

In [None]:
# Pull the line-level attributes out of the "linha" struct and emit one row per
# element of its "vs" array (column "veiculo").
df_veiculos = df_linhas.selectExpr(
    "linha.c AS codigo_linha",
    "linha.cl AS codigo_linha_id",
    "linha.sl AS sentido",
    "linha.lt0 AS origem",
    "linha.lt1 AS destino",
    "explode(linha.vs) AS veiculo",
)

In [None]:
# Flatten the "veiculo" struct: line-level columns pass through unchanged, the
# vehicle fields are renamed, and "ta" is converted to a timestamp column.
df_flat = df_veiculos.select(
    "codigo_linha",
    "codigo_linha_id",
    "sentido",
    "origem",
    "destino",
    *[
        col(f"veiculo.{campo}").alias(nome)
        for campo, nome in (
            ("p", "prefixo"),
            ("a", "ativo"),
            ("py", "latitude"),
            ("px", "longitude"),
        )
    ],
    to_timestamp("veiculo.ta").alias("timestamp_posicao"),
)

In [None]:
# Register the flattened frame so the SQL cell below can read it as `position_raw`.
df_flat.createOrReplaceTempView(name="position_raw")

In [None]:
# SQL over the `position_raw` temp view:
#   * rewrites `sentido` as a text label (value 1 -> 'TERMINAL PRINCIPAL PARA
#     SECUNDÁRIO'; any other value, including NULL, falls into the ELSE label);
#   * derives `data`, `hora` and `minuto` from `timestamp_posicao`;
#   * numbers the rows of each (codigo_linha_id, prefixo) pair by
#     `timestamp_posicao` descending and keeps only rn = 1, i.e. the most recent
#     row of each pair.
query = """
        WITH
        posicoes AS
        (
            SELECT 
                codigo_linha,
                codigo_linha_id,
                CASE
                WHEN sentido = 1
                    THEN 'TERMINAL PRINCIPAL PARA SECUNDÁRIO'
                    ELSE 'TERMINAL SECUNDÁRIO PARA PRINCIPAL'
                END AS sentido,
                origem,
                destino,
                prefixo,
                ativo,
                latitude,
                longitude,
                timestamp_posicao,
                DATE(timestamp_posicao) AS data,
                DATE_FORMAT(timestamp_posicao,'HH') AS hora,
                DATE_FORMAT(timestamp_posicao,'mm') AS minuto,
                ROW_NUMBER() OVER(PARTITION BY codigo_linha_id, prefixo ORDER BY timestamp_posicao DESC) AS rn
            FROM position_raw
        )
        SELECT
            *
        FROM posicoes
        WHERE
            rn = 1          
            """

# `rn` is only a helper for the deduplication above, so it is dropped from the result.
resultado_df = spark.sql(query).drop('rn')
# resultado_df  # uncomment to preview the result

In [None]:
delta_path = "s3a://trusted/sptrans/position"

# Bootstrap: when no Delta table exists at the path yet, create it from the
# current result, partitioned by "data".
if not DeltaTable.isDeltaTable(spark, delta_path):
    (
        resultado_df.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("data")
        .save(delta_path)
    )

# In both cases, end up with a handle to the table at delta_path.
delta_table = DeltaTable.forPath(spark, delta_path)

In [None]:
# Upsert the deduplicated positions into the trusted Delta table, keyed by
# (codigo_linha_id, prefixo).
delta_table = DeltaTable.forPath(spark, "s3a://trusted/sptrans/position")

(
    delta_table.alias("t")
    .merge(
        resultado_df.alias("s"),
        """
        t.codigo_linha_id = s.codigo_linha_id
        AND t.prefixo = s.prefixo
        """
    )
    # Update only when the incoming row is strictly newer than the stored one (or
    # the stored timestamp is missing). This keeps an older source row from
    # overwriting a newer stored position, and avoids rewriting rows whose
    # timestamp has not changed since the previous run.
    .whenMatchedUpdateAll(
        condition="t.timestamp_posicao IS NULL OR s.timestamp_posicao > t.timestamp_posicao"
    )
    .whenNotMatchedInsertAll()  # Insert the pair when it does not exist yet
    .execute()
)
