In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

spark = SparkSession.builder \
    .appName('User Raw to Trusted') \
    .config('spark.sql.extensions','io.delta.sql.DeltaSparkSessionExtension') \
    .config('spark.sql.catalog.spark_catalog','org.apache.spark.sql.delta.catalog.DeltaCatalog') \
    .getOrCreate()

In [5]:
raw_path_previsao_chegada = 's3a://raw/previsao_chegada'

In [6]:
df_previsao_chegada = spark.read.json(raw_path_previsao_chegada)

In [7]:
df_explode = df_previsao_chegada.withColumn("ps_explod", explode("ps"))\
    .withColumn("vs_explod", explode("ps_explod.vs")).select(col("hr").alias("dat_ref_carga"),"ps_explod","vs_explod")

In [8]:
df_prev_chegada = df_explode.select(
    col("dat_ref_carga"),
    col("ps_explod.cp").alias("ID_PARADA"), 
    col("ps_explod.np").alias("nome_parada"),
    col("ps_explod.px").alias("latitude_loc"), 
    col("ps_explod.py").alias("longtitude"),
    col("vs_explod.a").alias("VEICULO_ACESSIVEL"),
    col("vs_explod.is").alias("TIMESTAMP"),
    col("vs_explod.p").alias("PREFIXO_VEICULO"),
    col("vs_explod.px").alias("LATITUDE_VEICULO"),
    col("vs_explod.py").alias("LONGITUDE_VEICULO"),
    col("vs_explod.t").alias("HORARIO_PREVISTO_CHEADA")    
)

In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, radians, sin, cos, atan2, sqrt
from pyspark.sql.types import DoubleType
R = 6371.0

# Converter as colunas de latitude e longitude para radianos
df = df_prev_chegada.withColumn("lat1_rad", radians(col("latitude_loc"))) \
       .withColumn("lon1_rad", radians(col("longtitude"))) \
       .withColumn("lat2_rad", radians(col("LATITUDE_VEICULO"))) \
       .withColumn("lon2_rad", radians(col("LONGITUDE_VEICULO")))

# Calcular a diferença entre as coordenadas
df = df.withColumn("dlat", col("lat2_rad") - col("lat1_rad")) \
       .withColumn("dlon", col("lon2_rad") - col("lon1_rad"))

# Aplicar a fórmula de Haversine
df = df.withColumn("a", sin(col("dlat") / 2)**2 + cos(col("lat1_rad")) * cos(col("lat2_rad")) * sin(col("dlon") / 2)**2)

df = df.withColumn("c", 2 * atan2(sqrt(col("a")), sqrt(1 - col("a"))))

# Calcular a distância final
df = df.withColumn("distancia_km", col("c") * R)

# Selecionar as colunas de interesse
df_final = df.select("dat_ref_carga","id_parada","horario_previsto_cheada","PREFIXO_VEICULO","latitude_loc", "longtitude", "LATITUDE_VEICULO", "LONGITUDE_VEICULO", "distancia_km")

In [40]:
from pyspark.sql.functions import to_timestamp, unix_timestamp, expr, lit, abs

df_final = df_final.withColumn("diff_in_minute", abs(((unix_timestamp("dat_ref_carga","HH:mm")-unix_timestamp("horario_previsto_cheada","HH:mm"))% lit(3600))/60))

In [41]:
df_final.show()

+-------------+---------+-----------------------+---------------+------------+----------+-------------------+-------------------+------------------+--------------+
|dat_ref_carga|id_parada|horario_previsto_cheada|PREFIXO_VEICULO|latitude_loc|longtitude|   LATITUDE_VEICULO|  LONGITUDE_VEICULO|      distancia_km|diff_in_minute|
+-------------+---------+-----------------------+---------------+------------+----------+-------------------+-------------------+------------------+--------------+
|        20:47|360004796|                  20:53|          31943|  -46.444853|-23.513681| -46.46042166666666| -23.51553333333333|1.7369629819857777|           6.0|
|        20:47|360004796|                  21:03|          31936|  -46.444853|-23.513681|-46.485756666666674|-23.515643333333333| 4.550762764692933|          16.0|
|        20:47|360004796|                  21:04|          31840|  -46.444853|-23.513681|-46.490275000000004|         -23.515145| 5.051940356784393|          17.0|
|        20:47|3

In [42]:
silver_path_posicao_veiculo = 's3a://trusted/previsao_chegada'

In [43]:
df_final.write.format('delta')\
    .mode('overwrite').option("overwriteSchema", "true").save(silver_path_posicao_veiculo)