## Tiempo en parada

In [None]:
!sudo conda install -c conda-forge -y pyarrow

In [2]:
import pandas as pd 
import numpy
import matplotlib.pyplot as plt 
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import lit, udf, col, unix_timestamp, pandas_udf, PandasUDFType
from pyspark.sql import functions as F
from struct import *

In [3]:
# Obtain the notebook's Spark session (an existing one is reused if present),
# registered under the application name "TimeInStop".
spark = (
    SparkSession.builder
    .appName("TimeInStop")
    .getOrCreate()
)

In [4]:
# Load the three input CSV files. Every file is read with the same options:
# the first row is a header and column types are inferred by Spark.
_csv_options = {'inferSchema': True, 'header': True}
points_df = spark.read.csv('../Datasets/time_in_stop/points.csv', **_csv_options)
stops_df = spark.read.csv('../Datasets/time_in_stop/stops.csv', **_csv_options)
vehicle_stops_df = spark.read.csv('../Datasets/time_in_stop/vehicle_route.csv', **_csv_options)

In [4]:
# Join the vehicle/route rows with the stop rows that share the same route_id,
# keep an aliased handle ('vs') for later joins, and preview the result.
vehicle_and_stops = vehicle_stops_df.join(stops_df, on='route_id')
vs = vehicle_and_stops.alias('vs')
vs.show()

+--------+-------------------+----------+-------+----------+----------+-----------------+
|route_id|               date|vehicle_id|stop_id|  stop_lat|  stop_lon|             name|
+--------+-------------------+----------+-------+----------+----------+-----------------+
|       1|2019-08-18 00:00:00|         1|      1|-34.398942|-58.861926|   Pilar del Este|
|       1|2019-08-18 00:00:00|         1|      2|-34.406633|-58.857023|Caamaño y Ruta 25|
|       1|2019-08-18 00:00:00|         1|      3|-34.413502|-58.847281|       La Pradera|
+--------+-------------------+----------+-------+----------+----------+-----------------+



In [5]:
# Pair every (vehicle, stop) row with every recorded point that has the same
# vehicle_id, alias the result as 'ps', and preview it ordered by timestamp.
points_and_stops = vs.join(points_df, on='vehicle_id')
ps = points_and_stops.alias('ps')
ps.orderBy(ps['timestamp']).show()

+----------+--------+-------------------+-------+----------+----------+-----------------+-------------------+----------+----------+
|vehicle_id|route_id|               date|stop_id|  stop_lat|  stop_lon|             name|          timestamp|       lat|       lon|
+----------+--------+-------------------+-------+----------+----------+-----------------+-------------------+----------+----------+
|         1|       1|2019-08-18 00:00:00|      3|-34.413502|-58.847281|       La Pradera|2019-08-18 10:01:00|-34.397771|-58.862199|
|         1|       1|2019-08-18 00:00:00|      1|-34.398942|-58.861926|   Pilar del Este|2019-08-18 10:01:00|-34.397771|-58.862199|
|         1|       1|2019-08-18 00:00:00|      2|-34.406633|-58.857023|Caamaño y Ruta 25|2019-08-18 10:01:00|-34.397771|-58.862199|
|         1|       1|2019-08-18 00:00:00|      1|-34.398942|-58.861926|   Pilar del Este|2019-08-18 10:03:00|-34.398603|-58.861995|
|         1|       1|2019-08-18 00:00:00|      3|-34.413502|-58.847281|     

In [14]:
@pandas_udf('float', PandasUDFType.SCALAR)
def distance(lat1, lon1, lat2, lon2):
    """Scalar pandas UDF exposing ``geodesicdistance`` to Spark.

    Each argument arrives as a pandas Series (one batch of column values);
    the four Series are forwarded unchanged to ``geodesicdistance`` and its
    result is returned as a Spark ``float`` column.
    """
    separation = geodesicdistance(lat1, lon1, lat2, lon2)
    return separation

@pandas_udf('int', PandasUDFType.GROUPED_AGG)
def time_spread(timestamp):
    """Grouped-aggregate pandas UDF: range of the values in one group.

    ``timestamp`` is the pandas Series holding the group's values; the result
    is its maximum minus its minimum, declared to Spark as an ``int``.
    """
    earliest = timestamp.min()
    latest = timestamp.max()
    return latest - earliest

In [13]:
from math import sin, cos, atan, sqrt, pi

# Earth radius used by geodesicdistance as the sphere radius.
# NOTE(review): 6.371e3 is presumably the mean radius in kilometres (the
# distance formula multiplies the result by 1000) — confirm units.
earthRadius = 6.371e3

def toRadians(series):
    """Convert a pandas Series of angles from degrees to radians.

    Each element is multiplied by pi and then divided by 180.0.
    """
    scaled = series * pi
    return scaled / 180.0

def cosS(series):
    """Element-wise cosine of a pandas Series of angles in radians.

    Uses the NumPy ufunc instead of calling ``math.cos`` once per element via
    ``Series.apply``: the computation stays vectorised (which is the point of
    running inside a pandas UDF) and the result is still a Series carrying
    the same index as the input.
    """
    return numpy.cos(series)

def sinS(series):
    """Element-wise sine of a pandas Series of angles in radians.

    Uses the NumPy ufunc instead of calling ``math.sin`` once per element via
    ``Series.apply``: the computation stays vectorised (which is the point of
    running inside a pandas UDF) and the result is still a Series carrying
    the same index as the input.
    """
    return numpy.sin(series)

def absS(series):
    """Return the element-wise absolute value of a pandas Series."""
    magnitude = abs(series)
    return magnitude

def sqrtS(series):
    """Element-wise square root of a pandas Series.

    Uses the NumPy ufunc instead of calling ``math.sqrt`` once per element
    via ``Series.apply``, keeping the computation vectorised and returning a
    Series with the same index as the input. As a consequence a negative
    element produces NaN instead of raising ``ValueError``.
    """
    return numpy.sqrt(series)

def atan2S(series1, series2):
    """Element-wise two-argument arctangent, ``atan2(series1, series2)``.

    ``series1`` holds the "y" (numerator) values and ``series2`` the "x"
    (denominator) values. The result is a pandas Series of angles in radians.

    The quadrant is taken from the signs of both arguments, so a negative
    ``series2`` yields an angle beyond pi/2 rather than the negative angle
    that ``atan(series1 / series2)`` would give, and a zero ``series2`` is
    handled without performing a division.
    """
    return numpy.arctan2(series1, series2)

def geodesicdistance(point1Lat, point1Lng, point2Lat, point2Lng):
    """Distance between two sets of points on a sphere of radius ``earthRadius``.

    All four arguments are pandas Series of coordinates in degrees. The
    central angle is obtained with the arctangent form of the great-circle
    formula (the spherical special case of Vincenty's formula), delegating
    the final arctangent to ``atan2S``. The angle is scaled by
    ``earthRadius`` and then by 1000.
    NOTE(review): the result is presumably in metres, assuming earthRadius
    is expressed in kilometres — confirm.
    """
    # Work in radians throughout.
    phi1 = toRadians(point1Lat)
    lam1 = toRadians(point1Lng)
    phi2 = toRadians(point2Lat)
    lam2 = toRadians(point2Lng)

    # Longitude difference and the trigonometric terms shared by b and c.
    delta_lam = lam2 - lam1
    cos_delta = cosS(abs(delta_lam))
    sin_phi1, cos_phi1 = sinS(phi1), cosS(phi1)
    sin_phi2, cos_phi2 = sinS(phi2), cosS(phi2)

    a = cos_phi2 * sinS(absS(delta_lam))
    b = cos_phi1 * sin_phi2 - sin_phi1 * cos_phi2 * cos_delta
    c = sin_phi1 * sin_phi2 + cos_phi1 * cos_phi2 * cos_delta

    central_angle = atan2S(sqrtS(a * a + b * b), c)
    return earthRadius * central_angle * 1000

In [18]:
# Largest value of the computed `distance` column for which a recorded point
# still counts as being at the stop.
# NOTE(review): presumably metres, matching geodesicdistance's output — confirm.
STOP_RADIUS = 50

# Time spent at each stop, per vehicle:
#   1. compute the distance between every recorded point and every stop,
#   2. keep only the points closer than STOP_RADIUS,
#   3. convert the timestamp to Unix seconds,
#   4. per (vehicle_id, stop_id), take max - min of those seconds as 'tiempo'.
# No orderBy is applied before the aggregation: the groupby does not depend on
# row order, so a global sort there would only add a discarded shuffle.
report = (
    ps.drop('date')
    .withColumn("distance", distance('stop_lat', 'stop_lon', 'lat', 'lon'))
    .filter(col("distance") < STOP_RADIUS)
    .drop('stop_lat', 'stop_lon', 'route_id', 'lat', 'lon')
    .withColumn('timestamp', unix_timestamp(ps["timestamp"]))
    .groupby("vehicle_id", "stop_id")
    .agg(time_spread(col("timestamp")).alias('tiempo'))
)

# show() returns None, so it is called separately to keep `report` bound to
# the resulting DataFrame.
report.show()

+----------+-------+------+
|vehicle_id|stop_id|tiempo|
+----------+-------+------+
|         1|      2|     0|
|         1|      1|   120|
|         1|      3|     0|
+----------+-------+------+

