In [1]:
!pip install geopy
!sudo conda install -c conda-forge -y pyarrow

Collecting package metadata: done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/linux-64::matplotlib==3.0.3=py37_1
done


  current version: 4.6.14
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.



In [2]:
import pandas as pd 
import numpy
import matplotlib.pyplot as plt 
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import lit, udf, col, unix_timestamp, pandas_udf, PandasUDFType
from pyspark.sql import functions as F
from geopy.distance import geodesic
from struct import *

In [3]:
# Spark entry point for the whole notebook, registered under the
# application name "TimeInStop".
spark = (
    SparkSession.builder
    .appName("TimeInStop")
    .getOrCreate()
)

In [4]:
# Load the three input CSV files. Each file has a header row, and column
# types are inferred by Spark rather than declared explicitly.
points_df = spark.read.csv(
    path='../Datasets/time_in_stop/points.csv',
    header=True,
    inferSchema=True,
)
stops_df = spark.read.csv(
    path='../Datasets/time_in_stop/stops.csv',
    header=True,
    inferSchema=True,
)
vehicle_stops_df = spark.read.csv(
    path='../Datasets/time_in_stop/vehicle_stops.csv',
    header=True,
    inferSchema=True,
)

In [31]:
# Pair every vehicle/route assignment with the stops of that route
# (equi-join on the shared `route_id` column), then preview the result.
vehicle_and_stops = vehicle_stops_df.join(
    stops_df,
    on='route_id',
)
vs = vehicle_and_stops.alias('vs')
vs.show()

DataFrame[route_id: int, date: timestamp, vehicle_id: int, stop_id: int, stop_lat: double, stop_lon: double, name: string]

In [35]:
# Combine each vehicle's stop rows with that vehicle's recorded points
# (equi-join on `vehicle_id`), then preview the rows in timestamp order.
points_and_stops = vs.join(
    points_df,
    on='vehicle_id',
)
ps = points_and_stops.alias('ps')
ps.orderBy(ps['timestamp']).show()

DataFrame[vehicle_id: int, route_id: int, date: timestamp, stop_id: int, stop_lat: double, stop_lon: double, name: string, timestamp: timestamp, lat: double, lon: double]

In [58]:
@pandas_udf('float', PandasUDFType.SCALAR)
def distance(lat1, lon1, lat2, lon2):
    """Scalar pandas UDF that applies ``geodesicdistance`` to four columns.

    Each argument arrives as a pandas Series holding one batch of column
    values; the result is a Series of the same length.

    Note: ``geodesicdistance`` is defined in a later cell of this notebook,
    so that cell has to be executed before this UDF is first evaluated.

    NOTE(review): the declared return type is 'float', which is single
    precision in Spark SQL; 'double' would keep the full precision of the
    computation -- confirm before changing, since it alters the schema.
    """
    separation = geodesicdistance(lat1, lon1, lat2, lon2)
    return separation

@pandas_udf('int', PandasUDFType.GROUPED_AGG)
def time_spread(timestamp):
    """Grouped-aggregate pandas UDF: largest minus smallest value of a group.

    ``timestamp`` is the pandas Series of the aggregated column for one
    group. The declared 'int' result suggests numeric input (for example
    epoch seconds) rather than datetime values -- TODO confirm with the
    caller, which is not visible in this part of the notebook.
    """
    latest = timestamp.max()
    earliest = timestamp.min()
    return latest - earliest

In [57]:
from math import sin, cos, atan2, sqrt, pi

def toRadians(series):
    """Convert a pandas Series of angles from degrees to radians."""
    scaled = series.mul(pi)
    return scaled.div(180.0)

def cosS(series):
    """Element-wise cosine of ``series`` (angles in radians).

    Uses numpy's vectorised ufunc rather than ``Series.apply(math.cos)``,
    which calls a Python function once per element. A pandas Series input
    yields a Series with the same index; plain numpy arrays are accepted
    as well.
    """
    return numpy.cos(series)

def sinS(series):
    """Element-wise sine of ``series`` (angles in radians).

    Uses numpy's vectorised ufunc rather than ``Series.apply(math.sin)``,
    which calls a Python function once per element. A pandas Series input
    yields a Series with the same index; plain numpy arrays are accepted
    as well.
    """
    return numpy.sin(series)

def absS(series):
    """Return the element-wise absolute value of a pandas Series."""
    magnitudes = series.abs()
    return magnitudes

def sqrtS(series):
    """Element-wise square root of ``series``.

    Uses numpy's vectorised ufunc rather than ``Series.apply(math.sqrt)``,
    which calls a Python function once per element. A pandas Series input
    yields a Series with the same index; plain numpy arrays are accepted
    as well.

    Negative inputs produce NaN (numpy semantics) instead of raising
    ``ValueError`` as ``math.sqrt`` does.
    """
    return numpy.sqrt(series)

def atan2S(series1, series2):
    """Element-wise two-argument arctangent, ``atan2(series1, series2)``.

    ``series1`` holds the y (numerator) values and ``series2`` the x
    (denominator) values. Results are angles in radians in [-pi, pi].

    A true atan2 is used instead of ``atan(series1 / series2)``: the
    quotient form loses the quadrant when ``series2`` is negative and
    divides by zero when ``series2`` is 0.
    """
    return numpy.arctan2(series1, series2)

def geodesicdistance(point1Lat, point1Lng, point2Lat, point2Lng):
    """Distance between two sets of points on the Earth's surface.

    Implements the formula the original author identifies as the Vincenty
    inverse-problem formula. All four arguments are pandas Series of
    coordinates in degrees (they are converted with ``toRadians``), and the
    result is a Series computed element-wise.

    ``earthRadius`` is read from the notebook's global namespace; it is not
    defined in this cell. The trailing ``* 1000`` suggests a radius in
    kilometres and a result in metres -- TODO confirm.
    """
    lat1 = toRadians(point1Lat)
    lng1 = toRadians(point1Lng)
    lat2 = toRadians(point2Lat)
    lng2 = toRadians(point2Lng)

    # Absolute longitude difference, shared by all three terms below.
    dLng = absS(lng2 - lng1)
    sinLat1, cosLat1 = sinS(lat1), cosS(lat1)
    sinLat2, cosLat2 = sinS(lat2), cosS(lat2)
    cosDLng = cosS(dLng)

    # a and b form the numerator (under the square root); c is the denominator.
    a = cosLat2*sinS(dLng)
    b = cosLat1*sinLat2-sinLat1*cosLat2*cosDLng
    c = sinLat1*sinLat2+cosLat1*cosLat2*cosDLng

    centralAngle = atan2S(sqrtS(a*a+b*b),c)
    return earthRadius*centralAngle*1000

In [59]:
# For every (stop, recorded point) row: compute the stop-to-point distance,
# sort by it, keep only the identifying columns, and show the point's
# timestamp as Unix epoch seconds.
(
    ps.drop('date')
    .withColumn("distance", distance('stop_lat', 'stop_lon','lat', 'lon'))
    .orderBy("distance")
    .drop('stop_lat','stop_lon','route_id','lat','lon')
    .withColumn('timestamp', unix_timestamp(ps["timestamp"]))
    .show()
)

+----------+-------+-----------------+----------+------------+
|vehicle_id|stop_id|             name| timestamp|    distance|
+----------+-------+-----------------+----------+------------+
|         1|      3|       La Pradera|1566124140|3.5494424E-5|
|         1|      2|Caamaño y Ruta 25|1566123540|3.6474565E-5|
|         1|      1|   Pilar del Este|1566122580|3.8222995E-5|
|         1|      1|   Pilar del Este|1566122700| 4.866268E-5|
|         1|      3|       La Pradera|1566124260|5.0537354E-5|
|         1|      2|Caamaño y Ruta 25|1566123420| 8.532597E-5|
|         1|      1|   Pilar del Este|1566122460|1.3259654E-4|
|         1|      1|   Pilar del Este|1566122820|1.4766537E-4|
|         1|      3|       La Pradera|1566124380|1.6049383E-4|
|         1|      3|       La Pradera|1566124020|  1.92617E-4|
|         1|      2|Caamaño y Ruta 25|1566123660|2.3270582E-4|
|         1|      2|Caamaño y Ruta 25|1566123300|2.3968019E-4|
|         1|      3|       La Pradera|1566124500|3.0703