In [1]:
import numpy as np
import pandas as pd
import json
import folium
import os
from datetime import datetime
from datetime import timedelta
import scipy.spatial
import geopy
from geopy.distance import distance
import pytz
from math import radians, cos, sin, asin, sqrt, pi
import random
import rdp
import srtm
import gpxpy
import gpxpy
import gpxpy.gpx
import mplleaflet

import matplotlib.pyplot as plt
plt.ioff()

# Load GPS Data

In [226]:
# Load user gps data: the trace is read from the 'userLocationTrace' key of the JSON export
with open('./TPF/tao&marced/response_tao_260319_fribourg.json') as f:
    data = json.load(f)

user_gps_df = pd.DataFrame.from_dict(data['userLocationTrace'])

# Timezone used to localise every timestamp in this notebook.
# Europe/Zurich is UTC+1 in winter (CET) and UTC+2 in summer (CEST).
timezone = pytz.timezone("Europe/Zurich")

In [227]:
def convert_timestamp(ts):
    """Parse an ISO-like timestamp string into a datetime localised to `timezone`.

    Strings longer than 20 characters are parsed with fractional seconds
    ("%Y-%m-%dT%H:%M:%S.%fZ"), shorter ones without ("%Y-%m-%dT%H:%M:%SZ").

    NOTE(review): the trailing "Z" normally denotes UTC, yet the parsed value is
    interpreted as local time in the module-level `timezone` -- confirm that the
    recording app really emits local wall-clock time.
    """
    has_fraction = len(ts) > 20
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ" if has_fraction else "%Y-%m-%dT%H:%M:%SZ"
    naive = datetime.strptime(ts, fmt)
    return timezone.localize(naive)

In [228]:
# Cast the numerical attributes to float
user_gps_df = user_gps_df.astype({
    column: float
    for column in ('altitude', 'latitude', 'longitude', 'speed',
                   'accuracy', 'altitudeAccuracy', 'heading')
})

# Parse the timestamp strings, then extract the UNIX timestamp rounded to whole seconds
user_gps_df['timestamp'] = user_gps_df['timestamp'].apply(convert_timestamp)
user_gps_df['timestamp_unix'] = user_gps_df['timestamp'].apply(lambda ts: round(ts.timestamp()))

# Put the trace in chronological order with a fresh 0..n-1 index
user_gps_df = user_gps_df.sort_values('timestamp_unix').reset_index(drop=True)

In [230]:
# Preview the first rows of the cleaned, chronologically sorted trace
user_gps_df.head()

Unnamed: 0,isMoving,uuid,timestamp,latitude,longitude,accuracy,speed,heading,altitude,altitudeAccuracy,user,timestamp_unix
0,True,B0765517-FBCD-487D-994E-DB687AD69BDC,2019-03-26 08:34:45.282000+01:00,46.802862,7.150586,14.7,1.93,41.13,629.2,4.0,3,1553585685
1,True,663F2FDB-448D-4462-9705-56BF1B215DB5,2019-03-26 08:34:50.033000+01:00,46.802952,7.150634,36.1,1.44,15.12,627.0,4.0,3,1553585690
2,True,E28C019C-97E8-4253-9588-BB2EFFF4C8B6,2019-03-26 08:34:56.028000+01:00,46.803023,7.151205,50.0,1.45,76.29,626.9,6.0,3,1553585696
3,True,CF388A95-8F50-48B8-97CA-91A285028F98,2019-03-26 08:35:00.028000+01:00,46.803013,7.151326,30.0,2.37,86.13,625.1,4.0,3,1553585700
4,True,C9F5C453-F44B-4D4B-A9F6-C7E901EA4363,2019-03-26 08:35:04.029000+01:00,46.803047,7.151455,50.0,1.56,54.84,624.6,4.0,3,1553585704


Due to the potential for inaccurate altitude readings from GPS sensors, the altitude readings were replaced with data from the Shuttle Radar Topography Mission ([SRTM](https://en.wikipedia.org/wiki/Shuttle_Radar_Topography_Mission)). Python's gpxpy library makes it easy to include the aforementioned data: the trace is converted to GPX, the SRTM elevations are added to it, and the result is written back to the original user GPS data.

In [231]:
# Build a GPX document holding a single track with a single segment
gpx = gpxpy.gpx.GPX()
gpx_track = gpxpy.gpx.GPXTrack()
gpx_segment = gpxpy.gpx.GPXTrackSegment()
gpx.tracks.append(gpx_track)
gpx_track.segments.append(gpx_segment)

# One track point per GPS reading, appended in the row order of user_gps_df
gpx_segment.points.extend(
    gpxpy.gpx.GPXTrackPoint(longitude=lon, latitude=lat)
    for lat, lon in zip(user_gps_df['latitude'], user_gps_df['longitude'])
)

In [232]:
# Add elevation data to GPX object
elevation_data = srtm.get_data()
elevation_data.add_elevations(gpx, smooth=True)

# Convert obtained elevation data back to a Dataframe (kept for inspection)
trip = gpx.tracks[0].segments[0]
new_altitude = pd.DataFrame([{'latitude': p.latitude, 'longitude': p.longitude, 'alt': p.elevation} for p in trip.points])
new_altitude = new_altitude.drop_duplicates()

# The GPX points were appended in the row order of user_gps_df, so the elevations
# are written back by position. Joining on (latitude, longitude) instead would
# multiply rows whenever the same coordinates occur more than once with different
# smoothed elevations.
if len(trip.points) != len(user_gps_df):
    raise ValueError('GPX segment has {} points but the GPS trace has {} rows'
                     .format(len(trip.points), len(user_gps_df)))

# Replace the sensor altitude and remove the no longer needed accuracy column
user_gps_df = (user_gps_df
               .assign(altitude=[p.elevation for p in trip.points])
               .drop(columns=['altitudeAccuracy']))

In [233]:
# Remove inaccurate positional GPS readings: keep only rows whose `accuracy` value is below 30
# NOTE(review): presumably an accuracy radius in metres -- confirm against the recording app
user_gps_df = user_gps_df[user_gps_df.accuracy<30].copy()
# Sequential id, used below as the join key when attaching the closest stop to each point
user_gps_df['id'] = range(user_gps_df.shape[0])

# Public Transport Network

Load TPF stops data

In [234]:
# Load provided TPF data (planned stops table) and preview it.
# tpf_stops is only inspected here; the stop lookup further down uses the SBB GTFS stops.
tpf_stops = pd.read_csv('./TPF/Planned/stops_extend.txt')
tpf_stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stp_correspondances,mp3_filename,stop_area,stop_shortname,thoreb_no
0,850488110,,"Fribourg/Freiburg, gare routière",,46.803239,7.150122,,,0,place_FRIgar,,2021.mp3,8504881,FRIgar,15000.0
1,858916300,,"Fribourg, Stade-Patinoire",,46.817895,7.154564,,,0,,,1084.mp3,8589163,Sta,14999.0
2,850023801,,"Bulle, Verdel",,46.624838,7.065969,,,0,,,4152.mp3,8500238,BULrtv,17903.0
3,850343501,,"Riaz, CO",,46.645104,7.065568,,,0,,,4159.mp3,8503435,RIA-CO,17935.0
4,850371801,,"Avry-sur-Matran, CO",,46.786202,7.077425,,,0,,,4083.mp3,8503718,AVSmco,16590.0


After investigation, we decided to use the SBB GTFS data instead, as there were missing data points in the TPF dataset.

In [235]:
def read_sbb(filename, path):
    """Read one SBB GTFS text file into a DataFrame whose cells are all strings.

    Parameters
    ----------
    filename : str
        Name of the GTFS table file, e.g. 'stops.txt'.
    path : str
        Directory prefix; it is concatenated with `filename` as-is, so it must
        end with a path separator.

    Returns
    -------
    pandas.DataFrame
        One column per header field. Every value is kept as text and empty
        fields stay empty strings (no NaN conversion).

    The file is parsed as regular CSV, so quoted fields may contain commas or
    semicolons, and a leading UTF-8 byte-order mark is stripped when present.
    """
    return pd.read_csv(
        path + filename,
        dtype=str,               # keep ids such as '8504938' as text
        keep_default_na=False,   # do not turn '' / 'NA' into NaN
        na_filter=False,
        encoding='utf-8-sig',    # strips the BOM if there is one
    )

In [11]:
# Directory holding the SBB GTFS text files; the trailing slash is required because
# read_sbb builds the file name by plain string concatenation
path = 'gtfsfp20192019-03-20/'

In [12]:
# Load SBB GTFS data: the four tables that are joined below
# (stops, stop times, routes and trips); read_sbb returns every column as strings
stops_df = read_sbb('stops.txt', path)
stop_times_df = read_sbb('stop_times.txt', path)
routes_df = read_sbb('routes.txt', path)
trips_df = read_sbb('trips.txt', path)

As the user GPS data lies solely in the Fribourg region of Switzerland, the GTFS data is filtered to the Transports publics fribourgeois (TPF). In a broader application, a location-based filter could be applied instead, keeping only the public transport lines that serve stops within a certain distance of the user GPS points.

In [13]:
# Filter to TPF: keep the routes whose agency_id is '834' and only the route columns needed later
routes_df = routes_df.astype({'agency_id': str})
routes_df = routes_df[routes_df.agency_id=='834'][['route_id', 'route_desc', 'route_short_name']]

# Combine columns from GTFS tables loaded above:
# routes -> trips (route_id), -> stop times (trip_id), -> stops (stop_id).
# The default inner joins drop trips and stop times of every other agency.
fribourg_sbb = pd.merge(trips_df, routes_df, on='route_id')
fribourg_sbb = pd.merge(stop_times_df, fribourg_sbb, on='trip_id')
fribourg_sbb = pd.merge(fribourg_sbb, stops_df, on='stop_id')
fribourg_sbb = fribourg_sbb.drop(columns=['pickup_type', 'drop_off_type','location_type', 'parent_station'])
# stop_sequence is compared numerically later on; every other column stays a string
fribourg_sbb = fribourg_sbb.astype({'stop_sequence': int})
fribourg_sbb.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,route_id,service_id,trip_headsign,trip_short_name,direction_id,route_desc,route_short_name,stop_name,stop_lat,stop_lon
0,1.TA.6-260-A-j19-1.1.H,06:59:00,06:59:00,8504938,1,6-260-A-j19-1,TA+b002a,"Boltigen, Bahnhof",26003,0,Bus,260,"Jaun, Kappelboden",46.6102353307346,7.28713358579193
1,2.TA.6-260-A-j19-1.1.H,07:31:00,07:31:00,8504938,1,6-260-A-j19-1,TA+b08x0,"Boltigen, Bahnhof",26005,0,Bus,260,"Jaun, Kappelboden",46.6102353307346,7.28713358579193
2,4.TA.6-260-A-j19-1.1.H,08:31:00,08:31:00,8504938,1,6-260-A-j19-1,TA+b06uz,"Boltigen, Bahnhof",26009,0,Bus,260,"Jaun, Kappelboden",46.6102353307346,7.28713358579193
3,6.TA.6-260-A-j19-1.1.H,09:31:00,09:31:00,8504938,1,6-260-A-j19-1,TA+b0cnb,"Boltigen, Bahnhof",26013,0,Bus,260,"Jaun, Kappelboden",46.6102353307346,7.28713358579193
4,7.TA.6-260-A-j19-1.1.H,11:08:00,11:08:00,8504938,1,6-260-A-j19-1,TA+b0023,"Boltigen, Bahnhof",26017,0,Bus,260,"Jaun, Kappelboden",46.6102353307346,7.28713358579193


In [14]:
# Retain columns of interest: one row per distinct (stop_id, name, coordinates) combination.
# stop_lat / stop_lon are still strings at this point.
fribourg_stops = fribourg_sbb[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']].drop_duplicates().reset_index(drop=True)

In [15]:
def get_closest_stop(lat, lon, id_, stops=fribourg_stops):
    """Return the stop of `stops` that lies closest to the point (lat, lon).

    Parameters
    ----------
    lat, lon : float
        Coordinates of the query point in decimal degrees.
    id_ : any
        Identifier of the query point; passed through unchanged as the first
        element of the result so the caller can join on it.
    stops : pandas.DataFrame
        Table with 'stop_name', 'stop_lat' and 'stop_lon' columns (coordinates
        may be strings; they are converted to float). Not modified.

    Returns
    -------
    list
        [id_, stop_name, stop_lat, stop_lon] of the closest stop, with the
        coordinates as floats.

    Raises
    ------
    ValueError
        If `stops` is empty or contains no usable coordinates.
    """
    stop_lats = stops['stop_lat'].astype(float).values
    stop_lons = stops['stop_lon'].astype(float).values

    # Equirectangular approximation: a degree of longitude is shorter than a
    # degree of latitude by cos(latitude), so it is scaled before comparing.
    # Only the ranking matters, hence the squared distance is sufficient.
    d_lat = stop_lats - lat
    d_lon = (stop_lons - lon) * np.cos(np.radians(lat))
    squared_dist = d_lat ** 2 + d_lon ** 2

    # nanargmin ignores stops with missing coordinates
    closest = int(np.nanargmin(squared_dist))

    return [id_, stops['stop_name'].iloc[closest], stop_lats[closest], stop_lons[closest]]

In [16]:
def calc_dist(lat_s, long_s, lat_f, long_f, alt_s=None, alt_f=None):
    """Distance in metres between a start point (_s) and a finish point (_f).

    The horizontal distance comes from geopy's `distance`. When both altitudes
    are given, the altitude difference is combined with it (Pythagoras) to give
    a 3D distance; if either altitude is None, the 2D distance is returned.
    """
    horizontal = distance((lat_s, long_s), (lat_f, long_f)).m

    # Guard clause: without both altitudes only the 2D distance is available
    if alt_s is None or alt_f is None:
        return horizontal

    vertical = alt_s - alt_f
    return (horizontal**2 + vertical**2)**0.5

In [236]:
# Obtain the closest public transport stop to each user gps point.
# get_closest_stop returns [id, name, lat, lon]; the rows are collected into a frame
# and joined back onto the trace through the sequential `id` column.
user_gps_to_stops = pd.DataFrame(list(user_gps_df.apply(lambda x: get_closest_stop(x.latitude, x.longitude, x.id), 
                                                        axis=1)), 
                                 columns=['id', 'closest_stop_name', 'closest_stop_lat', 'closest_stop_lon'])

user_gps_df = pd.merge(user_gps_df, user_gps_to_stops, on='id')

In [237]:
# Calculate the distance of the closest public transport stop for each user gps point
# (2D distance in metres: no altitudes are passed to calc_dist)
user_gps_df['closest_stop_dist'] = user_gps_df.apply(lambda x: calc_dist(x.latitude, x.longitude, x.closest_stop_lat, x.closest_stop_lon), axis=1)

## GPS to Edges

In [238]:
# Remove unnecessary columns in user data, drop exact duplicate rows,
# and restore chronological order with a fresh 0..n-1 index
user_gps_df = (user_gps_df
               .drop(columns=['uuid', 'id', 'user'])
               .drop_duplicates()
               .sort_values('timestamp_unix')
               .reset_index(drop=True))

In [239]:
def points_to_edges(df):
    """Turn an ordered table of GPS points into a table of consecutive edges.

    Every row of the result joins point i (columns suffixed `_s`) with point
    i + 1 (columns suffixed `_f`), so n points give n - 1 edges. The input
    frame is not modified.

    Added columns
    -------------
    duration      : timestamp_unix_f - timestamp_unix_s (seconds)
    distance      : calc_dist between both ends, including the altitude difference
    meters_per_s  : distance / duration; a zero duration (two fixes rounded to
                    the same second) yields inf or NaN rather than an error
    index_s, index_f : positions of the start / end point in the re-indexed input

    Requires the columns latitude, longitude, altitude and timestamp_unix.
    """
    df = df.reset_index(drop=True)

    # Pair each point with its successor by position instead of self-merging on
    # a helper key: same rows and column order, but no dependence on how merge
    # names overlapping key columns.
    starts = df.iloc[:-1].add_suffix('_s').reset_index(drop=True)
    ends = df.iloc[1:].add_suffix('_f').reset_index(drop=True)
    edge_df = pd.concat([starts, ends], axis=1)

    edge_df['duration'] = edge_df.timestamp_unix_f - edge_df.timestamp_unix_s

    # A plain loop keeps working when there are no edges at all
    edge_df['distance'] = pd.Series(
        [calc_dist(lat_s, lon_s, lat_f, lon_f, alt_s, alt_f)
         for lat_s, lon_s, lat_f, lon_f, alt_s, alt_f in zip(edge_df.latitude_s, edge_df.longitude_s,
                                                             edge_df.latitude_f, edge_df.longitude_f,
                                                             edge_df.altitude_s, edge_df.altitude_f)],
        index=edge_df.index, dtype=float)

    edge_df['meters_per_s'] = edge_df['distance'] / edge_df['duration']

    edge_df['index_s'] = range(0, edge_df.shape[0])
    edge_df['index_f'] = edge_df.index_s + 1

    return edge_df

In [240]:
# Convert GPS data from points to edges to allow for speed calculation
# (one edge per pair of consecutive points, so len(edge_df) == len(user_gps_df) - 1)
edge_df = points_to_edges(user_gps_df)

## Split walk/non-walk segments

In [241]:
def generate_segment_ids(df, distance_thershold=100, duration_threshold=60,
                         walk_speed=2.79, idle_speed=0.5, idle_duration=70):
    """Label every edge as walk / non-walk and group the edges into segments.

    Steps
    -----
    1. Preliminary label: an edge is a walk edge when its computed speed and
       the sensor speed at both ends are below `walk_speed`, or when it is
       nearly stationary (`meters_per_s < idle_speed`) for at least
       `idle_duration` seconds.
    2. `get_segments` keeps the runs that meet the thresholds ("certain"
       segments); every edge outside those runs is "uncertain".
    3. Each uncertain edge takes the label of the closest preceding certain
       edge; uncertain edges before the first certain one take the label of
       that first certain edge.
    4. `get_segments` is run again on the completed labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with meters_per_s, speed_s, speed_f, duration and distance
        columns. An `is_walk` column is written to it IN PLACE.
    distance_thershold, duration_threshold : number
        Forwarded to `get_segments`.
    walk_speed, idle_speed : float
        Speed limits in m/s (2.79 m/s is roughly 10 km/h).
    idle_duration : number
        Minimum duration in seconds of a near-stationary edge.

    Returns
    -------
    (df, walk_segments, non_walk_segments)

    Raises
    ------
    ValueError
        If no run of edges meets the thresholds, so nothing can be labelled.
    """
    # define walk edges as ones where below criteria is met
    slow_edge = (df.meters_per_s < walk_speed) & (df.speed_s < walk_speed) & (df.speed_f < walk_speed)
    idle_edge = (df.meters_per_s < idle_speed) & (df.duration >= idle_duration)
    df['is_walk'] = slow_edge | idle_edge

    # obtain certain walk and non walk segments
    walk_segments, non_walk_segments = get_segments(df, distance_thershold, duration_threshold)

    certain_walk = {edge for segment in walk_segments for edge in segment}
    certain_non_walk = {edge for segment in non_walk_segments for edge in segment}
    if not certain_walk and not certain_non_walk:
        raise ValueError('No walk or non-walk segment meets the distance/duration thresholds')

    # for uncertain edges, merge them into preceding segment; the carried label
    # is refreshed at EVERY certain edge so that a gap never inherits a stale one
    labels = []
    current = None
    for edge in df.index:
        if edge in certain_walk:
            current = True
        elif edge in certain_non_walk:
            current = False
        labels.append(current)

    # uncertain edges at the very start have no predecessor: use the first certain label
    first_certain = next(label for label in labels if label is not None)
    labels = [first_certain if label is None else label for label in labels]

    df['is_walk'] = pd.Series(labels, index=df.index, dtype=object)

    walk_segments, non_walk_segments = get_segments(df, distance_thershold, duration_threshold)

    return df, walk_segments, non_walk_segments

In [242]:
def get_segments(df, distance_thershold=100, duration_threshold=60):
    """Collect the runs of consecutive walk / non-walk edges that are long enough.

    A run of walk edges is kept when its summed `duration` reaches
    `duration_threshold`; a run of non-walk edges is kept when its summed
    `distance` reaches `distance_thershold`. Shorter runs are dropped.

    Returns
    -------
    (walk_segments, non_walk_segments) : two lists of lists of index labels of `df`.
    """
    walk_segments, non_walk_segments = [], []
    walk_run, non_walk_run = [], []
    walked_time, travelled = 0, 0

    for label, walking, edge_duration, edge_distance in zip(df.index, df['is_walk'],
                                                            df['duration'], df['distance']):
        if walking:
            walked_time += edge_duration
            walk_run.append(label)

            # a walk edge closes the pending non-walk run: keep it if long enough
            if travelled >= distance_thershold:
                non_walk_segments.append(non_walk_run)
            travelled, non_walk_run = 0, []
        else:
            travelled += edge_distance
            non_walk_run.append(label)

            # a non-walk edge closes the pending walk run: keep it if long enough
            if walked_time >= duration_threshold:
                walk_segments.append(walk_run)
            walked_time, walk_run = 0, []

    # the run still open at the end of the trip is subject to the same thresholds
    if travelled >= distance_thershold:
        non_walk_segments.append(non_walk_run)
    if walked_time >= duration_threshold:
        walk_segments.append(walk_run)

    return walk_segments, non_walk_segments

In [243]:
# obtain walk segments and non walk segments (lists of edge index labels);
# the returned edge_df carries the final per-edge `is_walk` label
edge_df, walk_segments, non_walk_segments = generate_segment_ids(edge_df)

In [244]:
def edge_to_point_id(df, walk_segs, non_walk_segs):
    """Translate edge-based segments into lists of GPS point indices.

    Non-walk segments include both end points of each of their edges. Walk
    segments receive the points strictly between two non-walk segments, plus
    any points before the first / after the last non-walk segment.

    Returns
    -------
    (walk_points, non_walk_points)
        Lists of point-index lists. If there is no non-walk segment the result
        is ([all points], None); if there is no walk segment it is
        (None, [all points]).
    """
    every_point = list(range(0, df.shape[0]+1))

    # Trip is entirely walk, or entirely non-walk
    if len(non_walk_segs)==0:
        return [every_point], None
    if len(walk_segs)==0:
        return None, [every_point]

    # Non-walk segments: both extremities of every edge belong to the segment
    non_walk_points = [
        sorted(set(df.loc[edges][['index_s', 'index_f']].values.reshape(-1)))
        for edges in non_walk_segs
    ]

    walk_points = []

    # Leading walk: everything before the first non-walk point (first point included)
    first_non_walk = non_walk_points[0][0]
    if first_non_walk != 0:
        walk_points.append(list(range(0, first_non_walk)))

    # Walk between two non-walk segments: extremities excluded
    for previous, following in zip(non_walk_points[:-1], non_walk_points[1:]):
        walk_points.append(list(range(previous[-1]+1, following[0])))

    # Trailing walk: everything after the last non-walk point (last point included)
    trip_max_id = df.index_f.max()
    last_non_walk = non_walk_points[-1][-1]
    if last_non_walk != trip_max_id:
        walk_points.append(list(range(last_non_walk+1, trip_max_id+1)))

    # Discard empty groups (e.g. two adjacent non-walk segments)
    walk_points = [points for points in walk_points if len(points)>0]
    non_walk_points = [points for points in non_walk_points if len(points)>0]

    return walk_points, non_walk_points

In [245]:
# convert edge ids to point ids; either result is None when the trip contains
# no segment of that kind
walk_points, non_walk_points = edge_to_point_id(edge_df, walk_segments, non_walk_segments)

In [264]:
print('Non-walk segments:\n')
# edge_to_point_id returns None (not an empty list) when the trip has no
# non-walk segment, so fall back to an empty list instead of failing
for i, segment in enumerate(non_walk_points or []):
    departure = user_gps_df.loc[segment[0]]
    arrival = user_gps_df.loc[segment[-1]]
    print('Segment {}'.format(i))
    print('Departure time: {}, Closest stop: {}'.format(departure.timestamp.strftime('%HH%M'),
                                                        departure.closest_stop_name))
    print('Arrival time: {}, Closest stop: {} \n'.format(arrival.timestamp.strftime('%HH%M'),
                                                         arrival.closest_stop_name))

Non-walk segments:

Segment 0
Departure time: 08H48, Closest stop: Fribourg, St-Pierre
Arrival time: 08H53, Closest stop: Fribourg, Pont-Zaehringen 

Segment 1
Departure time: 08H56, Closest stop: Fribourg, Pont-Zaehringen
Arrival time: 09H14, Closest stop: Fribourg, Bertigny 

Segment 2
Departure time: 09H17, Closest stop: Fribourg, Bertigny
Arrival time: 09H25, Closest stop: Fribourg, Tivoli 

Segment 3
Departure time: 09H30, Closest stop: Fribourg, St-Pierre
Arrival time: 09H35, Closest stop: Fribourg, Poya 

Segment 4
Departure time: 09H38, Closest stop: Fribourg, Poya
Arrival time: 09H45, Closest stop: Fribourg, Neuveville 



# Map plot

In [27]:
# Draw the trip on a map: non-walk segments in red, walk segments in green.
# Each segment is first simplified with rdp.rdp (epsilon 1e-4, in coordinate units).
# NOTE(review): walk_points / non_walk_points can be None (see edge_to_point_id);
# these loops would then raise a TypeError.
fig = plt.figure(figsize=(10,8))
for s in non_walk_points:
    display_points = rdp.rdp(user_gps_df.loc[s][['longitude', 'latitude']].values, 1e-4)
    plt.plot(display_points[:,0], display_points[:,1], linewidth=2, color='r')
    plt.plot(display_points[:,0], display_points[:,1], 'ro', markersize=10)
    
for s in walk_points:
    display_points = rdp.rdp(user_gps_df.loc[s][['longitude', 'latitude']].values, 1e-4)
    plt.plot(display_points[:,0], display_points[:,1], linewidth=4, color='g')
    plt.plot(display_points[:,0], display_points[:,1], 'go', markersize=10)
    
# NOTE(review): IFrame is imported here but not used in this cell
from IPython.display import IFrame
# Render the matplotlib figure on an interactive map
mplleaflet.display(fig)



# Transport Public Fribourgeois - OpenStreetMap Data

In [28]:
# Load OSM data for Fribourg
with open('fribourg_routes.geojson') as f:
    data = json.load(f)

In [29]:
# Define public transport modes of interest (taken from OSM documentation)
transport_types = ["train", "subway", "monorail", "tram", "light_rail", "bus", "trolleybus", "railway"]

In [30]:
# Create dictionary for OSM data
fribourg_routes = dict()
i=0
for route in data['features']:
    route_dict = dict()
    
    if route['geometry']['type'] != 'Point' \
    and route['properties']['route'] in transport_types \
    and route['properties'].get('network') != 'Flixbus':
        
        route_dict['name'] = route['properties'].get('name')
        route_dict['description'] = route['properties'].get('description')
        route_dict['network'] = route['properties'].get('network')
        route_dict['operator'] = route['properties'].get('operator')
        route_dict['ref'] = route['properties'].get('ref')
        route_dict['type'] = route['properties'].get('route')
        route_dict['line'] = route['geometry']['coordinates']
        
        fribourg_routes[i] = route_dict
        i += 1

In [31]:
# Sanity check: plot the vertices of one OSM route (id 1) on a map.
# The comprehension flattens one nesting level, so this assumes the route's
# 'line' is a list of line parts.
fig = plt.figure(figsize=(10,8))
line_viz = [x for a in fribourg_routes.get(1).get('line') for x in a]
plt.plot([x[0] for x in line_viz], [x[1] for x in line_viz], 'ro')

# NOTE(review): IFrame is imported here but not used in this cell
from IPython.display import IFrame
mplleaflet.display(fig)

In [32]:
# collect all gps points from OSM into one array with an index for each line
labeled_points = []
for r in fribourg_routes:
    coordinates = fribourg_routes.get(r).get('line')
    if type(coordinates[0][0]) == list:
        labeled_points.append([x+[r] for a in coordinates for x in a])
    else:
        labeled_points.append([x+[r] for x in coordinates])
        
labeled_points = np.array([np.array(x) for a in labeled_points for x in a])

In [33]:
# construct KD tree from array of OSM gps points
fbg_tree = scipy.spatial.KDTree(labeled_points[:, :2])

In [95]:
def get_closest_pt_lines(tree, segments, user_gps, labeled_points, lat=True, lon=True, dist_threshold=70):
    """List, per GPS point of each segment, the public transport lines passing nearby.

    Parameters
    ----------
    tree : KD tree built on the coordinate columns of `labeled_points`.
    segments : list of lists of index labels of `user_gps`.
    user_gps : DataFrame with 'latitude' and 'longitude' columns.
    labeled_points : array whose rows are (lon, lat, ..., line id).
    lat, lon : bool
        Which coordinate(s) to use for the distance; at least one must be True.
    dist_threshold : number
        Search radius in metres (converted to degrees for the tree query).

    Returns
    -------
    DataFrame with columns path_id, gps_id, transport_id, dist: one row per
    (segment, GPS point, line) holding the smallest distance to that line.
    Returns None as soon as one segment has no line point within the radius.

    Raises
    ------
    ValueError
        If a distance must be computed while both `lat` and `lon` are False.
    """
    # Each degree of the radius line of the earth corresponds to 111,139 meters
    radius = dist_threshold / 111139

    matches = []
    for path_id, path in enumerate(segments):

        # public transport points within the radius of each gps point of the segment
        coordinates = user_gps.loc[path, ['longitude', 'latitude']].values
        neighbours = tree.query_ball_point(coordinates, radius)

        if sum(len(hits) for hits in neighbours) == 0:
            return None

        for gps_id, hits in zip(path, neighbours):
            point = user_gps.loc[gps_id]
            user_lat, user_lon = point.latitude, point.longitude

            for p in hits:
                line_lat, line_lon = labeled_points[p][1], labeled_points[p][0]

                # distance restricted to the requested coordinate(s)
                if lat and lon:
                    dist = calc_dist(user_lat, user_lon, line_lat, line_lon)
                elif lat:
                    dist = calc_dist(user_lat, line_lon, line_lat, line_lon)
                elif lon:
                    dist = calc_dist(line_lat, user_lon, line_lat, line_lon)
                else:
                    raise ValueError('Need to use at least one of latitude/longitude')

                matches.append([path_id, gps_id, labeled_points[p][-1], dist])

    keys = ['path_id', 'gps_id', 'transport_id']
    pt_overlap = pd.DataFrame(matches, columns=keys + ['dist'])
    pt_overlap = pt_overlap.sort_values(keys + ['dist'])

    # keep the closest vertex of every line for every gps point
    return pt_overlap.groupby(keys, as_index=False).agg({'dist': 'min'})

In [96]:
def score_lines(overlap, nw_points, threshold=0.65, discount_factor=0.99, route_dict=fribourg_routes):
    """Score every candidate public transport line for every non-walk segment.

    For each GPS point, a line gets
    discount_factor ** ((dist - nearest) / nearest), where `nearest` is the
    distance to the closest line at that point (1.0 for the closest line). The
    per-point scores of a line are multiplied over the segment, then multiplied
    by the share of the segment's points at which the line was found.

    Parameters
    ----------
    overlap : DataFrame with columns path_id, gps_id, transport_id, dist
        (output of get_closest_pt_lines). Not modified.
    nw_points : list of lists of GPS point ids, one list per segment.
    threshold : float
        Lines scoring at or below this value are dropped.
    discount_factor : float
        Base of the distance penalty.
    route_dict : dict
        transport_id -> route description exposing a 'ref' entry.

    Returns
    -------
    DataFrame with columns path_id, score, first_point_id, last_point_id,
    route_short_name, sorted by path_id then by descending score.

    Raises
    ------
    Exception
        If no line scores above `threshold`.
    """
    # work on a copy so the caller's frame does not silently gain a 'score' column
    overlap = overlap.copy()

    # distance from each gps point to its closest public transport line point
    nearest = overlap.groupby('gps_id')['dist'].transform('min')
    excess = overlap['dist'] - nearest

    # Relative excess distance. The closest line has an excess of exactly 0 and must
    # score 1 even when `nearest` is 0 (0/0 would otherwise give NaN and eliminate
    # the best candidate); a positive excess over nearest == 0 gives inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_excess = np.where(excess == 0, 0.0, excess / nearest)

    # score closests points according to distance
    overlap['score'] = discount_factor ** relative_excess

    # score each line for each segment
    overlap = overlap.groupby(['path_id', 'transport_id'], as_index=False)\
    .agg({'score': 'prod', 'gps_id': ['count', 'first', 'last']})

    overlap.columns = ['path_id', 'transport_id', 'score', 'overlap', 'first_point_id', 'last_point_id']

    # share of the segment's gps points at which the line was found
    segment_sizes = [len(nw_points[int(path_id)]) for path_id in overlap['path_id']]
    overlap['score'] = overlap['score'] * (overlap['overlap'] / segment_sizes)
    overlap = overlap.drop(columns=['overlap'])

    # filter lines that score below set threshold
    overlap = overlap[overlap.score>threshold]

    if len(overlap)==0:
        raise Exception('No public transport lines score above the set threhold')

    overlap = overlap.sort_values(['path_id', 'score'], ascending=[True, False]).reset_index(drop=True)

    # add official line names to resulting dataframe
    overlap['route_short_name'] = [route_dict[transport_id].get('ref') for transport_id in overlap['transport_id']]
    overlap = overlap.drop(columns=['transport_id'])

    return overlap

# GTFS - Timetable data

In [36]:
# user travel date (naive midnight; it is localised to `timezone` when the
# timetable timestamps are built below)
# NOTE(review): hard-coded; it must match the day covered by the user GPS trace
travel_day = datetime(2019,3,26)

In [37]:
# convert timestamp to unix timestamp (taking timezone into account)
def _gtfs_time_to_unix(hhmmss):
    """UNIX timestamp of the 'HH:MM:SS' timetable time `hhmmss` on `travel_day`, in `timezone`."""
    offset = timedelta(hours=int(hhmmss[:2]), minutes=int(hhmmss[3:5]), seconds=int(hhmmss[6:]))
    return timezone.localize(travel_day + offset).timestamp()

fribourg_sbb['arrival_timestamp'] = fribourg_sbb['arrival_time'].apply(_gtfs_time_to_unix)
fribourg_sbb['departure_timestamp'] = fribourg_sbb['departure_time'].apply(_gtfs_time_to_unix)

In [97]:
def check_direction(trip_id, source_stop_ID, min_start_dist, seg_start_id, seg_end_id, route_name, trips, user_df):
    """Check that a timetable trip runs from the segment start towards the segment end.

    Parameters
    ----------
    trip_id : id of the trip to check.
    source_stop_ID : stop_sequence of the trip's stop closest to the segment start.
    min_start_dist : distance between that stop and the segment start.
    seg_start_id, seg_end_id : index labels in `user_df` of the segment's first / last point.
    route_name : unused; kept for interface compatibility.
    trips : DataFrame with trip_id, stop_sequence, stop_lat and stop_lon columns.
    user_df : DataFrame with 'latitude' and 'longitude' columns.

    Returns
    -------
    [start, end] when the stop closest to the segment end comes later in the
    trip than `source_stop_ID`; each entry is
    [trip_id, stop_sequence, gps point id, distance to stop, is_start].
    None when the trip runs the other way or has no rows in `trips`.
    """
    # get trip information
    trip = trips[trips.trip_id==trip_id].copy()
    if trip.empty:
        return None

    # get coordinates for end of segment
    end_lat, end_lon = user_df.loc[seg_end_id][['latitude', 'longitude']].values

    # distance of every stop of the trip to the segment end; columns are read by
    # name so the result does not depend on the column order of `trips`
    trip['dist2end'] = [calc_dist(stop_lat, stop_lon, end_lat, end_lon)
                        for stop_lat, stop_lon in zip(trip['stop_lat'], trip['stop_lon'])]
    min_end_dist = trip.dist2end.min()

    # get the trip stop that is nearest to the segment end
    destination_stop_ID = trip[trip.dist2end==min_end_dist].stop_sequence.values[0]

    start = [trip_id, source_stop_ID, seg_start_id, min_start_dist, True]
    end = [trip_id, destination_stop_ID, seg_end_id, min_end_dist, False]

    # if the stop closest to end is a later stop than the one closest to start, return start and end data
    if destination_stop_ID > source_stop_ID:
        return [start, end]
    return None

In [111]:
# NOTE(review): overlap_lat_lon_scores_df is not assigned in any cell shown above
# this point; presumably it is the result of score_lines(...) -- confirm that the
# notebook still runs top to bottom on a fresh kernel.
overlap_lat_lon_scores_df

Unnamed: 0,path_id,score,first_point_id,last_point_id,route_short_name
0,0,0.93617,1,47,123
1,0,0.93617,1,47,2
2,0,0.93617,1,47,6
3,0,0.673952,1,47,181
4,0,0.657422,1,47,125
5,0,0.657422,1,47,127
6,1,0.970203,48,122,2
7,1,0.832522,48,122,6
8,2,1.0,134,170,2
9,2,1.0,134,170,6


In [133]:
def gen_candidates(scores_df, user_df, transport_df, lat=True, lon=True, max_late=900, max_early=300):
    """Find the timetable trips that could have carried the user on each scored line.

    Parameters
    ----------
    scores_df : DataFrame with columns path_id, first_point_id, last_point_id
        and route_short_name (output of score_lines).
    user_df : DataFrame of GPS points with latitude, longitude, timestamp_unix.
    transport_df : timetable DataFrame with trip_id, route_short_name,
        stop_sequence, stop_lat, stop_lon, arrival_timestamp, departure_timestamp.
    lat, lon : bool
        Which coordinate(s) to use for distances; at least one must be True.
    max_late, max_early : number
        A departure is accepted from `max_late` seconds before to `max_early`
        seconds after the segment start (defaults: 15 and 5 minutes).

    Returns
    -------
    DataFrame: the rows of `transport_df` for the boarding and alighting stop of
    every candidate trip, extended with gps_id, dist_to_stop, is_start and
    path_id, sorted by path_id, trip_id, arrival_timestamp, stop_sequence.

    Raises
    ------
    ValueError
        If a distance must be computed while both `lat` and `lon` are False.
    """
    candidates = []

    for line in scores_df.itertuples(index=False):
        start_id, end_id = line.first_point_id, line.last_point_id

        # get time of segment start
        start_point = user_df.loc[start_id]
        start_time = start_point.timestamp_unix
        earliest, latest = start_time - max_late, start_time + max_early

        # trips with at least one departure inside the accepted time window
        in_window = (transport_df.departure_timestamp > earliest) & (transport_df.departure_timestamp < latest)
        trip_ids = transport_df[in_window].trip_id.unique()

        # get trip information for those trips that are associated to the line of interest
        line_trips = transport_df[(transport_df.route_short_name == line.route_short_name)
                                  & (transport_df.trip_id.isin(trip_ids))].copy()

        # identify each line's stop closest to the segment start
        stop_coords = list(zip(line_trips['stop_lat'], line_trips['stop_lon']))
        if lat and lon:
            distances = [calc_dist(start_point.latitude, start_point.longitude, s_lat, s_lon)
                         for s_lat, s_lon in stop_coords]
        elif lat:
            distances = [calc_dist(start_point.latitude, s_lon, s_lat, s_lon) for s_lat, s_lon in stop_coords]
        elif lon:
            distances = [calc_dist(s_lat, start_point.longitude, s_lat, s_lon) for s_lat, s_lon in stop_coords]
        else:
            raise ValueError('Need to use at least one of latitude/longitude')

        line_trips['dist2start'] = distances
        line_trips = line_trips.sort_values(['trip_id', 'dist2start'])

        # per trip, keep the stop closest to the segment start ...
        boarding = line_trips.groupby('trip_id', as_index=False).agg({'stop_sequence': 'first',
                                                                      'departure_timestamp': 'first',
                                                                      'dist2start': 'first'})
        # ... and require the departure AT THAT STOP to be inside the time window
        boarding = boarding[(boarding.departure_timestamp > earliest)
                            & (boarding.departure_timestamp < latest)]

        for trip_id, stop_sequence, dist2start in boarding[['trip_id', 'stop_sequence', 'dist2start']].values:
            # check that the line's trip is heading in right direction
            direction = check_direction(trip_id, stop_sequence, dist2start, start_id, end_id,
                                        line.route_short_name, line_trips, user_df)
            if direction is None:
                continue
            # plain list concatenation keeps the native types; np.append would
            # turn every field, including the is_start flag, into a string
            candidates.extend(stop + [line.path_id] for stop in direction)

    # add gtfs data (from the timetable that was passed in, not a notebook global)
    candidates_df = pd.DataFrame(candidates, columns=['trip_id', 'stop_sequence', 'gps_id', 'dist_to_stop', 'is_start', 'path_id'])
    candidates_df = candidates_df.astype(dtype={'stop_sequence': int, 'dist_to_stop': float, 'path_id': int})
    candidates_df = pd.merge(transport_df, candidates_df, on=['trip_id', 'stop_sequence'])
    candidates_df = candidates_df.sort_values(['path_id', 'trip_id', 'arrival_timestamp', 'stop_sequence'])
    candidates_df = candidates_df.reset_index(drop=True)

    return candidates_df

In [134]:
def top_candidates(candidates_df, user_df):
    """Pick, per segment and line, the candidate trip that best fits the GPS trace.

    Parameters
    ----------
    candidates_df : DataFrame with columns path_id, route_short_name, trip_id,
        arrival_timestamp, departure_timestamp, gps_id, dist_to_stop, is_start
        (output of gen_candidates). `is_start` may hold booleans or the strings
        'True' / 'False'; `gps_id` may hold integers or numeric strings.
    user_df : DataFrame of GPS points with a 'timestamp_unix' column, indexed by
        the ids found in `gps_id`.

    Returns
    -------
    DataFrame with columns path_id, route_short_name, time_diff, dist_to_stop:
    for each (path_id, route_short_name), the mean time difference and mean
    stop distance of the trip with the smallest mean stop distance (ties broken
    by the smallest mean time difference).
    """
    candidates_df = candidates_df.copy()

    # 'False' is a truthy string, so the flag is decoded explicitly
    is_start = candidates_df['is_start'].map(lambda flag: flag in (True, 'True')).values.astype(bool)

    # boarding rows are compared with the departure time, alighting rows with the arrival time
    scheduled = np.where(is_start,
                         candidates_df['departure_timestamp'].values,
                         candidates_df['arrival_timestamp'].values)
    observed = user_df.loc[candidates_df['gps_id'].astype(int).tolist(), 'timestamp_unix'].values

    # calculate absolute time difference with departure/arrival of line and segment start/end time
    candidates_df['time_diff'] = np.abs(scheduled - observed)

    # average the computed time difference and distances for each trip id per segment
    candidates_df = candidates_df.groupby(['path_id', 'route_short_name', 'trip_id'], as_index=False)\
    .agg({'time_diff': 'mean', 'dist_to_stop': 'mean'})

    candidates_df = candidates_df.round(2)

    # best trip first: closest stops, then smallest time difference
    candidates_df = candidates_df.sort_values(['path_id', 'dist_to_stop', 'time_diff'])

    candidates_df = candidates_df.groupby(['path_id', 'route_short_name'], as_index=False)\
    .agg({'time_diff': 'first', 'dist_to_stop': 'first'})

    return candidates_df

# Transport Public Fribourgeois - Live Data

In [41]:
def read_tpc(path, date=None):
    """Load every ';'-separated file of a directory into one DataFrame.

    Parameters
    ----------
    path : str
        Directory prefix; concatenated with each file name as-is, so it must
        end with a path separator.
    date : str, optional
        When given, only files whose name contains this substring are read.

    Returns
    -------
    pandas.DataFrame
        The files concatenated in sorted file-name order (the original row
        index of each file is kept).

    Raises
    ------
    ValueError
        If no file could be read.

    Empty files are reported and skipped; any other read error is raised.
    """
    # sorted: os.listdir returns names in arbitrary order
    filenames = sorted(os.listdir(path))

    if date is not None:
        filenames = [x for x in filenames if date in x]

    files = []
    for f in filenames:
        try:
            files.append(pd.read_csv(path+f, sep=';'))
        except pd.errors.EmptyDataError:
            # only a genuinely empty file is skipped; other failures must surface
            print("Empty file: ", f)

    if not files:
        raise ValueError('No readable data file found in {!r} (date filter: {!r})'.format(path, date))

    return pd.concat(files)

In [42]:
# get the day of travel from user gps data, expressed in the notebook's timezone
# (without tz, datetime.fromtimestamp would use the host machine's local timezone
# and could report the previous or next day)
date = datetime.fromtimestamp(user_gps_df.iloc[0].timestamp_unix, tz=timezone).strftime("%Y-%m-%d")
print(date)

2019-03-26


In [43]:
# read public transport gps data for the day of travel derived from the user trace
# (instead of repeating the date as a literal)
citylines_df = read_tpc('./TPF/CityLine_2019-03/', date)
regionallines_df = read_tpc('./TPF/RegionalLine 2019-03/', date)

Empty file:  2019-03-26_1002_1.plc


In [44]:
# Stack city and regional line records into one table (row indices are not reset)
lines_df = pd.concat([citylines_df, regionallines_df])

In [45]:
# select columns of interest
tpf_df = lines_df[['Vehicle', 'Day', 'Arrival', 'Departure', 'Latitude', 'Longitude', 'Line', 
                   'Journey', 'Block', 'JourneySeq', 'Stop', 'StopName', 'StopSeq']].copy()

In [46]:
# convert date to datetime
tpf_df['Day'] = tpf_df['Day'].apply(lambda x: timezone.localize(datetime.strptime(x, "%Y-%m-%d")))

In [47]:
# compute actual arrival/departure times and respective unix timestamp
actual_arrivals = []
actual_arrival_timestamps = []
actual_departures = []
actual_departure_timestamps = []

for tup in tpf_df.itertuples():
    actual_arrival = tup[2] + timedelta(hours=int(tup[3][:2]), minutes=int(tup[3][3:5]), seconds=int(tup[3][6:8]))
    actual_arrivals.append(actual_arrival)
    actual_arrival_timestamps.append(actual_arrival.timestamp())
    
    actual_departure = tup[2] + timedelta(hours=int(tup[4][:2]), minutes=int(tup[4][3:5]), seconds=int(tup[4][6:8]))
    actual_departures.append(actual_departure)
    actual_departure_timestamps.append(actual_departure.timestamp())
    
tpf_df.loc[: ,'actual_arrival'] = actual_arrivals
tpf_df.loc[: ,'actual_departure'] = actual_departures

tpf_df.loc[:, 'arrival_timestamp'] = actual_arrival_timestamps
tpf_df.loc[:, 'departure_timestamp'] = actual_departure_timestamps

In [48]:
# create dict mapping stop -> coordinates
stop_dict = tpf_df[~tpf_df.Stop.isnull()][['Stop', 'Latitude', 'Longitude']]
stop_dict = stop_dict.groupby('Stop').agg({'Latitude': 'mean', 'Longitude': 'mean'})
stop_dict = stop_dict.to_dict('index')

In [49]:
# for those rows having stop name but no coordinates, populate coordinates using above dict
tpf_df.loc[(tpf_df.Latitude.isnull())&(~tpf_df.Stop.isnull()), 'Latitude'] = tpf_df[(tpf_df.Latitude.isnull())&(~tpf_df.Stop.isnull())].apply(lambda x: stop_dict.get(x.Stop)['Latitude'], axis=1)
tpf_df.loc[(tpf_df.Longitude.isnull())&(~tpf_df.Stop.isnull()), 'Longitude'] = tpf_df[(tpf_df.Longitude.isnull())&(~tpf_df.Stop.isnull())].apply(lambda x: stop_dict.get(x.Stop)['Longitude'], axis=1)
tpf_df = tpf_df[(~tpf_df.Latitude.isnull())&(~tpf_df.Longitude.isnull())]

In [50]:
# dynamic time warping implementation
def dtw(s, t, window):
    """Windowed dynamic-time-warping distance between two sequences.

    The local cost of aligning s[i] with t[j] is the sum of absolute differences
    between the two elements, so elements may be scalars or coordinate vectors.

    Parameters
    ----------
    s, t : array-like
        Sequences to align (first axis is the sequence axis). Anything
        `np.asarray(..., dtype=float)` accepts, e.g. lists or `DataFrame.values`.
    window : int
        Half-width of the band around the diagonal in which alignments are
        allowed. It is widened to `abs(len(s) - len(t))` when smaller, because
        otherwise the final cell could not be reached.

    Returns
    -------
    float
        Accumulated cost of the cheapest warping path; `inf` when exactly one
        of the sequences is empty, 0 when both are.
    """
    s = np.asarray(s, dtype=float)
    t = np.asarray(t, dtype=float)
    n, m = len(s), len(t)
    w = max(window, abs(n - m))

    # every cell starts unreachable; cells inside the band are overwritten below before
    # they are ever read, so no separate zero-initialisation pass is needed
    dtw_matrix = np.full((n + 1, m + 1), np.inf)
    dtw_matrix[0, 0] = 0

    for i in range(1, n + 1):
        for j in range(max(1, i - w), min(m, i + w) + 1):
            # np.sum handles scalar elements as well as coordinate vectors
            cost = np.sum(np.abs(s[i - 1] - t[j - 1]))
            # cheapest of the three admissible predecessors (np.min keeps NaN propagation)
            last_min = np.min([dtw_matrix[i - 1, j], dtw_matrix[i, j - 1], dtw_matrix[i - 1, j - 1]])
            dtw_matrix[i, j] = cost + last_min
    return dtw_matrix[-1, -1]

In [65]:
def get_top_scorers(nw_points, transport_df, user_df, max_dist=0.2, threshold=1.5, lat=True, lon=True):
    """Match each non-walk segment of the user trace to the vehicles whose GPS trace is closest.

    For every segment, the vehicle fixes inside the segment's time window (30 s buffer
    on each end) are grouped by vehicle and compared with the user's coordinates
    using `dtw` (window 2).

    Parameters
    ----------
    nw_points : iterable of sequences
        One sequence of `user_df` index labels per non-walk segment.
    transport_df : DataFrame
        Vehicle fixes; reads the columns Vehicle, Line, Latitude, Longitude,
        departure_timestamp and arrival_timestamp.
    user_df : DataFrame
        User trace; reads the columns latitude, longitude and timestamp_unix.
    max_dist : float
        Vehicles with a DTW score of `max_dist` or more are never reported.
    threshold : float
        A vehicle is kept in `top` only if its score is at most `threshold` times
        the best score of the segment.
    lat, lon : bool
        Which coordinates take part in the comparison; at least one must be True.

    Returns
    -------
    (first, top)
        `first` holds one `(vehicle, score, line)` tuple per segment that has a best
        vehicle below `max_dist`; `top` holds, per such segment, the list of all
        retained tuples. Segments without any match contribute nothing, so the lists
        are not index-aligned with `nw_points`.

    Raises
    ------
    ValueError
        If both `lat` and `lon` are False.
    """
    if not (lat or lon):
        raise ValueError('Need to make use of at least one of latitude or longitude')

    # use specified information (same selection for the user and the vehicle side)
    user_cols = [col for col, used in (('latitude', lat), ('longitude', lon)) if used]
    bus_cols = [col for col, used in (('Latitude', lat), ('Longitude', lon)) if used]

    first = []
    top = []
    for s in nw_points:
        # filter the day's gps data to time window of non walk segment with 30s buffer on each end
        window_start = user_df.loc[s[0]].timestamp_unix - 30
        window_end = user_df.loc[s[-1]].timestamp_unix + 30
        candidates = transport_df[(transport_df.departure_timestamp > window_start)
                                  & (transport_df.arrival_timestamp < window_end)]

        user_coords = user_df.loc[s][user_cols].values

        # for each vehicle in time window compute distance using dynamic time warping
        scored = []
        for v in candidates.Vehicle.unique():
            bus = candidates[candidates.Vehicle == v]
            line = bus[~bus.Line.isnull()].Line.unique()

            # if vehicle covers multiple lines in time window or there is only one GPS point for it, skip vehicle
            if len(line) != 1 or len(bus) == 1:
                continue

            score = dtw(user_coords, bus[bus_cols].values, 2)
            scored.append((v, score, line[0]))

        # no usable vehicle in this window: nothing to report (min() of an empty list would raise)
        if not scored:
            continue

        min_score = min(score for _, score, _ in scored)

        # collect results for those vehicles having a distance of less than max_dist with user segment
        # additionally discard vehicles having a score significantly worse than the best
        within_reach = [x for x in scored if x[1] < max_dist]
        first_candidate = [x for x in within_reach if x[1] == min_score]
        # '<=' keeps the best vehicle in the list even when its score is exactly 0
        top_group = [x for x in within_reach if x[1] <= threshold * min_score]

        if len(first_candidate) > 0:
            first.append(first_candidate[0])
        if len(top_group) > 0:
            top.append(top_group)

    return first, top

# Testing

### OSM

In [100]:
# Run the OSM step three times on the same inputs: with the helper's default lat/lon flags,
# with lon=False, and with lat=False. The two single-flag runs pass dist_threshold=50,
# the first run relies on the helper's default threshold.
# NOTE(review): get_closest_pt_lines is defined outside this section - presumably lat/lon select
# which coordinate is used for matching, as in get_top_scorers; confirm.
overlap_lat_lon_df = get_closest_pt_lines(fbg_tree, non_walk_points, user_gps_df, labeled_points)
overlap_lat_df = get_closest_pt_lines(fbg_tree, non_walk_points, user_gps_df, labeled_points, 
                                      lon=False, dist_threshold=50)
overlap_lon_df = get_closest_pt_lines(fbg_tree, non_walk_points, user_gps_df, labeled_points, 
                                      lat=False, dist_threshold=50)

In [101]:
# Score the lines found by each of the three overlap runs. The combined run uses score_lines'
# defaults; the single-coordinate runs override discount_factor=1 and threshold=0.5.
overlap_lat_lon_scores_df = score_lines(overlap_lat_lon_df, non_walk_points)
overlap_lat_scores_df = score_lines(overlap_lat_df, non_walk_points, discount_factor=1, threshold=0.5)
overlap_lon_scores_df = score_lines(overlap_lon_df, non_walk_points, discount_factor=1, threshold=0.5)

### Timetable

In [135]:
# Generate timetable candidates (against fribourg_sbb) for each of the three scored variants,
# forwarding the same lat/lon switch that produced the corresponding overlap frame.
candidates_lat_lon_df = gen_candidates(overlap_lat_lon_scores_df, user_gps_df, fribourg_sbb)
candidates_lat_df = gen_candidates(overlap_lat_scores_df, user_gps_df, fribourg_sbb, lon=False)
candidates_lon_df = gen_candidates(overlap_lon_scores_df, user_gps_df, fribourg_sbb, lat=False)

In [136]:
# Reduce each candidate frame with top_candidates (helper defined outside this section),
# always against the unmodified user trace.
top_candidates_lat_lon_df = top_candidates(candidates_lat_lon_df, user_gps_df)
top_candidates_lat_df = top_candidates(candidates_lat_df, user_gps_df)
top_candidates_lon_df = top_candidates(candidates_lon_df, user_gps_df)

In [174]:
# attach the OSM line score to every timetable candidate (inner join on segment and line)
top_lat_lon = top_candidates_lat_lon_df.merge(overlap_lat_lon_scores_df, on=['path_id', 'route_short_name'])
# dense rank within each segment, highest score first; ties share a rank
top_lat_lon['line_rank'] = top_lat_lon.groupby('path_id')['score'].rank(method='dense', ascending=False)
# show the best-ranked line(s) per segment
top_lat_lon.loc[top_lat_lon.line_rank == 1,
                ['path_id', 'route_short_name', 'score', 'time_diff', 'dist_to_stop']]

Unnamed: 0,path_id,route_short_name,score,time_diff,dist_to_stop
0,0,2,0.93617,66.0,68.77
1,0,6,0.93617,195.0,68.77
2,1,2,0.970203,173.0,45.61
4,2,2,1.0,50.0,100.96
5,2,6,1.0,130.0,100.96
6,3,1,1.0,33.5,16.54


In [175]:
# attach the latitude-only OSM line score to every timetable candidate (inner join on segment and line)
top_lat = top_candidates_lat_df.merge(overlap_lat_scores_df, on=['path_id', 'route_short_name'])
# dense rank within each segment, highest score first; ties share a rank
top_lat['line_rank'] = top_lat.groupby('path_id')['score'].rank(method='dense', ascending=False)
# show the best-ranked line(s) per segment
top_lat.loc[top_lat.line_rank == 1,
            ['path_id', 'route_short_name', 'score', 'time_diff', 'dist_to_stop']]

Unnamed: 0,path_id,route_short_name,score,time_diff,dist_to_stop
1,0,2,0.914894,66.0,41.38
2,0,6,0.914894,294.0,40.35
4,1,2,0.986667,173.0,41.09
6,2,2,1.0,50.0,83.72
8,2,6,1.0,130.0,83.72
9,3,1,0.976744,33.5,10.18
10,4,1,0.688889,90.5,123.47


In [176]:
# attach the longitude-only OSM line score to every timetable candidate (inner join on segment and line)
top_lon = top_candidates_lon_df.merge(overlap_lon_scores_df, on=['path_id', 'route_short_name'])
# dense rank within each segment, highest score first; ties share a rank
top_lon['line_rank'] = top_lon.groupby('path_id')['score'].rank(method='dense', ascending=False)
# show the best-ranked line(s) per segment
top_lon.loc[top_lon.line_rank == 1,
            ['path_id', 'route_short_name', 'score', 'time_diff', 'dist_to_stop']]

Unnamed: 0,path_id,route_short_name,score,time_diff,dist_to_stop
1,0,2,0.914894,66.0,62.77
2,0,6,0.914894,195.0,62.77
4,1,2,0.986667,143.0,38.1
6,2,2,1.0,50.0,100.75
7,2,6,1.0,190.0,94.92
8,3,1,0.976744,33.5,16.52
9,4,1,0.688889,90.5,112.52


### Bus GPS

In [66]:
# Score every non-walk segment against the vehicle GPS fixes in tpf_df: with both coordinates
# (defaults), latitude only (lon=False) and longitude only (lat=False).
# NOTE(review): top_lat_lon / top_lat / top_lon are also bound to merged DataFrames by the
# Timetable cells of this notebook; whichever cell ran last wins - consider distinct names.
first_lat_lon, top_lat_lon = get_top_scorers(non_walk_points, tpf_df, user_gps_df)
first_lat, top_lat = get_top_scorers(non_walk_points, tpf_df, user_gps_df, lon=False)
first_lon, top_lon = get_top_scorers(non_walk_points, tpf_df, user_gps_df, lat=False)

In [67]:
first_lat_lon

[(522, 0.06452907999999002, 2.0),
 (528, 0.13569899999996604, 2.0),
 (561, 0.03837370999999301, 6.0),
 (597, 0.07644924000005648, 1.0)]

In [68]:
top_lat_lon

[[(522, 0.06452907999999002, 2.0)],
 [(561, 0.16515490000003297, 6.0), (528, 0.13569899999996604, 2.0)],
 [(521, 0.04392035000000494, 2.0), (561, 0.03837370999999301, 6.0)],
 [(597, 0.07644924000005648, 1.0)]]

In [69]:
first_lat

[(522, 0.01665822999999733, 2.0),
 (528, 0.027673339999957136, 2.0),
 (561, 0.006762810000012109, 6.0),
 (597, 0.043538900000058334, 1.0),
 (533, 0.06788880999999236, 2.0)]

In [70]:
first_lon

[(522, 0.0446792500000015, 2.0),
 (528, 0.1056244400000077, 2.0),
 (529, 0.02617522000000072, 3.0),
 (597, 0.0227453000000013, 1.0),
 (352, 0.11560435999999186, 4.0)]

### GPS Noise

In [184]:
def blur_location(lat, lon, radius_m, fixed=True):
    """Displace a coordinate along a random bearing.

    Parameters
    ----------
    lat, lon : float
        Position to displace.
    radius_m : float
        Displacement in metres. Used as-is when `fixed` is True; otherwise it is the
        upper bound of a displacement drawn uniformly from [0, radius_m].
    fixed : bool
        Whether the displacement length is exactly `radius_m`.

    Returns
    -------
    tuple
        `(latitude, longitude)` of the displaced point.
    """
    start = geopy.Point(lat, lon)

    # the radius (if random) is drawn before the bearing, keeping the order of RNG calls stable
    offset_m = radius_m if fixed else random.uniform(0, radius_m)
    bearing_deg = random.uniform(0, 360)

    moved = distance(meters=offset_m).destination(start, bearing_deg)
    return moved.latitude, moved.longitude

In [185]:
blurred_user_gps_df = user_gps_df.copy()

# Displace every fix by exactly 100 m in a random direction.
# Coordinates are selected by column name: positional tuple access (tup[1], tup[2]) silently
# picks whatever the first two columns are, which need not be latitude/longitude.
# NOTE(review): blur_location draws from the global `random` state, which is not seeded here,
# so the blurred trace differs between runs.
blurred_coords = [
    blur_location(lat, lon, radius_m=100, fixed=True)
    for lat, lon in zip(blurred_user_gps_df['latitude'], blurred_user_gps_df['longitude'])
]

blurred_user_gps_df[['latitude', 'longitude']] = blurred_coords

In [186]:
# GPS
# Same vehicle-GPS scoring as for the clean trace, but fed with the blurred user positions.
# NOTE(review): non_walk_points is reused unchanged - presumably it was derived from the
# unblurred trace; confirm that this is intended.
first_blurred, top_blurred = get_top_scorers(non_walk_points, tpf_df, blurred_user_gps_df)
first_blurred

[(522, 0.09336224531648174, 2.0),
 (528, 0.17035124410355618, 2.0),
 (561, 0.06198159482444243, 6.0),
 (597, 0.09573816503377497, 1.0)]

In [193]:
# OSM
# Overlap + scoring on the blurred trace. dist_threshold is 120 here (the unblurred
# single-coordinate runs used 50) - presumably to absorb the 100 m blur radius; confirm.
overlap_blur_df = get_closest_pt_lines(fbg_tree, non_walk_points, blurred_user_gps_df, labeled_points, 
                                       dist_threshold=120)

overlap_blur_scores_df = score_lines(overlap_blur_df, non_walk_points, threshold=0.5)

In [194]:
# Timetable
# Candidate generation and reduction for the blurred trace, using the blurred positions
# in both steps.
candidates_blur_df = gen_candidates(overlap_blur_scores_df, blurred_user_gps_df, fribourg_sbb)
top_candidates_blur_df = top_candidates(candidates_blur_df, blurred_user_gps_df)

In [195]:
# attach the blurred-trace OSM line score to every timetable candidate (inner join on segment and line)
top_blur = top_candidates_blur_df.merge(overlap_blur_scores_df, on=['path_id', 'route_short_name'])
# dense rank within each segment, highest score first; ties share a rank
top_blur['line_rank'] = top_blur.groupby('path_id')['score'].rank(method='dense', ascending=False)
# show the best-ranked line(s) per segment
top_blur.loc[top_blur.line_rank == 1,
             ['path_id', 'route_short_name', 'score', 'time_diff', 'dist_to_stop']]

Unnamed: 0,path_id,route_short_name,score,time_diff,dist_to_stop
1,0,2,0.804958,66.0,97.1
4,2,6,0.981838,130.0,82.41
5,3,1,0.834979,33.5,100.03


### Time split

In [82]:
# Split the user trace into `splits` consecutive fragments and run both matching pipelines
# (OSM + timetable, and vehicle GPS) on every fragment separately.
splits = 5
total_rows = user_gps_df.shape[0]
frag_size = round(total_rows/splits)

# reset up front so a failed run cannot leave results of an earlier run behind
osm_res_df = None
gps_res_df = None

# per-fragment result frames, concatenated once after the loop
osm_frames = []
gps_frames = []

for s in range(splits):
    start_index = s * frag_size
    # round() may round the fragment size down; the last fragment therefore runs to the end
    # of the trace so that trailing rows are not silently left out
    end_index = total_rows if s == splits - 1 else start_index + frag_size
    frag_user_gps_df = user_gps_df.iloc[start_index:end_index].reset_index(drop=True)

    # segment the fragment on its own; point ids are relative to the fragment (index was reset)
    frag_edge_df = points_to_edges(frag_user_gps_df)
    frag_edge_df, frag_walk_points, frag_non_walk_points = generate_segment_ids(frag_edge_df)
    frag_walk_points, frag_non_walk_points = edge_to_point_id(frag_edge_df, frag_walk_points, frag_non_walk_points)

    print('Fragment {}, Start index {}'.format(s, start_index), frag_non_walk_points)

    if frag_non_walk_points is None:
        continue

    # OSM + timetable pipeline
    overlap_frag_df = get_closest_pt_lines(fbg_tree, frag_non_walk_points, frag_user_gps_df, labeled_points,
                                           dist_threshold=70)
    overlap_frag_scores_df = score_lines(overlap_frag_df, frag_non_walk_points, threshold=0.5)
    candidates_frag_df = gen_candidates(overlap_frag_scores_df, frag_user_gps_df, fribourg_sbb)
    top_candidates_frag_df = top_candidates(candidates_frag_df, frag_user_gps_df)
    top_candidates_frag_df['frag_id'] = s
    osm_frames.append(top_candidates_frag_df)

    # vehicle GPS pipeline: keep every retained vehicle of every segment (flattened `top` lists)
    first_frag, top_frag = get_top_scorers(frag_non_walk_points, tpf_df, frag_user_gps_df)
    gps_frag_df = pd.DataFrame([x for a in top_frag for x in a], columns=['vehicle_id', 'score', 'line'])
    gps_frag_df['frag_id'] = s
    gps_frames.append(gps_frag_df)

# the original index of every fragment frame is kept, as before; None when nothing was produced
osm_res_df = pd.concat(osm_frames) if osm_frames else None
gps_res_df = pd.concat(gps_frames) if gps_frames else None

Fragment 0, Start index 0 [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]]
Fragment 1, Start index 65 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]]
Fragment 2, Start index 130 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [60, 61, 62, 63, 64]]
Fragment 3, Start index 195 [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 6

In [78]:
osm_res_df

Unnamed: 0,path_id,route_short_name,time_diff,dist_to_stop,frag_id
0,0,1,47.5,92.23,0
1,0,2,66.0,68.77,0
2,0,6,195.0,68.77,0
3,1,1,19.0,87.74,0
4,1,123,806.5,435.01,0
5,1,127,686.5,209.62,0
6,1,2,153.5,84.22,0
7,1,6,446.5,84.22,0
0,0,2,50.5,82.09,1
1,0,6,98.5,82.09,1


In [81]:
gps_res_df

Unnamed: 0,vehicle_id,score,line,frag_id
0,522,0.064529,2.0,0
1,352,0.064285,4.0,0
0,528,0.085588,2.0,1
0,561,0.042647,6.0,2
1,597,0.00108,1.0,2
0,597,0.112256,1.0,3
1,555,0.093636,1.0,3
0,563,0.195356,6.0,4


In [83]:
gps_res_df

Unnamed: 0,vehicle_id,score,line,frag_id
0,522,0.064529,2.0,0
1,354,0.088003,4.0,0
2,352,0.064285,4.0,0
3,528,0.071244,2.0,0
4,533,0.085294,2.0,0
0,528,0.085588,2.0,1
0,521,0.057593,2.0,2
1,561,0.042647,6.0,2
2,597,0.00108,1.0,2
0,597,0.112256,1.0,3
