We will be using the following packages for this notebook. 
You can install them with 'conda install <package>' or 'pip install <package>'

In [1]:
import pandas as pd
import datetime as dt
import pyproj
import numpy as np


import os

#import geopandas as gpd
#from geographiclib.geodesic import Geodesic
#from geopy.distance import vincenty
#from shapely.geometry import Point

Depending on the device, GPS trajectories can also contain measurements of velocity and acceleration. This dataset does not contain such information, but it can be calculated from XY-coordinates and timestamps of two points. 

velocity = distance(point1, point2) / timedelta(point1, point2)
acceleration = velocitydifference(point1, point2) / timedelta(point1, point2)

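The important thing to take into account is that geographic (long, lat) coordinates are angles, not Euclidean coordinates, so the distance between two points cannot be computed with the ordinary straight-line formula before we calculate the velocity. Instead, the distance has to be measured along the surface of the earth. On a sphere this is called the great-circle distance, and it takes the curvature of the earth into account.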

The Python library pyproj can be used to convert between geographic (long, lat) and Euclidean (x, y) coordinates, and to compute distances between geographic points. Another great Python library is geopy.


see:

https://en.wikipedia.org/wiki/Great-circle_distance

https://github.com/jswhit/pyproj



In [2]:
# Shared geodesic calculator on the WGS84 ellipsoid; calculate_distance() uses
# its inv() method to measure the distance between two (long, lat) points.
geod = pyproj.Geod(ellps='WGS84')

def to_datetime(string):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    that layout.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = dt.datetime.strptime(string, timestamp_format)
    return parsed

def calculate_distance(long1, lat1, long2, lat2):
    """Return the geodesic distance between two (long, lat) points.

    Parameters
    ----------
    long1, lat1 : float
        Longitude and latitude of the first point, in degrees.
    long2, lat2 : float
        Longitude and latitude of the second point, in degrees.

    Returns
    -------
    float
        * ``np.nan`` when any coordinate is not finite (NaN or +/-inf), when a
          latitude lies outside [-90, 90] or when a longitude lies outside
          [-180, 180];
        * ``0`` when both (valid) points coincide;
        * otherwise the distance reported by ``geod.inv`` on the WGS84
          ellipsoid (metres, according to the pyproj documentation).
    """
    # Validate BEFORE the "same point" shortcut: otherwise two identical but
    # invalid points (e.g. lat == 400 twice, or inf twice) would be reported
    # as a legitimate distance of 0 instead of NaN.
    if not np.all(np.isfinite((long1, lat1, long2, lat2))):
        return np.nan
    if not (-90 <= lat1 <= 90 and -90 <= lat2 <= 90):
        return np.nan
    if not (-180 <= long1 <= 180 and -180 <= long2 <= 180):
        return np.nan

    # Identical points: skip the geodesic computation altogether.
    if lat1 == lat2 and long1 == long2:
        return 0

    # geod.inv returns (forward azimuth, back azimuth, distance); only the
    # distance is needed here.
    _, _, distance = geod.inv(long1, lat1, long2, lat2)
    return distance

def calculate_velocity(distance, timedelta):
    """Return ``distance`` divided by the duration of ``timedelta`` in seconds.

    ``timedelta`` must offer ``total_seconds()``. A duration of exactly zero
    seconds yields ``np.nan`` rather than a division by zero.
    """
    elapsed_seconds = timedelta.total_seconds()
    if elapsed_seconds != 0:
        return distance / elapsed_seconds
    return np.nan

def calculate_acceleration(velocity, velocity2, timedelta):
    """Return ``(velocity2 - velocity)`` divided by the duration in seconds.

    ``velocity`` is the starting velocity and ``velocity2`` the velocity it is
    compared against; ``timedelta`` must offer ``total_seconds()``. A duration
    of exactly zero seconds yields ``np.nan``.
    """
    velocity_change = velocity2 - velocity
    elapsed_seconds = timedelta.total_seconds()
    return velocity_change / elapsed_seconds if elapsed_seconds != 0 else np.nan


In [3]:

# Column names assigned to the raw trajectory files when they are read.
# 'null', 'timestamp_float', 'date' and 'time' are dropped again once the
# combined 'datetime' column has been built in load_trajectory_df().
headers_trajectory = [
    'lat',
    'long',
    'null',
    'altitude',
    'timestamp_float',
    'date',
    'time',
]

# Columns of the per-trajectory summary table built by retrieve_metadata().
headers_metadf = [
    'trajectory_id',
    'start_time',
    'end_time',
    'v_ave',
    'v_med',
    'a_ave',
    'a_med',
    'labels',
]


def load_trajectory_df(subfolder, filename, trajectory_id):
    """Load one trajectory file and derive per-point motion features.

    Parameters
    ----------
    subfolder : str
        Name of the folder the file belongs to; stored in the 'subfolder'
        column.
    filename : str
        Path of the trajectory file. The first six lines are skipped
        (presumably a file header -- TODO confirm against the data format) and
        the remaining lines are read as comma-separated values with the
        columns named in ``headers_trajectory``.
    trajectory_id : str
        Identifier stored in the 'trajectory_id' column.

    Returns
    -------
    pandas.DataFrame
        One row per recorded point with the columns 'lat', 'long', 'altitude',
        'trajectory_id', 'subfolder', 'labels' (empty string), 'datetime',
        'distance', 'timedelta', 'velocity' and 'acceleration'.
        'distance', 'timedelta' and 'velocity' describe the step from the
        previous row to the current one, so they are NaN/NaT on the first row;
        'acceleration' needs two steps and is NaN on the first two rows.
    """
    df = pd.read_csv(filename, skiprows=6, header=None, names=headers_trajectory)
    df['trajectory_id'] = trajectory_id
    df['subfolder'] = subfolder
    df['labels'] = ''

    # Parse every timestamp in one vectorised call instead of row by row.
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'],
                                    format='%Y-%m-%d %H:%M:%S')

    # Coordinates of the previous point; the first row has no predecessor, so
    # it gets NaN and calculate_distance() answers NaN for it.
    previous_long = df['long'].shift(1)
    previous_lat = df['lat'].shift(1)
    df['distance'] = [
        calculate_distance(lon, lat, lon_prev, lat_prev)
        for lon, lat, lon_prev, lat_prev
        in zip(df['long'], df['lat'], previous_long, previous_lat)
    ]

    # Time elapsed since the previous point (NaT on the first row).
    df['timedelta'] = df['datetime'] - df['datetime'].shift(1)
    df['velocity'] = [
        calculate_velocity(distance, elapsed)
        for distance, elapsed in zip(df['distance'], df['timedelta'])
    ]

    # calculate_acceleration() computes (second argument - first argument), so
    # the PREVIOUS velocity must be passed first. Passing the current velocity
    # first, as was done before, flips the sign of every acceleration.
    previous_velocity = df['velocity'].shift(1)
    df['acceleration'] = [
        calculate_acceleration(v_prev, v_now, elapsed)
        for v_prev, v_now, elapsed
        in zip(previous_velocity, df['velocity'], df['timedelta'])
    ]

    # The raw timestamp pieces and the unused columns are no longer needed.
    return df.drop(['null', 'timestamp_float', 'date', 'time'], axis=1)

def load_labels_df(filename):
    """Read a tab-separated labels file into a tidy DataFrame.

    The file must provide the columns 'Start Time' and 'End Time' (formatted
    as 'YYYY/MM/DD HH:MM:SS') and 'Transportation Mode'. They are returned as
    'start_time', 'end_time' (parsed timestamps) and 'labels'; the three
    original columns are removed.
    """
    label_time_format = '%Y/%m/%d %H:%M:%S'

    def parse_label_time(text):
        # Same strict parsing for both interval bounds.
        return dt.datetime.strptime(text, label_time_format)

    raw = pd.read_csv(filename, sep='\t')
    tidy = raw.assign(
        start_time=raw['Start Time'].apply(parse_label_time),
        end_time=raw['End Time'].apply(parse_label_time),
        labels=raw['Transportation Mode'],
    )
    return tidy.drop(columns=['End Time', 'Start Time', 'Transportation Mode'])

def retrieve_metadata(df):
    """Summarise every trajectory in ``df`` as one row of metadata.

    ``df`` must contain the columns 'trajectory_id', 'datetime', 'velocity',
    'acceleration' and 'labels'. For each distinct trajectory id (in order of
    first appearance) the result holds the first and last 'datetime' value,
    the NaN-ignoring mean and median of velocity and acceleration, and the
    distinct labels joined with commas. Columns follow ``headers_metadf``.
    """
    summary = pd.DataFrame(columns=headers_metadf)
    for row_number, trajectory_id in enumerate(df['trajectory_id'].unique()):
        trajectory = df[df['trajectory_id'] == trajectory_id]
        timestamps = trajectory['datetime'].values
        velocities = trajectory['velocity'].values
        accelerations = trajectory['acceleration'].values
        # Rows are appended one at a time under consecutive integer labels.
        summary.loc[row_number, :] = [
            trajectory_id,
            timestamps[0],
            timestamps[-1],
            np.nanmean(velocities),
            np.nanmedian(velocities),
            np.nanmean(accelerations),
            np.nanmedian(accelerations),
            ",".join(trajectory['labels'].unique()),
        ]
    return summary


In [4]:
labels_file = 'labels.txt'
MAIN_FOLDER = '../GPSML/Data/'
TRAJ_FOLDER = 'Trajectory/'
# os.listdir returns entries in arbitrary order; sort them so the subfolders
# are processed (and printed) in a reproducible order.
directories = sorted(os.listdir(MAIN_FOLDER))
OUTPUT_FOLDER = '../processed_data/'

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for subfolder in directories:
    subfolder_ = os.path.join(MAIN_FOLDER, subfolder, '')
    traj_folder = os.path.join(subfolder_, TRAJ_FOLDER)
    if not os.path.isdir(traj_folder):
        # Stray entries in MAIN_FOLDER (plain files, folders without a
        # trajectory directory) cannot be processed; skip them.
        continue

    traj_files = os.listdir(traj_folder)
    print(subfolder, len(traj_files))

    # Load every trajectory file of this subfolder.
    list_df_traj = []
    for traj_file in traj_files:
        trajectory_id = traj_file.split('.')[0]
        filename = os.path.join(traj_folder, traj_file)
        df_traj = load_trajectory_df(subfolder, filename, trajectory_id)
        list_df_traj.append(df_traj)
    if not list_df_traj:
        # pd.concat raises ValueError when given nothing to concatenate.
        continue
    df_traj_all = pd.concat(list_df_traj)

    # Where a labels file exists, tag every point whose timestamp falls
    # inside a labelled [start_time, end_time] interval (bounds inclusive).
    labels_path = os.path.join(subfolder_, labels_file)
    if os.path.isfile(labels_path):
        df_labels = load_labels_df(labels_path)
        # Iterate over the columns directly; DataFrame.ix, used here before,
        # no longer exists in pandas >= 1.0.
        for st, et, labels in zip(df_labels['start_time'],
                                  df_labels['end_time'],
                                  df_labels['labels']):
            if labels:
                in_interval = ((df_traj_all['datetime'] >= st) &
                               (df_traj_all['datetime'] <= et))
                df_traj_all.loc[in_interval, 'labels'] = labels

    filename = os.path.join(OUTPUT_FOLDER, subfolder + '.csv')
    filename_metadata = os.path.join(OUTPUT_FOLDER, subfolder + '_metadata.csv')

    # Writing the full per-point table is disabled; only the metadata is saved.
    #df_traj_all.to_csv(filename)
    df_metadata = retrieve_metadata(df_traj_all)
    df_metadata.to_csv(filename_metadata)

000 171
001 71
002 175
003 322
004 395
005 86
006 28
007 54
008 34
009 49
010 161
011 201
012 77
013 144
014 279
015 67
016 51
017 391
018 50
019 84
020 151
021 8
022 146
023 34
024 101
025 757
026 43
027 5
028 64
029 45
030 296
031 6
032 16
033 13
034 198
035 74
036 72
037 148
038 110
039 227
040 27
041 557
042 150
043 43
044 72
045 9
046 31
047 12
048 3
049 1
050 51
051 51
052 104
053 9
054 3
055 24
056 31
057 22
058 22
059 8
060 2
061 20
062 706
063 15
064 23
065 137




066 19
067 120
068 408
069 8
070 11
071 73
072 2
073 112
074 91
075 13




076 11
077 3
078 100
079 23
080 9
081 32
082 96
083 36
084 215
085 427
086 6
087 8
088 59
089 64
090 8
091 98
092 157
093 23
094 22
095 36
096 112
097 38
098 6
099 16
100 7
101 68
102 38
103 48
104 115
105 10
106 3
107 3
108 9
109 4
110 25
111 44
112 212
113 32
114 23
115 184
116 3
117 8
118 5
119 45
120 2
121 5
122 16
123 5
124 10
125 57




126 263




127 10
128 2153
129 8
130 20
131 21
132 6
133 5
134 75
135 13
136 17
137 1
138 18
139 19
140 380
141 34




142 156
143 1
144 610




145 6
146 10
147 84
148 3
149 1
150 22
151 1
152 4




153 2024
154 31
155 42
156 2
157 13
158 14
159 13
160 3
161 16
162 11
163 809
164 7
165 26
166 8
167 385




168 97
169 36
170 5
171 5
172 21
173 6
174 70
175 4
176 8
177 1
178 1
179 71




180 5
181 15
