In [1]:
import geopandas as gpd
import pandas as pd  # ours: 0.18.1
from shapely.geometry import Point
import datetime as dt
#from geopy.distance import vincenty
import pyproj
import numpy as np # ours: 1.12.1
from os import listdir

GeoPandas is a great tool to work with if you are using geographical data. It can read almost any vector-based spatial data format including ESRI shapefile, GeoJSON files. The data we have however is in a CSV format. Therefore we will first read it with Pandas and then convert it to GeoPandas. 

To load the long, lat coordinates into a GeoDataFrame, we first have to create an array of shapely geometries first. 

Depending on the device, GPS trajectories can also contain measurements of velocity and acceleration. This dataset does not contain such information, but it can be calculated from XY-coordinates and timestamps of two points. 

velocity = distance(point1, point2) / timedelta(point1, point2)

The important thing to take into account is that we can not simply calculate the velocity by using the Euclidean distance (which assumes that two points lie on a plane), but have to use the Great-circle distance (which takes the curvature of the earth into account). 

The Python library pyproj can be used to convert between geographic (long, lat) to euclidean coordinates (x,y). Another great python library is geopy.


see:
http://geopandas.org/io.html#reading-spatial-data

http://toblerity.org/shapely/manual.html

https://en.wikipedia.org/wiki/Great-circle_distance

https://github.com/jswhit/pyproj



In [3]:
geod = pyproj.Geod(ellps='WGS84')

def to_datetime(string):
    return dt.datetime.strptime(string, '%Y-%m-%d %H:%M:%S')

def calculate_distance(point1, point2):
    angle1,angle2,distance = geod.inv(point1.x, point1.y, point2.x, point2.y)
    return distance

def calculate_velocity(distance, timedelta):
    if timedelta.total_seconds() == 0: return np.nan
    return distance / timedelta.total_seconds()

def calculate_acceleration(velocity, velocity2, timedelta):
    delta_v = velocity2 - velocity
    if timedelta.total_seconds() == 0: return np.nan
    return delta_v / timedelta.total_seconds()

headers_trajectory = ['lat', 'long', 'null', 'altitude','timestamp_float', 'date', 'time']
headers_metadf = ['trajectory_id', 'start_time', 'end_time', 'v_ave', 'v_med', 'a_ave', 'a_med', 'labels']

def load_trajectory_df(filename, trajectory_id):
    #trajectory_id = os.path.splitext(filename)[0]
    df = pd.read_csv(filename, skiprows = 6, header = None, names = headers_trajectory)
    df['point'] = df.apply(lambda z: Point(z.long, z.lat), axis=1)
    df['point2'] = df['point'].shift(1)
    df['datetime'] = df.apply(lambda z: to_datetime(z.date + ' ' + z.time), axis=1)
    df['datetime2'] = df['datetime'].shift(1)
    df.loc[0, 'point2'] = df.ix[1]['point2']
    df.loc[0, 'datetime2'] = df.ix[1]['datetime2']
    df['distance'] = df.apply(lambda z: calculate_distance(z.point, z.point2), axis=1)
    df['timedelta'] = df.apply(lambda z: z.datetime - z.datetime2, axis=1)
    df['velocity'] = df.apply(lambda z: calculate_velocity(z.distance, z.timedelta), axis=1)
    df['velocity2'] = df['velocity'].shift(1)
    df['acceleration'] = df.apply(lambda z: calculate_acceleration(z.velocity, z.velocity2, z.timedelta), axis=1)
    df = df.drop(['lat', 'long', 'null', 'timestamp_float', 'date', 'time', 'point2', 'datetime2', 'velocity2'], axis=1)
    df['labels'] = ''
    df['trajectory_id'] = trajectory_id
    return df

def load_labels_df(filename):
    df = pd.read_csv(filename, sep='\t')
    df['start_time'] = df['Start Time'].apply(lambda x: dt.datetime.strptime(x, '%Y/%m/%d %H:%M:%S'))
    df['end_time'] = df['End Time'].apply(lambda x: dt.datetime.strptime(x, '%Y/%m/%d %H:%M:%S'))
    df['labels'] = df['Transportation Mode']
    df = df.drop(['End Time', 'Start Time', 'Transportation Mode'], axis=1)
    return df

def retrieve_metadata(df):
    df_meta = pd.DataFrame(columns = headers_metadf)
    print(df.columns.values)
    trajectory_ids = df['trajectory_id'].unique()
    for ii in range(len(trajectory_ids)):
        trajectory_id = trajectory_ids[ii]
        df_ = df[df['trajectory_id'] == trajectory_id]
        start_time = df_.head(1)['datetime'].values[0]
        end_time = df_.tail(1)['datetime'].values[0]
        v_ave = np.nanmean(df_['velocity'].values)
        v_med = np.nanmedian(df_['velocity'].values)
        a_ave = np.nanmean(df_['acceleration'].values)
        a_med = np.nanmedian(df_['acceleration'].values)
        labels = df_['labels'].unique()
        labels = ",".join(labels)
        df_meta.loc[ii,:] = [trajectory_id, start_time, end_time, v_ave, v_med, a_ave, a_med, labels]
    return df_meta



labels_file = 'labels.txt'
MAIN_FOLDER = '../GPSML/Data/'
traj_folder = 'Trajectory/'
directories = listdir(MAIN_FOLDER)

for directory in directories:
    print(directory)
    list_df_traj = []
    directory_ = MAIN_FOLDER + directory + '/'
    for traj_file in listdir(directory_ + traj_folder):
        trajectory_id = traj_file.split('.')[0]
        filename = directory_ + traj_folder + traj_file
        df_traj = load_trajectory_df(filename, trajectory_id)
        list_df_traj.append(df_traj)
    df_traj_all = pd.concat(list_df_traj)
    if labels_file in listdir(directory_):
        filename = directory_ + labels_file
        df_labels = load_labels_df(filename)
    for idx in df_labels.index.values:
        st = df_labels.ix[idx]['start_time']
        et = df_labels.ix[idx]['end_time']
        labels = df_labels.ix[idx]['labels']
        df_traj_all.loc[(df_traj_all['datetime'] >= st) & (df_traj_all['datetime'] <= et), 'labels'] = labels
        #print st, et, labels
        #df_meta[(df_meta['start_time'] == st) & (df_meta['end_time'] == et)]['labels'] = labels
    df_traj_all.to_csv(directory_ + directory + '.csv')
    df_metadata = retrieve_metadata(df_traj_all)
    df_metadata.to_csv(directory_ + directory + '_metadata.csv')

049


NameError: name 'df_labels' is not defined