# Appendix A: Demonstration of the Stop-Move-Noise (MSN) classification method

Import packages and general settings:

In [3]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from step import preprocessing as pp
from step import stats as st
from step import util, msn

import math
import numpy as np
from scipy import stats

import gpxpy

import ipympl
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
#%matplotlib notebook
%matplotlib ipympl
%matplotlib ipympl

matplotlib.rcParams['axes.labelsize'] = '10'
matplotlib.rcParams['ytick.labelsize'] = '10'
matplotlib.rcParams['xtick.labelsize'] = '10'
matplotlib.rcParams['axes.titlesize'] = '10'
matplotlib.rcParams['legend.edgecolor'] = 'k'
matplotlib.rcParams['legend.shadow'] = False
matplotlib.rcParams['legend.frameon'] = False
matplotlib.rcParams['figure.figsize'] = (7, 5)
sns.set_style('ticks', {"axes.xmargin": 0.2, "axes.ymargin": 0.2});

# For pretty printing
import warnings
warnings.simplefilter('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load one trajectory file in GPX format:

In [2]:
file_path = r'gpx/27031614-1583713025.gpx'
#file_path = r'gpx/49996348-1533217229.gpx'
gpx = gpxpy.parse(open(file_path, 'r'))

Compute movement attributes:

In [4]:
df = pp.compute_attributes(gpx)[1:]

# first 5 rows:
df.head()
np.median(df.distance)

11.09

Plot the trajectory:

In [5]:
fig = plt.figure()
ax = fig.add_subplot(111)

xx = df.longitude
yy = df.latitude

plt.xlabel('longitude')
plt.ylabel('latitude')
plt.plot(df.longitude, df.latitude, 'o:', ms=7)

central_latitude = sum(plt.axes().get_ylim())/2.
mercator_aspect_ratio = 1/math.cos(math.radians(central_latitude))
plt.axes().set_aspect(mercator_aspect_ratio, adjustable='datalim')
plt.tight_layout()

FigureCanvasNbAgg()

Relationship among distance, duration, and speed between each pair of sequential points:

In [6]:
sns.pairplot(df[['duration', 'distance', 'speed']]);
plt.tight_layout()

FigureCanvasNbAgg()

In [7]:
fig = plt.figure(figsize=(9.5,3))
plt.title('Evolution of duration, distance, and speed between each pair of points')
plt.plot(df['duration'], '-o', label='duration (s)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
plt.plot(df['speed'], '--s', label='speed (m/s)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
plt.plot(df['distance'], ':D', label='distance (m)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)

plt.gca().xaxis.zoom(-0.1)
plt.xlabel("time")

plt.legend(loc='best', frameon=True);
plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%H:%M:%S"));
plt.tight_layout()

FigureCanvasNbAgg()

In [8]:
df['z_duration'] = stats.zscore(df['duration'])
df['z_speed'] = stats.zscore(df['speed'])
df['z_distance'] = stats.zscore(df['distance'])

fig = plt.figure(figsize=(9.5,3))
plt.title('Evolution of duration, distance, and speed between each pair of points')
plt.plot(df['z_duration'], '-o', label='z duration', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
plt.plot(df['z_speed'], '--s', label='z speed', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
plt.plot(df['z_distance'], ':D', label='z distance', lw=2, ms=5, mew=.5, mec='k', alpha=.8)

plt.gca().xaxis.zoom(-0.1)
plt.xlabel("time")

plt.legend(loc='best', frameon=True);
plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%H:%M:%S"));
#plt.tight_layout()

FigureCanvasNbAgg()

In [9]:
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, sharex=True)
ax1.set_title('Evolution of duration, distance, and speed between each pair of points')
ax1.plot(df['distance'], '-o', label='distance (m)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
ax1.set_ylabel('distance (m)')

ax2.plot(df['duration'], '-o', label='duration (s)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
ax2.set_ylabel('duration (s)')

ax3.plot(df['speed'], '-o', label='speed (m/s)', lw=2, ms=5, mew=.5, mec='k', alpha=.8)
ax3.set_ylabel('speed (m/s)')

plt.gca().xaxis.zoom(-0.1)
plt.xlabel("time")

plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%H:%M:%S"));
plt.tight_layout()

FigureCanvasNbAgg()

## Parameters

- `distance_threshold`, `duration_threshold`, `speed_threshold`: value of absolute modified z-score at which higher values are considered as outliers

- `minimum_angle`: minimum angle in degrees

- `max_sequential_sharp_angles`: maximum number of sequential angles that can be below `minimum_angle`

- `jitter`: amount of random noise that can be added to each duration

In [10]:
distance_threshold = 3.5
duration_threshold = 5.0
speed_threshold = 3.5

minimum_angle = 45
max_sequential_sharp_angles = 1

jitter = 0.01

## Noise labeling

### Analysis of distance between points

Statistics about distance:

In [11]:
df['distance'].describe()

count    76.000000
mean     11.964737
std       5.449888
min       9.980000
25%      10.577500
50%      11.090000
75%      11.620000
max      54.940000
Name: distance, dtype: float64

Modified z-score computation:

In [12]:
mz_distance = st.modified_zscore(df['distance'])
long_distance_indexes = [i for i, mz in enumerate(mz_distance) 
                         if mz > distance_threshold]

Abnormal distance(s):

In [13]:
df.iloc[long_distance_indexes]

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance
2016-07-04 16:13:29,45.190644,5.762608,54.94,46,1.19,3.29,261,174.44,215,2016-07-04 16:13:29,30,5.254955,-1.089486,7.937927
2016-07-04 16:13:31,45.190608,5.762417,15.55,2,7.77,-0.77,255,177.92,213,2016-07-04 16:13:31,31,-0.633976,5.315761,0.662231
2016-07-04 16:17:16,45.187866,5.758729,28.62,12,2.38,0.2,213,177.05,216,2016-07-04 16:17:16,69,0.704418,0.068909,3.076381


Distribution of modified-zscore of distances:

In [14]:
plt.figure()
plt.title("Distribution of distances' z-scores")
plt.ylabel('frequency')

sns.distplot(mz_distance.as_matrix(), rug=True)

plt.xticks(list(plt.xticks()[0]) + [distance_threshold])
plt.axvline(distance_threshold, color='tomato', linestyle='--');

FigureCanvasNbAgg()

### Analysis of angles between points

In [15]:
sharp_angles = df[df['angle'] < minimum_angle][:-1]
sharp_angles_index = [df.index.get_loc(i) 
                          for i in sharp_angles.index]

diff = np.ediff1d(sharp_angles_index, np.inf)
mask = [True if d <= max_sequential_sharp_angles + 1 
            else False for d in diff]

outliers = []
for i, m in enumerate(mask):
    if m:
        outliers.append(sharp_angles_index[i])
        outliers.append(sharp_angles_index[i+1])
        
sharp_angle_intervals = util.get_interval_limits(outliers)

sharp_angle_indexes = []
for interval in sharp_angle_intervals:
    sharp_angle_indexes.extend(np.linspace(interval[0], 
                                           interval[1], 
                                           interval[1]-interval[0]+1, 
                                           dtype=int))

Sharp angles:

In [16]:
df.iloc[sharp_angle_indexes]

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance


In [17]:
plt.figure()

plt.title('Evolution of direction')
plt.ylabel('Angle (degrees)')

plt.plot(df['angle'][:-1], ':o')
plt.plot(df.iloc[sharp_angle_indexes]['angle'], 's', color='tomato')
plt.axhline(minimum_angle, color='tomato', ls='--')
plt.gca().xaxis.set_major_formatter(
    matplotlib.dates.DateFormatter("%H:%M:%S"));

FigureCanvasNbAgg()

### Remove outliers

In [18]:
noise_indexes = np.union1d(long_distance_indexes, sharp_angle_indexes)
noise_indexes = np.array(noise_indexes, dtype=int)
noise_indexes

array([29, 30, 68])

In [19]:
def get_interval_limits(array, tolerance=1):
    print(array)
    diff = np.ediff1d(array, np.inf)
    print(diff)
    mask = [True if d <= abs(tolerance+1) else False for d in diff]
    print(mask)
    # d <= 2 includes 'normal' observations between 2 'abnormal' observations

    start = None
    intervals = []
    for i in range(len(array)):
        print(array[i])
        if mask[i]:
            if start is None:
                start = array[i]
                continue
        else:
            end = array[i]
            if start is None:
                start = array[i]

            intervals.append([start, end])
            print(intervals)
            start, end = None, None
    
    # case there's still an opened interval:
    if any([start, end]) and not all([start, end]):
        print(start, end)
        intervals.append([start, start])

    return intervals

In [20]:
noise_intervals = get_interval_limits(noise_indexes)
noise_intervals

[29 30 68]
[                   1                   38 -9223372036854775808]
[True, False, True]
29
30
[[29, 30]]
68
68 None


[[29, 30], [68, 68]]

In [21]:
noise_indexes = np.array([])
print(noise_intervals)
for interval in noise_intervals:
    linspace = np.linspace(interval[0], interval[1], interval[1]-interval[0]+1)
    noise_indexes = np.concatenate((noise_indexes, linspace))

noise_indexes

[[29, 30], [68, 68]]


array([29., 30., 68.])

In [22]:
noise_indexes = np.array(np.sort(noise_indexes), dtype=int)
noise = df.iloc[noise_indexes]
noise_indexes

array([29, 30, 68])

In [23]:
noise = df.iloc[noise_indexes]
noise

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance
2016-07-04 16:13:29,45.190644,5.762608,54.94,46,1.19,3.29,261,174.44,215,2016-07-04 16:13:29,30,5.254955,-1.089486,7.937927
2016-07-04 16:13:31,45.190608,5.762417,15.55,2,7.77,-0.77,255,177.92,213,2016-07-04 16:13:31,31,-0.633976,5.315761,0.662231
2016-07-04 16:17:16,45.187866,5.758729,28.62,12,2.38,0.2,213,177.05,216,2016-07-04 16:17:16,69,0.704418,0.068909,3.076381


In [24]:
noise_indexes = np.union1d(long_distance_indexes, sharp_angle_indexes)
noise_indexes = np.array(noise_indexes, dtype=int)
noise_intervals = get_interval_limits(noise_indexes)

noise_indexes = np.array([])
for interval in noise_intervals:
    linspace = np.linspace(interval[0], interval[1], interval[1]-interval[0]+1)
    noise_indexes = np.concatenate((noise_indexes, linspace))
    
noise_indexes = np.array(np.sort(noise_indexes), dtype=int)
noise = df.iloc[noise_indexes]

df_clean = df.drop(df.index[noise_indexes])

original_len = len(df)
without_outliers_len = len(df_clean)

print('original:', original_len)
print('without outliers:', without_outliers_len)
print('{:.2f}% of points have been dropped'.format((1 - without_outliers_len/original_len)*100))

[29 30 68]
[                   1                   38 -9223372036854775808]
[True, False, True]
29
30
[[29, 30]]
68
68 None
original: 76
without outliers: 73
3.95% of points have been dropped


In [25]:
noise_indexes

array([29, 30, 68])

Show noisy points:

In [26]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.title('Noise points')

xx = df.longitude
yy = df.latitude
plt.plot(xx, yy, 'o:', ms=7, alpha=1)

xx = df.iloc[noise_indexes].longitude
yy = df.iloc[noise_indexes].latitude
plt.plot(xx, yy, 's', ms=9, alpha=.8, mew=.5, mec='k')

central_latitude = sum(plt.axes().get_ylim())/2.
mercator_aspect_ratio = 1/math.cos(math.radians(central_latitude))
plt.axes().set_aspect(mercator_aspect_ratio, adjustable='datalim')

FigureCanvasNbAgg()

## Stop classification

### Analysis of duration between points

Statistics about duration:

In [27]:
df_clean['duration'].describe()

count    73.000000
mean      6.191781
std       6.043034
min       2.000000
25%       4.000000
50%       5.000000
75%       6.000000
max      42.000000
Name: duration, dtype: float64

Modified z-score computation:

In [28]:
duration_with_jitter = pp.jitter(df_clean['duration'], jitter)
mz_duration = st.modified_zscore(duration_with_jitter)
long_duration_indexes = [i for i, mz in enumerate(mz_duration) if mz > duration_threshold]

Abnormal duration(s):

In [29]:
df_clean.iloc[long_duration_indexes]

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance
2016-07-04 16:11:38,45.190511,5.764833,10.05,42,0.24,0.28,298,177.37,216,2016-07-04 16:11:38,17,4.719597,-2.014256,-0.35367
2016-07-04 16:15:38,45.189882,5.760566,11.73,40,0.29,0.37,174,177.17,214,2016-07-04 16:15:38,47,4.451919,-1.965584,-0.043358


Distribution of modified-zscore of durations:

In [30]:
plt.figure()
plt.title("Distribution of durations' z-scores")
plt.ylabel('frequency')

sns.distplot(mz_duration.as_matrix(), rug=True)

plt.xticks(list(plt.xticks()[0]) + [duration_threshold])
plt.axvline(duration_threshold, color='tomato', linestyle='--');

FigureCanvasNbAgg()

### Analysis of speed

Statistics about speed:

In [31]:
df_clean['speed'].describe()

count    73.000000
mean      2.248767
std       0.823770
min       0.240000
25%       1.770000
50%       2.100000
75%       2.800000
max       6.230000
Name: speed, dtype: float64

Modified z-score computation:

In [32]:
mz_speed = st.modified_zscore(np.log(df_clean['speed']))
slow_speed_indexes = [i for i, mz in enumerate(mz_speed) if mz < -3.5]

Abnormal speed(s):

In [33]:
df_clean.iloc[slow_speed_indexes]

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance
2016-07-04 16:11:38,45.190511,5.764833,10.05,42,0.24,0.28,298,177.37,216,2016-07-04 16:11:38,17,4.719597,-2.014256,-0.35367
2016-07-04 16:15:38,45.189882,5.760566,11.73,40,0.29,0.37,174,177.17,214,2016-07-04 16:15:38,47,4.451919,-1.965584,-0.043358


Distribution of modified-zscore of speeds:

In [34]:
plt.figure()
plt.title("Distribution of speed's z-scores")
plt.ylabel('frequency')

sns.distplot(df['speed'].as_matrix(), rug=True)

plt.xticks(list(plt.xticks()[0]) + [-speed_threshold])
plt.axvline(-speed_threshold, color='tomato', linestyle='--');

FigureCanvasNbAgg()

In [35]:
plt.figure()
plt.title("Distribution of speed's log z-scores")
plt.ylabel('frequency')

sns.distplot(mz_speed.as_matrix(), rug=True)

plt.xticks(list(plt.xticks()[0]) + [-speed_threshold])
plt.axvline(-speed_threshold, color='tomato', linestyle='--');

FigureCanvasNbAgg()

### The intersection of long times and slow speed are probably stops

In [36]:
stops = df_clean.iloc[np.intersect1d(long_duration_indexes, slow_speed_indexes)]
stops

Unnamed: 0,latitude,longitude,distance,duration,speed,acceleration,heading,angle,elevation,timestamp,idx,z_duration,z_speed,z_distance
2016-07-04 16:11:38,45.190511,5.764833,10.05,42,0.24,0.28,298,177.37,216,2016-07-04 16:11:38,17,4.719597,-2.014256,-0.35367
2016-07-04 16:15:38,45.189882,5.760566,11.73,40,0.29,0.37,174,177.17,214,2016-07-04 16:15:38,47,4.451919,-1.965584,-0.043358


### Relation between duration and speed

In [37]:
x_label = 'modified z-score (duration)'
y_label = 'modified z-score (speed)'

x = mz_duration
y = mz_speed
X = np.stack((x, y), axis=-1)

fig = plt.figure()
plt.xlabel(x_label)
plt.ylabel(y_label)
ax = fig.add_subplot(111)
plt.scatter(X[:,0], X[:,1], s=50, alpha=.8, color='cornflowerblue', lw=1)
plt.axhline(-speed_threshold, color='tomato', linestyle='--')
plt.axvline(duration_threshold, color='tomato', linestyle='--')
plt.legend(frameon=True)

for i, element in enumerate(X):
    if x[i] > duration_threshold and y[i] < -speed_threshold:
        ax.annotate(df_clean.iloc[i]['idx'], xy=(element[0]+0.008, element[1]+0.001), color='black', alpha=.8)
        plt.scatter(element[0], element[1], marker='D', s=60, color='tomato')

FigureCanvasNbAgg()

No handles with labels found to put in legend.


### Visualizing the relation of duration, distance, and speed:

In [38]:
x_label = 'duration (s)'
y_label = 'distance (m) '

x = df['duration']
y = df['distance']
z = df['speed']

X = np.stack((x, y), axis=-1)

fig = plt.figure()
plt.xlabel(x_label)
plt.ylabel(y_label)
ax = fig.add_subplot(111)
plt.scatter(X[:,0], X[:,1], s=50, alpha=.8, color='cornflowerblue', lw=1, label='moves')

# plot noise
all_noise = np.array([])
for i in noise_indexes:
    element = X[i]
    ax.annotate(df.iloc[i]['idx'], xy=(element[0]+0.1, element[1]+0.1), color='black', alpha=.8)
    all_noise = np.append(all_noise, [element[0], element[1]])
    
if len(all_noise) > 0:
    all_noise = np.reshape(all_noise, (-1, 2))
    plt.scatter(all_noise[:,0], all_noise[:,1], marker='s', s=70, color='mediumseagreen', label='noise')

#plot stops
x = df_clean['duration']
y = df_clean['distance']
X = np.stack((x, y), axis=-1)

stop = np.array([])
for i, element in enumerate(X):
    if mz_duration[i] > duration_threshold and mz_speed[i] < -speed_threshold:
        ax.annotate(df_clean.iloc[i]['idx'], xy=(element[0]+0.1, element[1]+0.1), color='black', alpha=.8)
        stop = np.append(stop, [element[0], element[1]])

if len(stop) > 0:
    stop = np.reshape(stop, (-1, 2))
    plt.scatter(stop[:,0], stop[:,1], marker='D', s=70, color='tomato', label='stops')

plt.legend(frameon=True, loc='best');

plt.tight_layout()

FigureCanvasNbAgg()

Plot points with their classifications:

In [39]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.xlabel('longitude')
plt.ylabel('latitude')

xx = df.longitude
yy = df.latitude
plt.plot(xx, yy, 'o:', label='moves', color='cornflowerblue', ms=7, alpha=1)

xx = stops.longitude
yy = stops.latitude
plt.plot(xx, yy, 'D', label='stops', color='tomato', ms=8, alpha=1, mew=.5, mec='k')

xx = noise.longitude
yy = noise.latitude
plt.plot(xx, yy, 's', label='noise', color='mediumseagreen', ms=8, alpha=1, mew=.5, mec='k')

plt.legend(loc='best', frameon=True)

central_latitude = sum(plt.axes().get_ylim())/2.
mercator_aspect_ratio = 1/math.cos(math.radians(central_latitude))
plt.axes().set_aspect(mercator_aspect_ratio, adjustable='datalim')

plt.tight_layout()

FigureCanvasNbAgg()

In [None]:
%%timeit
msn.get_move_stop_noise(df)

In [None]:
msn.get_move_stop_noise(df)