In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
import scipy.stats as stats
from geopy.distance import distance as geodist

%matplotlib inline

In [2]:
TIME_SAMPLE_FREQ = '500s'

In [3]:
df = pd.read_csv(".\\_generated\\location_4.data", index_col = False, header = None, low_memory = False, \
                 names = ['timestamp', 'accuracy', 'altitude', 'latitude', 'longitude'])

In [4]:
df.head()

Unnamed: 0,timestamp,accuracy,altitude,latitude,longitude
0,08.12.2020_17:08:21.695,20.9,92.799995,53.275431,34.417051
1,08.12.2020_17:08:28.714,41.807999,92.799995,53.275418,34.416236
2,08.12.2020_17:08:29.353,35.148998,92.799995,53.275426,34.416732
3,08.12.2020_17:08:32.700,31.540001,92.799995,53.275429,34.4169
4,08.12.2020_17:08:36.186,23.089001,92.799995,53.275443,34.416841


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10635 entries, 0 to 10634
Data columns (total 5 columns):
timestamp    10635 non-null object
accuracy     10635 non-null float64
altitude     10635 non-null float64
latitude     10635 non-null float64
longitude    10635 non-null float64
dtypes: float64(4), object(1)
memory usage: 415.5+ KB


In [6]:
df['timestamp'] = df['timestamp'].apply(lambda x: dt.strptime(x, '%d.%m.%Y_%H:%M:%S.%f'))

In [7]:
df.index = pd.DatetimeIndex(df.timestamp)
df = df.sort_index()

In [8]:
df['prev_latitude'] = df['latitude'].shift(1)
df['prev_longitude'] = df['longitude'].shift(1)
df['prev_timestamp'] = df['timestamp'].shift(1)
df['prev_altitude'] = df['altitude'].shift(1)

In [9]:
def get_speed(row):
    prev_coords = (row['prev_latitude'], row['prev_longitude'])
    curr_coords = (row['latitude'], row['longitude'])
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_coords[0]) or np.isnan(prev_coords[1]) or np.isnan(curr_coords[0]) or np.isnan(curr_coords[1]):
        return np.nan
    if time == 0:
        return np.nan
    return geodist(curr_coords, prev_coords).meters / time

def get_altitude_speed(row):
    prev = row['prev_altitude']
    curr = row['altitude']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev) or np.isnan(curr):
        return np.nan
    if time == 0:
        return np.nan
    return abs(curr - prev) / time

In [10]:
df['speed'] = df.apply(lambda row: get_speed(row), axis=1)

In [11]:
df['altitude_speed'] = df.apply(lambda row: get_altitude_speed(row), axis=1)

In [12]:
df = df.drop(['prev_latitude', 'prev_longitude', 'prev_altitude'], axis=1)

In [13]:
df['prev_speed'] = df['speed'].shift(1)
df['prev_altitude_speed'] = df['altitude_speed'].shift(1)

In [14]:
def get_acceleration(row):
    prev_speed = row['prev_speed']
    curr_speed = row['speed']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_speed) or np.isnan(curr_speed):
        return np.nan
    if time == 0:
        return np.nan
    return curr_speed - prev_speed / time

def get_altitude_acceleration(row):
    prev_speed = row['prev_altitude_speed']
    curr_speed = row['altitude_speed']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_speed) or np.isnan(curr_speed):
        return np.nan
    if time == 0:
        return np.nan
    return curr_speed - prev_speed / time

In [15]:
df['acc'] = df.apply(lambda row: get_acceleration(row), axis=1)

In [16]:
df['altitude_acc'] = df.apply(lambda row: get_altitude_acceleration(row), axis=1)

In [17]:
df = df.drop(['prev_altitude_speed', 'prev_speed', 'timestamp', 'prev_timestamp'], axis=1)

In [18]:
df

Unnamed: 0_level_0,accuracy,altitude,latitude,longitude,speed,altitude_speed,acc,altitude_acc
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-12-06 18:50:42.465,28.100000,92.799995,53.275432,34.417052,,,,
2020-12-06 18:50:48.600,28.100000,92.799995,53.275461,34.417052,0.526072,0.000000,,
2020-12-06 18:50:51.246,13.936000,92.799995,53.275461,34.417087,0.882396,0.000000,0.683578,0.000000
2020-12-06 18:50:52.889,11.256000,92.799995,53.275438,34.417151,3.029778,0.000000,2.492714,0.000000
2020-12-06 18:50:57.860,11.792000,92.799995,53.275385,34.417190,1.296868,0.000000,0.687377,0.000000
2020-12-06 19:26:58.090,1200.000000,0.000000,53.275753,34.419882,0.085265,0.042958,0.084664,0.042958
2020-12-06 19:27:05.968,28.100000,92.799995,53.275432,34.417052,24.389022,11.779639,24.378198,11.774186
2020-12-06 19:27:11.877,28.944000,92.799995,53.275451,34.417055,0.359449,0.000000,-3.767987,-1.993508
2020-12-06 19:27:13.886,25.728001,92.799995,53.275457,34.417072,0.655074,0.000000,0.476154,0.000000
2020-12-06 19:27:18.938,19.296000,92.799995,53.275476,34.417184,1.536992,0.000000,1.407326,0.000000


In [19]:
def kurt(col):
    return stats.kurtosis(col)

common_funcs_list = ['mean', 'var', 'median', 'skew', kurt, 'std']

In [20]:
agg_dict = {
    'accuracy': common_funcs_list,
    'speed': common_funcs_list,
    'altitude_speed': common_funcs_list,
    'acc': common_funcs_list,
    'altitude_acc': common_funcs_list 
}

In [21]:
df_sampling = df.groupby(pd.Grouper(freq = TIME_SAMPLE_FREQ)).agg(agg_dict)

In [22]:
df_sampling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                       for (high_level_name, low_level_name) in df_sampling.columns.values]

In [23]:
df_rolling = df.rolling(TIME_SAMPLE_FREQ, min_periods = 1, center = False).agg(agg_dict)

In [24]:
df_rolling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                       for (high_level_name, low_level_name) in df_rolling.columns.values]

In [25]:
df_sampling = df_sampling.dropna()
df_sampling = df_sampling.fillna(0)

df_rolling = df_rolling.dropna()
df_rolling = df_rolling.fillna(0)

In [26]:
df_sampling.to_csv(".\\_datasets\\location_sampling_dataset_4.csv")
df_rolling.to_csv(".\\_datasets\\location_rolling_dataset_4.csv")