In [28]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
import scipy.stats as stats
from geopy.distance import distance as geodist

%matplotlib inline

In [29]:
# Window length shared by the fixed-bucket grouping and the rolling
# aggregation further down; handed to pandas as a frequency/offset string.
TIME_SAMPLE_FREQ = '500s'

In [30]:
# Load the raw location log: semicolon-separated, no header row, column names
# supplied explicitly. Every backslash in the path is escaped ("\l" is not a
# valid escape sequence and triggers a warning on recent Python versions); the
# resulting path string is unchanged.
df = pd.read_csv(
    "..\\..\\scripts\\_split_all\\user_1\\location.data",
    sep=';',
    index_col=False,
    header=None,
    low_memory=False,
    names=['timestamp', 'accuracy', 'altitude', 'latitude', 'longitude'],
)

In [31]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,timestamp,accuracy,altitude,latitude,longitude
0,08.03.2021_15:15:05.761,23177999,140760012,52610334,39594224
1,08.03.2021_15:15:10.717,19396000,140720483,52610474,39594158
2,08.03.2021_15:15:11.504,23584000,140708798,52610399,39594220
3,08.03.2021_15:15:14.474,19308001,140768349,52610435,39594193
4,08.03.2021_15:15:18.483,14046000,140809834,52610438,39594164


In [32]:
# Show column dtypes and non-null counts before any conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11118 entries, 0 to 11117
Data columns (total 5 columns):
timestamp    11118 non-null object
accuracy     11118 non-null object
altitude     11118 non-null object
latitude     11118 non-null object
longitude    11118 non-null object
dtypes: object(5)
memory usage: 434.4+ KB


In [33]:
# Parse the "DD.MM.YYYY_HH:MM:SS.ffffff"-style strings in one vectorised call
# instead of a per-row datetime.strptime through apply.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d.%m.%Y_%H:%M:%S.%f')

In [34]:
# Index the frame by timestamp (the column itself is kept) and order the rows
# chronologically.
df = df.set_index(pd.DatetimeIndex(df['timestamp'])).sort_index()

In [35]:
# Swap ',' for '.' in the four measurement columns (presumably a decimal comma
# in the raw file) so the strings can be converted to float afterwards.
# The vectorised .str accessor replaces four copy-pasted apply/lambda lines and
# passes missing values through instead of raising AttributeError on them.
for column in ['accuracy', 'altitude', 'latitude', 'longitude']:
    df[column] = df[column].str.replace(',', '.', regex=False)

In [36]:
# Convert the four measurement columns from strings to floats.
for column in ('accuracy', 'altitude', 'latitude', 'longitude'):
    df[column] = df[column].astype(float)

In [37]:
# Attach the previous row's values so each row carries both ends of a step.
for source in ('latitude', 'longitude', 'timestamp', 'altitude'):
    df['prev_' + source] = df[source].shift(periods=1)

In [38]:
def get_speed(row):
    """Return the horizontal speed between the previous and the current fix.

    The distance between (prev_latitude, prev_longitude) and
    (latitude, longitude) is computed with ``geodist`` in meters and divided
    by the absolute number of seconds between ``prev_timestamp`` and
    ``timestamp``.

    Parameters
    ----------
    row : mapping-like
        Must provide 'latitude', 'longitude', 'prev_latitude',
        'prev_longitude', 'timestamp' and 'prev_timestamp'.

    Returns
    -------
    float
        Speed in meters per second, or NaN when the time difference is
        missing or zero, or when any coordinate is NaN.
    """
    before = (row['prev_latitude'], row['prev_longitude'])
    here = (row['latitude'], row['longitude'])

    elapsed = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(elapsed):
        return np.nan
    seconds = abs(elapsed.total_seconds())

    # Coordinates are checked before the zero-interval test, in the order
    # previous point first, then current point.
    if any(np.isnan(value) for value in before + here) or seconds == 0:
        return np.nan
    return geodist(here, before).meters / seconds

def get_altitude_speed(row):
    """Return the absolute rate of altitude change between two consecutive rows.

    Parameters
    ----------
    row : mapping-like
        Must provide 'altitude', 'prev_altitude', 'timestamp' and
        'prev_timestamp'.

    Returns
    -------
    float
        ``abs(altitude - prev_altitude)`` divided by the absolute number of
        seconds between the two timestamps, or NaN when the time difference
        is missing or zero, or when either altitude is NaN.
    """
    previous_altitude = row['prev_altitude']
    current_altitude = row['altitude']

    elapsed = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(elapsed):
        return np.nan
    seconds = abs(elapsed.total_seconds())

    # Altitudes are checked before the zero-interval test.
    if np.isnan(previous_altitude) or np.isnan(current_altitude) or seconds == 0:
        return np.nan
    return abs(current_altitude - previous_altitude) / seconds

In [39]:
# Row-wise horizontal speed; get_speed is passed directly rather than through
# a wrapping lambda.
df['speed'] = df.apply(get_speed, axis=1)

In [40]:
# Row-wise vertical speed; get_altitude_speed is passed directly rather than
# through a wrapping lambda.
df['altitude_speed'] = df.apply(get_altitude_speed, axis=1)

In [41]:
# The lagged position/altitude helpers are no longer needed once both speeds
# have been computed.
df = df.drop(columns=['prev_latitude', 'prev_longitude', 'prev_altitude'])

In [42]:
# Lagged copies of the two speed columns (value taken from the previous row).
# NOTE(review): both columns are dropped again by the next drop() call without
# being read in between — they look like dead code; confirm before removing
# (the drop list must be adjusted at the same time).
df['prev_speed'] = df['speed'].shift(1)
df['prev_altitude_speed'] = df['altitude_speed'].shift(1)

In [43]:
# Remove the lagged speed columns and both timestamp columns; the timestamp
# is still available as the frame's index.
df = df.drop(
    columns=['prev_altitude_speed', 'prev_speed', 'timestamp', 'prev_timestamp']
)

In [44]:
def kurt(col):
    """Return the kurtosis of ``col`` as computed by ``scipy.stats.kurtosis``.

    The keyword arguments spell out scipy's defaults (Fisher definition,
    biased estimator, along axis 0).

    NOTE(review): scipy's default NaN handling differs from the pandas string
    aggregations used alongside this function — confirm that is intended.
    """
    return stats.kurtosis(col, axis=0, fisher=True, bias=True)


# Statistics computed for every aggregated column; pandas built-ins are given
# by name, kurtosis through the helper above.
common_funcs_list = [
    'mean',
    'var',
    'median',
    'skew',
    kurt,
    'std',
]

In [45]:
# Every feature column is summarised with the same list of statistics.
agg_dict = dict.fromkeys(
    ['accuracy', 'speed', 'altitude_speed'],
    common_funcs_list,
)

In [46]:
# Bucket the rows into TIME_SAMPLE_FREQ-wide groups on the datetime index and
# aggregate every bucket with agg_dict.
df_sampling = (
    df
    .groupby(pd.Grouper(freq=TIME_SAMPLE_FREQ))
    .agg(agg_dict)
)

In [47]:
# Flatten the two-level (column, statistic) labels into "column_statistic".
df_sampling.columns = [
    "_".join(map(str, name_pair))
    for name_pair in df_sampling.columns.values
]

In [48]:
# Trailing time-based window of TIME_SAMPLE_FREQ ending at each row; a single
# observation is enough to produce a value.
df_rolling = (
    df
    .rolling(window=TIME_SAMPLE_FREQ, min_periods=1, center=False)
    .agg(agg_dict)
)

In [49]:
# Flatten the two-level (column, statistic) labels into "column_statistic".
df_rolling.columns = [
    "_".join(map(str, name_pair))
    for name_pair in df_rolling.columns.values
]

In [50]:
# dropna() removes every row that contains at least one NaN, so a fillna(0)
# directly afterwards has nothing left to fill; the redundant calls are omitted.
# NOTE(review): if the intent was to keep partially filled windows and
# zero-fill them, use fillna(0) *instead of* dropna() — confirm.
df_sampling = df_sampling.dropna()

df_rolling = df_rolling.dropna()

In [26]:
# Persist both feature tables as CSV files in the local _datasets folder.
for frame, target in (
    (df_sampling, ".\\_datasets\\location_sampling_dataset_4.csv"),
    (df_rolling, ".\\_datasets\\location_rolling_dataset_4.csv"),
):
    frame.to_csv(target)