In [1]:
import pandas as pd
import numpy as np
import  re
from datetime import datetime as dt
import scipy.stats as stats
from geopy.distance import distance as geodist

%matplotlib inline

In [2]:
TIME_SAMPLE_FREQ = '500s'

In [3]:
df = pd.read_csv(".\\_events\\_generated\\valid_user_3\\location_0.data", sep=';', index_col = False, header = None, low_memory = False, \
                 names = ['timestamp', 'accuracy', 'altitude', 'latitude', 'longitude', 'user'])

In [4]:
df.head()

Unnamed: 0,timestamp,accuracy,altitude,latitude,longitude,user
0,2021-03-09 17:22:32.354,20000000,140691003,52610471,39594133,3
1,2021-03-09 17:22:39.013,20000000,140781288,52610472,39594132,3
2,2021-03-09 17:22:45.602,20000000,140641696,52610472,39594132,3
3,2021-03-09 17:22:51.809,163225006,140622933,52610472,39594104,3
4,2021-03-09 17:22:58.430,20000000,140576167,52610472,39594130,3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 6 columns):
timestamp    66 non-null object
accuracy     66 non-null object
altitude     66 non-null object
latitude     66 non-null object
longitude    66 non-null object
user         66 non-null int64
dtypes: int64(1), object(5)
memory usage: 3.2+ KB


In [6]:
df['timestamp'] = df['timestamp'].apply(lambda x: dt.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

In [7]:
df.index = pd.DatetimeIndex(df.timestamp)
df = df.sort_index()

In [8]:
VALID_USER = df.iloc[0]['user']
df['events_count'] = 1

In [9]:
df['accuracy'] = df['accuracy'].apply(lambda x: x.replace(',', '.'))
df['altitude'] = df['altitude'].apply(lambda x: x.replace(',', '.'))
df['latitude'] = df['latitude'].apply(lambda x: x.replace(',', '.'))
df['longitude'] = df['longitude'].apply(lambda x: x.replace(',', '.'))

In [10]:
df['accuracy'] = df['accuracy'].astype(float)
df['altitude'] = df['altitude'].astype(float)
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)

In [11]:
df['prev_latitude'] = df['latitude'].shift(1)
df['prev_longitude'] = df['longitude'].shift(1)
df['prev_timestamp'] = df['timestamp'].shift(1)
df['prev_altitude'] = df['altitude'].shift(1)

In [12]:
def get_speed(row):
    prev_coords = (row['prev_latitude'], row['prev_longitude'])
    curr_coords = (row['latitude'], row['longitude'])
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_coords[0]) or np.isnan(prev_coords[1]) or np.isnan(curr_coords[0]) or np.isnan(curr_coords[1]):
        return np.nan
    if time == 0:
        return np.nan
    return geodist(curr_coords, prev_coords).meters / time

def get_altitude_speed(row):
    prev = row['prev_altitude']
    curr = row['altitude']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev) or np.isnan(curr):
        return np.nan
    if time == 0:
        return np.nan
    return abs(curr - prev) / time

In [13]:
df['speed'] = df.apply(lambda row: get_speed(row), axis=1)

In [14]:
df['altitude_speed'] = df.apply(lambda row: get_altitude_speed(row), axis=1)

In [15]:
df = df.drop(['prev_latitude', 'prev_longitude', 'prev_altitude'], axis=1)

In [16]:
df['prev_speed'] = df['speed'].shift(1)
df['prev_altitude_speed'] = df['altitude_speed'].shift(1)

In [17]:
def get_acceleration(row):
    prev_speed = row['prev_speed']
    curr_speed = row['speed']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_speed) or np.isnan(curr_speed):
        return np.nan
    if time == 0:
        return np.nan
    return curr_speed - prev_speed / time

def get_altitude_acceleration(row):
    prev_speed = row['prev_altitude_speed']
    curr_speed = row['altitude_speed']
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    time = abs(delta.total_seconds())
    if np.isnan(prev_speed) or np.isnan(curr_speed):
        return np.nan
    if time == 0:
        return np.nan
    return curr_speed - prev_speed / time

In [18]:
df['acc'] = df.apply(lambda row: get_acceleration(row), axis=1)

In [19]:
df['altitude_acc'] = df.apply(lambda row: get_altitude_acceleration(row), axis=1)

In [20]:
df = df.drop(['prev_altitude_speed', 'prev_speed', 'timestamp', 'prev_timestamp'], axis=1)

In [21]:
def kurt(col):
    return stats.kurtosis(col)

def user_agg(col):
    if (col == VALID_USER).all():
        return 1
    else:
        return 0

common_funcs_list = ['mean', 'var', 'median', 'skew', kurt, 'std']

In [22]:
agg_dict = {
    'accuracy': common_funcs_list,
    'speed': common_funcs_list,
    'altitude_speed': common_funcs_list,
    'acc': common_funcs_list,
    'altitude_acc': common_funcs_list,
    'events_count': 'sum',
    'user': user_agg
}

In [23]:
df_sampling = df.groupby(pd.Grouper(freq = TIME_SAMPLE_FREQ)).agg(agg_dict)

In [24]:
df_sampling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                       for (high_level_name, low_level_name) in df_sampling.columns.values]

In [25]:
df_rolling = df.rolling(TIME_SAMPLE_FREQ, min_periods = 1, center = False).agg(agg_dict)

In [26]:
df_rolling.columns = ["_".join([str(high_level_name), str(low_level_name)]) \
                       for (high_level_name, low_level_name) in df_rolling.columns.values]

In [27]:
df_sampling = df_sampling.dropna()
df_sampling = df_sampling.fillna(0)

df_rolling = df_rolling.dropna()
df_rolling = df_rolling.fillna(0)

In [29]:
df_rolling

Unnamed: 0_level_0,accuracy_mean,accuracy_var,accuracy_median,accuracy_skew,accuracy_kurt,accuracy_std,speed_mean,speed_var,speed_median,speed_skew,...,acc_kurt,acc_std,altitude_acc_mean,altitude_acc_var,altitude_acc_median,altitude_acc_skew,altitude_acc_kurt,altitude_acc_std,events_count_sum,user_user_agg
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-09 17:31:00.085,48.7982,2476.580084,20.0,1.485702,0.854877,49.76525,0.539534,0.529792,0.177725,1.594326,...,0.58836,0.630301,0.009796,0.000273,0.0071,0.945751,1.643236,0.016514,25.0,1.0
2021-03-09 17:31:04.548,47.657731,2411.334303,20.0,1.545557,1.037414,49.105339,0.520293,0.518225,0.173537,1.649151,...,0.762054,0.623221,0.010874,0.000292,0.007766,0.802596,0.962764,0.017089,26.0,1.0
2021-03-09 17:31:05.086,46.618667,2347.741338,20.0,1.60357,1.223113,48.453497,0.529003,0.500342,0.177725,1.627979,...,0.740146,0.613655,0.008396,0.000447,0.0071,-0.530087,2.316029,0.021135,27.0,1.0
2021-03-09 17:39:40.662,52.017625,1652.594859,44.5685,0.798662,-0.607688,40.65212,0.090994,0.02854,0.03249,3.737961,...,6.630448,0.109427,0.021338,0.003201,0.008356,2.561734,4.872096,0.056576,24.0,0.0
2021-03-09 17:39:43.670,53.246583,1586.09106,44.5685,0.795787,-0.549018,39.825759,0.085169,0.028762,0.028709,3.798952,...,6.9784,0.107239,0.021265,0.003203,0.008356,2.562959,4.873797,0.056596,24.0,0.0
2021-03-09 17:39:50.675,49.360583,1371.856238,42.522002,0.988234,-0.00651,37.038578,0.078556,0.028796,0.020504,3.909847,...,7.572042,0.104395,0.021451,0.003197,0.008356,2.56043,4.871393,0.056542,24.0,0.0
2021-03-09 17:39:54.665,48.50104,1333.16593,41.450001,1.052548,0.12914,36.512545,0.075414,0.027843,0.015729,3.983355,...,7.945835,0.10224,0.022028,0.003072,0.00882,2.562326,5.052399,0.055427,25.0,0.0
2021-03-09 17:40:04.673,51.160083,1302.458561,44.5685,0.962831,0.064498,36.089591,0.037622,0.003454,0.000868,1.695738,...,3.473752,0.055131,0.014813,0.001921,0.008356,3.261625,10.446574,0.043831,24.0,0.0
2021-03-09 17:40:08.727,51.129375,1303.376812,44.5685,0.964139,0.063547,36.10231,0.032788,0.003224,0.0,2.021096,...,3.952347,0.053197,0.017891,0.001819,0.008893,3.391612,10.932177,0.042655,24.0,0.0
2021-03-09 17:40:12.688,49.85604,1289.603996,43.594002,1.019533,0.151256,35.911057,0.031477,0.003133,0.0,2.082196,...,4.285666,0.052347,0.017145,0.001758,0.00882,3.465135,11.519872,0.041923,25.0,0.0


In [26]:
df_sampling.to_csv(".\\_datasets\\location_sampling_dataset_4.csv")
df_rolling.to_csv(".\\_datasets\\location_rolling_dataset_4.csv")

In [30]:
"location_6.data"[-6:]

'6.data'

In [32]:
import os

In [34]:
os.listdir(".\\_events\\_generated\\valid_user_1")

['base_bt_0.data',
 'base_bt_1.data',
 'base_bt_2.data',
 'base_bt_3.data',
 'base_bt_4.data',
 'base_bt_5.data',
 'base_bt_6.data',
 'base_bt_7.data',
 'base_bt_8.data',
 'base_bt_9.data',
 'base_wifi_0.data',
 'base_wifi_1.data',
 'base_wifi_2.data',
 'base_wifi_3.data',
 'base_wifi_4.data',
 'base_wifi_5.data',
 'base_wifi_6.data',
 'base_wifi_7.data',
 'base_wifi_8.data',
 'base_wifi_9.data',
 'broadcasts_0.data',
 'broadcasts_1.data',
 'broadcasts_2.data',
 'broadcasts_3.data',
 'broadcasts_4.data',
 'broadcasts_5.data',
 'broadcasts_6.data',
 'broadcasts_7.data',
 'broadcasts_8.data',
 'broadcasts_9.data',
 'conn_wifi_0.data',
 'conn_wifi_1.data',
 'conn_wifi_2.data',
 'conn_wifi_3.data',
 'conn_wifi_4.data',
 'conn_wifi_5.data',
 'conn_wifi_6.data',
 'conn_wifi_7.data',
 'conn_wifi_8.data',
 'conn_wifi_9.data',
 'le_bt_0.data',
 'le_bt_1.data',
 'le_bt_2.data',
 'le_bt_3.data',
 'le_bt_4.data',
 'le_bt_5.data',
 'le_bt_6.data',
 'le_bt_7.data',
 'le_bt_8.data',
 'le_bt_9.data',
