In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt
import scipy.stats as stats
from geopy.distance import distance as geodist

%matplotlib inline


In [6]:
def _elapsed_seconds(row):
    """Return the absolute gap in seconds to the previous fix, or NaN if it is missing or zero."""
    delta = row['timestamp'] - row['prev_timestamp']
    if pd.isnull(delta):
        return np.nan
    seconds = abs(delta.total_seconds())
    # A zero gap (duplicate timestamp) would divide by zero, so treat it as undefined.
    return seconds if seconds != 0 else np.nan


def _ground_speed(row):
    """Return the horizontal speed in metres per second between the previous fix and this one."""
    seconds = _elapsed_seconds(row)
    prev_coords = (row['prev_latitude'], row['prev_longitude'])
    curr_coords = (row['latitude'], row['longitude'])
    if np.isnan(seconds) or any(np.isnan(value) for value in prev_coords + curr_coords):
        return np.nan
    return geodist(curr_coords, prev_coords).meters / seconds


def _rate_of_change(row, curr_col, prev_col, absolute=False):
    """Return (row[curr_col] - row[prev_col]) / elapsed seconds, optionally as a magnitude."""
    seconds = _elapsed_seconds(row)
    curr = row[curr_col]
    prev = row[prev_col]
    if np.isnan(seconds) or np.isnan(curr) or np.isnan(prev):
        return np.nan
    change = curr - prev
    if absolute:
        change = abs(change)
    return change / seconds


def _flatten_columns(frame):
    """Collapse (column, statistic) column pairs into single 'column_statistic' names."""
    frame.columns = ["_".join([str(top), str(bottom)]) for (top, bottom) in frame.columns.values]
    return frame


def location_make_dataframes(file_path, sampling_freq, rolling = False, sampling = True):
    """Derive motion features from one raw location log and write windowed statistics as CSV.

    The input is read as a header-less CSV with the columns ``timestamp``
    (format ``%d.%m.%Y_%H:%M:%S.%f``), ``accuracy``, ``altitude``, ``latitude``
    and ``longitude``. Rows are ordered by timestamp and, for each row, the
    following are computed against the previous row:

    * ``speed`` - geodesic distance in metres divided by elapsed seconds;
    * ``altitude_speed`` - absolute altitude change per second;
    * ``acc`` / ``altitude_acc`` - change of the respective speed per second.

    Mean, variance, median, skew, kurtosis (``scipy.stats.kurtosis``) and
    standard deviation of ``accuracy`` and the four derived columns are then
    aggregated twice: over consecutive bins of ``sampling_freq`` and over a
    trailing time window of ``sampling_freq`` ending at every row. Rows that
    contain any NaN statistic are dropped. The results are written to
    ``<cwd>/_datasets/<sampling_freq>/location_sampling_dataset_<id>.csv`` and
    ``.../location_rolling_dataset_<id>.csv``, where ``<id>`` is the text after
    the last underscore of the input file name, without its extension.

    Parameters
    ----------
    file_path : str
        Path of the raw location log.
    sampling_freq : str
        pandas offset alias (for example ``'30s'``) used both as bin width and
        as rolling window length; also used as the output sub-directory name.
    rolling, sampling : bool
        Accepted for backward compatibility. They are not consulted: both
        datasets are always produced, exactly as before.

    Returns
    -------
    None
    """
    df = pd.read_csv(file_path, index_col=False, header=None, low_memory=False,
                     names=['timestamp', 'accuracy', 'altitude', 'latitude', 'longitude'])

    # os.path.join keeps the layout portable; the previous hard-coded "\\" only
    # produced nested directories on Windows.
    out_dir = os.path.join(os.getcwd(), "_datasets", sampling_freq)
    os.makedirs(out_dir, exist_ok=True)

    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d.%m.%Y_%H:%M:%S.%f')

    df.index = pd.DatetimeIndex(df['timestamp'])
    df = df.sort_index()

    # Previous-row values, so every row can be compared with its predecessor.
    for column in ('latitude', 'longitude', 'timestamp', 'altitude'):
        df['prev_' + column] = df[column].shift(1)

    df['speed'] = df.apply(_ground_speed, axis=1)
    df['altitude_speed'] = df.apply(
        lambda row: _rate_of_change(row, 'altitude', 'prev_altitude', absolute=True), axis=1)

    df = df.drop(['prev_latitude', 'prev_longitude', 'prev_altitude'], axis=1)

    df['prev_speed'] = df['speed'].shift(1)
    df['prev_altitude_speed'] = df['altitude_speed'].shift(1)

    # Acceleration is (v_now - v_prev) / dt. The earlier expression
    # `curr_speed - prev_speed / time` divided only the previous speed.
    df['acc'] = df.apply(
        lambda row: _rate_of_change(row, 'speed', 'prev_speed'), axis=1)
    df['altitude_acc'] = df.apply(
        lambda row: _rate_of_change(row, 'altitude_speed', 'prev_altitude_speed'), axis=1)

    df = df.drop(['prev_altitude_speed', 'prev_speed', 'timestamp', 'prev_timestamp'], axis=1)

    # Must stay named `kurt`: pandas uses the function name as the column suffix.
    def kurt(col):
        return stats.kurtosis(col)

    common_funcs_list = ['mean', 'var', 'median', 'skew', kurt, 'std']
    feature_columns = ['accuracy', 'speed', 'altitude_speed', 'acc', 'altitude_acc']
    agg_dict = {column: common_funcs_list for column in feature_columns}

    # Use the whole suffix after the last underscore so that e.g. location_10
    # does not collide with location_1 (previously only the first character was kept).
    index = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

    # Fixed, non-overlapping bins. dropna() already removes every NaN, so the
    # fillna(0) that used to follow it had no effect and is omitted.
    df_sampling = _flatten_columns(df.groupby(pd.Grouper(freq=sampling_freq)).agg(agg_dict))
    df_sampling = df_sampling.dropna()
    df_sampling.to_csv(os.path.join(out_dir, "location_sampling_dataset_" + index + ".csv"))

    # Trailing time window ending at every row.
    df_rolling = df.rolling(sampling_freq, min_periods=1, center=False).agg(agg_dict)
    df_rolling = _flatten_columns(df_rolling.dropna())
    df_rolling.to_csv(os.path.join(out_dir, "location_rolling_dataset_" + index + ".csv"))

In [7]:
def location_pipeline(files, sampling_freq):
    """Pre-process every raw file, then build the feature datasets for each result.

    All files are passed through ``location_file_process`` first; only after
    that is ``location_make_dataframes`` run, once per result, on the value
    stored under its ``'BASE'`` key. That value and the window length are
    printed before each run.

    NOTE(review): ``location_file_process`` is not defined in the visible
    cells; it is presumed to return a mapping whose ``'BASE'`` entry is the
    path of the file to read - confirm against its definition.

    Parameters
    ----------
    files : iterable
        Raw input paths handed to ``location_file_process``.
    sampling_freq : str
        Window length forwarded to ``location_make_dataframes``.
    """
    processed = [location_file_process(raw_path) for raw_path in files]
    for entry in processed:
        base_path = entry['BASE']
        print(base_path, sampling_freq)
        location_make_dataframes(base_path, sampling_freq, True, True)

In [8]:
# Window lengths (pandas offset aliases) for which datasets are generated;
# each one is passed to location_pipeline as `sampling_freq`.
SAMPLING_FREQs = [
    '5s', '10s', '30s', '60s',
    '90s', '120s', '240s', '600s',
]

# Raw location logs to process. Un-comment an entry to include that file.
data_list = [
    # ".\\raw_data\\location_1.data",
    # ".\\raw_data\\location_2.data",
    # ".\\raw_data\\location_3.data",
    # ".\\raw_data\\location_4.data",
    # ".\\raw_data\\location_5.data",
    # ".\\raw_data\\location_6.data",
    # ".\\raw_data\\location_7.data",
    ".\\raw_data\\location_8.data",
]

In [9]:
# Run the full pipeline once per configured window length; each run prints
# the processed file path and the window length it is being built for.
for freq in SAMPLING_FREQs: 
    location_pipeline(data_list, freq)

D:\Dev\nir\_generated\location_8.data 5s
D:\Dev\nir\_generated\location_8.data 10s
D:\Dev\nir\_generated\location_8.data 30s
D:\Dev\nir\_generated\location_8.data 60s
D:\Dev\nir\_generated\location_8.data 90s
D:\Dev\nir\_generated\location_8.data 120s
D:\Dev\nir\_generated\location_8.data 240s
D:\Dev\nir\_generated\location_8.data 600s


In [10]:
# location_pipeline([".\\user_1\\wifi\\wifi_0.data"], SAMPLING_FREQ)

In [11]:
# location_file_process(".\\location_2.data")