In [None]:
import os
import csv
import yaml
import wandb
import pickle
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from src import (
    read_nz_file, read_jg_file, update_meta_data, split_df, aggregate_files, add_moving_window, add_moving_window_2)
from sklearn.model_selection import train_test_split
from wandb.keras import WandbCallback

processed data - intermediate step

> raw data

> clean data

> preprocess data : store it in DB (better compared to file format) 

    - data lake
    
    or
    
    - DB model based on SQL (nice to have, but not required if we decide to save the data as a Feather file)

> 

## 1. Load and clean raw data

Files from the SensorLog iOS app have over 70 columns in total and a precision of 12 decimal places. The output file is over 135 MB, which is too large for GitHub. Because GitHub restricts the file size, these SensorLog files must be cleaned.

Below is a summary of the steps, which are performed only for iOS files:

- read raw data as csv files
- remove unnecessary columns (captured in list 'remove_cols' below)
- round to 6 decimal places to reduce the size of files
- output dataframe as csv
- upload the csv on GitHub

## 2. Meta data

Information about the individual data files is captured in the meta dataframe below:
- file name
- user (nz or jg)
- activity (running/cycling/walking/sitting)
- pocket (which pocket the phone was in during the activity)
- position_x
- position_y


## 3. View data

Sensor activity data is captured from 2 different Apps:
- SensorLog (iOS) by user 'nz'
- AndroSensor (Android) by user 'jg'

**Response**: 'Activity' with 4 classes: running/walking/cycling/sitting



In [None]:
# update_meta_data is imported from src; presumably it regenerates data/meta.csv
# before it is read here -- TODO confirm.
update_meta_data()

# `meta` is a notebook-level global: the preprocessing functions defined further
# down read its 'file', 'user', 'activity' and 'position_x' columns directly.
meta = pd.read_csv('data/meta.csv')
# bare last expression so the notebook renders the table
meta

In [None]:
# expects np array as input
def one_hot_encode(l):
    """One-hot encode activity labels with a fixed set of four classes.

    Labels are mapped to class codes sitting=0, walking=1, running=2,
    cycling=3. Values that are already numeric class codes (0-3) are accepted
    as they are. The input is read only; it is never modified.

    Args:
        l: 1-D array-like of activity labels.

    Returns:
        The result of ``to_categorical`` called with ``num_classes=4``. Fixing
        the number of classes keeps the encoding width identical for every
        split, even when one activity does not occur in ``l`` (or ``l`` is
        empty).

    Raises:
        ValueError: if a value is neither a known activity name nor a class
            code in the range 0-3.
    """
    class_codes = {'sitting': 0, 'walking': 1, 'running': 2, 'cycling': 3}
    n_classes = len(class_codes)

    codes = []
    for value in l:
        # np.str_ is a str subclass, so labels taken from numpy arrays match too
        code = class_codes.get(value, value) if isinstance(value, str) else value
        try:
            code = int(code)
        except (TypeError, ValueError):
            raise ValueError(
                f'Unknown activity label {value!r}; expected one of {sorted(class_codes)}'
            ) from None
        if not 0 <= code < n_classes:
            raise ValueError(
                f'Class code {code} is out of range; expected 0..{n_classes - 1}'
            )
        codes.append(code)

    return to_categorical(np.array(codes, dtype='int64'), num_classes=n_classes)

In [None]:
def _stack_with_sequence_indexes(segments, lookback, step_size):
    """Concatenate segments once and collect positional sequence indexes into the stacked frame."""
    indexes = []
    offset = 0  # number of rows stacked before the current segment
    for segment in segments:
        # candidate positions inside this segment: lookback, lookback + step_size, ...
        # stopping before the segment's last row
        rows = np.arange(lookback, len(segment) - 1, step_size)
        indexes += list(rows + offset)
        offset += len(segment)

    # a single concat instead of growing the frame inside the loop
    stacked = pd.concat(segments) if len(segments) > 0 else pd.DataFrame()
    return stacked, indexes


# create an aggregate of all files and output a list with the possible start positions within the dataframe. (lookback must always be contained within a single activity)
# nothing is written to disk here; pass the returned objects to save_sequential_preprocessing to persist them
def preprocess_sequential(moving_window_seconds, hz, step_size, test_proportion = 0.2, select_train_files = 'all', seed = None):
    """Build normalized train/test frames plus shuffled sequence indexes.

    Every file listed in the notebook-level ``meta`` dataframe (columns
    'file', 'user', 'activity', 'position_x') is read, split into train and
    test segments with ``split_df`` and stacked into one train and one test
    frame. For each segment the positions ``lookback, lookback + step_size, ...``
    (stopping before the segment's last row) are recorded, shifted by the
    number of rows stacked before that segment, so they are *positional*
    indexes into the stacked frame (the stacked frames keep the segments'
    own index labels).
    NOTE(review): positions start at ``lookback``, which suggests they mark
    where a window ends rather than where it starts -- confirm against the
    code that consumes the indexes.

    Args:
        moving_window_seconds: window length in seconds; ``lookback`` is
            ``hz * moving_window_seconds`` rows.
        hz: sampling rate, forwarded to ``split_df``.
        step_size: distance in rows between consecutive sequence positions.
        test_proportion: forwarded to ``split_df``.
        select_train_files: forwarded to ``split_df``.
        seed: optional seed for shuffling the index lists. ``None`` (default)
            uses the global ``random`` state, as before.

    Returns:
        ``(x_train, x_test, y_train, y_test, train_indexes, test_indexes)``.
        The x frames hold the feature columns standardized with the training
        mean and standard deviation, the y arrays are one-hot encoded labels,
        and the index lists are shuffled.

    Raises:
        ValueError: if ``meta`` contains a user other than 'nz' or 'jg'.
    """
    all_train_files = []
    all_test_files = []

    for index, (file, user, activity, position_x) in enumerate(
            zip(meta['file'], meta['user'], meta['activity'], meta['position_x'])):

        if user == 'nz':
            df = read_nz_file(file, activity)
            df = df.drop(columns=['datetime'])

        elif user == 'jg':
            df = read_jg_file(file, activity)

        else:
            # without this branch the previous file's df would silently be reused
            raise ValueError(f'Unknown user {user!r} for file {file!r}; expected "nz" or "jg"')

        print(file, user, activity, position_x, df.shape)

        # split into train-test
        my_train_files, my_test_files = split_df(
            df, hz = hz, test_proportion = test_proportion, moving_window_size = moving_window_seconds,
            select_train_files = select_train_files, user = user, position_x = position_x, index = index
        )

        all_train_files += my_train_files
        all_test_files += my_test_files

    print([len(i) for i in all_train_files])
    print([len(i) for i in all_test_files])

    lookback = hz * moving_window_seconds
    train, train_indexes = _stack_with_sequence_indexes(all_train_files, lookback, step_size)
    test, test_indexes = _stack_with_sequence_indexes(all_test_files, lookback, step_size)

    # split x and y
    cols = [
        'v_accelerometer',
        'v_gyroscope',
        'v_magnetometer',
        'v_gravity',
        'v_orientation',
        'min_accelerometer',
        'max_accelerometer',
        'min_gyroscope',
        'max_gyroscope',
        'min_magnetometer',
        'max_magnetometer',
        'min_gravity',
        'max_gravity',
        'min_orientation',
        'max_orientation'
    ]

    y_train = train['y']
    x_train = train[cols]

    y_test = test['y']
    x_test = test[cols]

    # normalize data by training aggregates (new frames instead of in-place
    # arithmetic on a column selection)
    mean = x_train.mean(axis=0)
    x_train = x_train - mean
    x_test = x_test - mean
    std = x_train.std(axis=0)
    # a constant training column has std 0; divide by 1 instead of producing NaN/inf
    std = std.replace(0, 1)
    x_train = x_train / std
    x_test = x_test / std

    # one hot encode labels
    y_train = one_hot_encode(np.array(y_train))
    y_test = one_hot_encode(np.array(y_test))

    # shuffle the list of indexes
    if seed is None:
        random.shuffle(train_indexes)
        random.shuffle(test_indexes)
    else:
        rng = random.Random(seed)
        rng.shuffle(train_indexes)
        rng.shuffle(test_indexes)

    return x_train, x_test, y_train, y_test, train_indexes, test_indexes

In [None]:
def save_sequential_preprocessing(
        X_train, X_test, y_train, y_test, train_indexes, test_indexes, folder: str, settings: dict):
    """Pickle the sequential preprocessing outputs into ``./tmp/<folder>``.

    Writes ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``train_indexes``
    and ``test_indexes`` as ``<name>.pickle`` and dumps ``settings`` to
    ``metadata.yaml`` in the same folder. Paths are relative to the current
    working directory; existing files are overwritten.

    Args:
        X_train, X_test, y_train, y_test, train_indexes, test_indexes:
            picklable objects to store.
        folder: name of the sub-folder below ``./tmp``.
        settings: the settings used for preprocessing (a dict in this
            notebook), stored as YAML.
    """
    # makedirs also creates ./tmp itself and does not fail if the folder exists
    os.makedirs(f'./tmp/{folder}', exist_ok=True)

    artifacts = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'train_indexes': train_indexes,
        'test_indexes': test_indexes,
    }
    for name, artifact in artifacts.items():
        with open(f'tmp/{folder}/{name}.pickle', 'wb') as f:
            pickle.dump(artifact, f)

    with open(rf'./tmp/{folder}/metadata.yaml', 'w') as file:
        yaml.dump(settings, file)

    print(f'Saved files to "./tmp/{folder}"')

In [None]:
# To sweep several configurations, wrap this cell in loops over
# MOVING_WINDOW_SIZE (e.g. 1, 2) and HZ (e.g. 1, 2, 5, 10, 20).

settings = {
    'MOVING_WINDOW_SIZE': 2,
    'HZ': 5,
    'STEP_SIZE': 5,
    'TEST_PROPORTION': 0.5,
    'AGGREGATION': "sequential",
    'FEATURES': "min, max, EucDist"
}

# the output folder name is derived from the settings so the two cannot drift apart
directory = f"sequential_index_{settings['HZ']}hz_{settings['MOVING_WINDOW_SIZE']}sec"
# record the folder in the metadata as well, like the non-sequential run does
settings['PREPROCESSING'] = directory
print(directory)

X_train, X_test, y_train, y_test, train_indexes, test_indexes = preprocess_sequential(
    moving_window_seconds = settings['MOVING_WINDOW_SIZE'],
    hz = settings['HZ'],
    step_size = settings['STEP_SIZE'],
    test_proportion = settings['TEST_PROPORTION'],
    select_train_files = 'index'
)

save_sequential_preprocessing(X_train, X_test, y_train, y_test, train_indexes, test_indexes, directory, settings)

In [None]:
def _aggregate_and_stack(segments, aggregate, hz, moving_window_seconds, step_size):
    """Apply ``aggregate`` (if given) to every segment and concatenate the results once."""
    if aggregate is not None:
        segments = [
            aggregate(segment, hz_old_data = hz, seconds = moving_window_seconds, step_size = step_size)
            for segment in segments
        ]
    # a single concat instead of growing the frame inside a loop
    return pd.concat(segments) if len(segments) > 0 else pd.DataFrame()


def preprocess_data(moving_window_seconds, hz, step_size, agg_func, test_proportion = 0.2, select_train_files = 'all'):
    '''
    Read every file listed in the notebook-level ``meta`` dataframe, split it
    into train/test segments with ``split_df``, aggregate each segment and
    return the stacked features and labels.

    moving_window_seconds: window length in seconds, forwarded to ``split_df``
        and to the aggregate function (as ``seconds``)
    hz: sampling rate, forwarded to ``split_df`` and to the aggregate function
        (as ``hz_old_data``)
    step_size: forwarded to the aggregate function
    agg_func: aggregate function to apply. Either the name 'add_moving_window'
        or 'add_moving_window_2', a callable accepting
        ``(df, hz_old_data=..., seconds=..., step_size=...)``, or None to
        stack the segments without aggregation
    test_proportion: forwarded to ``split_df``
    select_train_files: forwarded to ``split_df``

    Returns ``X_train, X_test, y_train, y_test``: X is the stacked frame
    without the 'y' column, y is a one-column dataframe holding 'y'. Rows are
    not shuffled.

    Raises ValueError for an unrecognised ``agg_func`` or for a user in
    ``meta`` other than 'nz' or 'jg'.
    '''
    # resolve the aggregate function first so a misspelled name fails before
    # any file is read, instead of silently skipping the aggregation
    aggregators = {
        'add_moving_window': add_moving_window,
        'add_moving_window_2': add_moving_window_2,
    }
    if agg_func is None or callable(agg_func):
        aggregate = agg_func
    elif isinstance(agg_func, str) and agg_func in aggregators:
        aggregate = aggregators[agg_func]
    else:
        raise ValueError(
            f'Unknown agg_func {agg_func!r}; expected one of {sorted(aggregators)}, a callable or None'
        )

    all_train_files = []
    all_test_files = []

    for index, (file, user, activity, position_x) in enumerate(
            zip(meta['file'], meta['user'], meta['activity'], meta['position_x'])):
        if user == 'nz':
            df = read_nz_file(file, activity)

        elif user == 'jg':
            df = read_jg_file(file, activity)

        else:
            # without this branch the previous file's df would silently be reused
            raise ValueError(f'Unknown user {user!r} for file {file!r}; expected "nz" or "jg"')

        print(file, user, activity, position_x, df.shape)

        # split into train-test
        my_train_files, my_test_files = split_df(
            df, hz = hz, test_proportion = test_proportion, moving_window_size = moving_window_seconds,
            select_train_files = select_train_files, user = user, position_x = position_x, index = index
        )

        print(f'Train: {len(my_train_files)}, Test: {len(my_test_files)}')

        all_train_files += my_train_files
        all_test_files += my_test_files

    print(f'Train: {[len(i) for i in all_train_files]}')
    print(f'Test: {[len(i) for i in all_test_files]}')

    # aggregate every file in the training and in the test set
    train = _aggregate_and_stack(all_train_files, aggregate, hz, moving_window_seconds, step_size)
    test = _aggregate_and_stack(all_test_files, aggregate, hz, moving_window_seconds, step_size)

    print(f'Train length: {len(train)}')
    print(f'Test length: {len(test)}')

    # X - y split for train and test data (no shuffling is done here)
    y_train = train['y'].to_frame()
    X_train = train.drop(columns=['y'])
    y_test = test['y'].to_frame()
    X_test = test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test

In [None]:
def save_preprocessing(X_train, X_test, y_train, y_test, folder: str, settings: dict, file_type: str = 'parquet'):
    """Store the train/test splits and their settings in ``./tmp/<folder>``.

    Writes ``X_train``, ``X_test``, ``y_train`` and ``y_test`` as
    ``<name>.<file_type>`` and dumps ``settings`` to ``metadata.yaml`` in the
    same folder. Paths are relative to the current working directory;
    existing files are overwritten.

    Args:
        X_train, X_test, y_train, y_test: data to store. For 'parquet' each
            object must provide ``to_parquet``; for 'pickle' it must be
            picklable.
        folder: name of the sub-folder below ``./tmp``.
        settings: the settings used for preprocessing (a dict in this
            notebook), stored as YAML.
        file_type: 'parquet' (default) or 'pickle'.

    Raises:
        ValueError: if ``file_type`` is not supported. Raised before anything
            is written, so a typo cannot produce a folder that holds only
            metadata while reporting success.
    """
    if file_type not in ('parquet', 'pickle'):
        raise ValueError(f"Unsupported file_type {file_type!r}; expected 'parquet' or 'pickle'")

    # makedirs also creates ./tmp itself and does not fail if the folder exists
    os.makedirs(f'./tmp/{folder}', exist_ok=True)

    datasets = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
    for name, data in datasets.items():
        path = f'tmp/{folder}/{name}.{file_type}'
        if file_type == 'parquet':
            data.to_parquet(path)
        else:
            with open(path, 'wb') as f:
                pickle.dump(data, f)

    with open(rf'./tmp/{folder}/metadata.yaml', 'w') as file:
        yaml.dump(settings, file)

    print(f'Saved files to "./tmp/{folder}"')

In [None]:
settings = {
    'MOVING_WINDOW_SIZE': 2,
    'HZ': 5,
    'STEP_SIZE': 5,
    'TEST_PROPORTION': 0.2,
    'AGGREGATION': "normal",
    'FEATURES': "min, max, EucDist (mean & std)"
}

# derive the output folder from the settings (as the sequential run does) so the
# name cannot drift from the values actually used; the folder is also recorded
# in the metadata
directory = f"dimension_independent_{settings['HZ']}hz_{settings['MOVING_WINDOW_SIZE']}sec"
settings['PREPROCESSING'] = directory

X_train, X_test, y_train, y_test = preprocess_data(
    moving_window_seconds = settings['MOVING_WINDOW_SIZE'],
    hz = settings['HZ'],
    step_size = settings['STEP_SIZE'],
    test_proportion = settings['TEST_PROPORTION'],
    agg_func = 'add_moving_window_2',
    select_train_files='user' # 'all', 'index', 'position_x'
)

save_preprocessing(X_train, X_test, y_train, y_test, directory, settings)