In [None]:
import os
import csv
import yaml
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from src import read_nz_file, read_jg_file, update_meta_data, split_df, aggregate_files, add_moving_window
from sklearn.model_selection import train_test_split
from wandb.keras import WandbCallback

processed data - intermediate step

> raw data

> clean data

> preprocess data : store it in DB (better compared to file format) 

    - data lake
    
    or
    
    - DB model based on SQL (nice to have, but not required if we decide to save the data as a Feather file)

> 

## 1. Load and clean raw data

Files from the SensorLog iOS app have over 70 columns in total and a precision of 12 decimal places. The output file is over 135 MB, which is too large for GitHub. Because GitHub restricts the file size, these SensorLog files must be cleaned.

Below is a summary of the steps, which are performed only for iOS files:

- read raw data as csv files
- remove unnecessary columns (captured in list 'remove_cols' below)
- round to 6 decimal places to reduce the size of files
- output dataframe as csv
- upload the csv on GitHub

# 2. Meta data

Information about the individual data files is captured in the meta dataframe below:
- file name
- user (nz or jg)
- activity (running/cycling/walking/sitting)
- pocket (which pocket the phone was in during the activity)
- position_x
- position_y


# 3. View data

Sensor activity data is captured from 2 different Apps:
- SensorLog (iOS) by user 'nz'
- AndroSensor (Android) by user 'jg'

**Response**: 'Activity' with 4 classes: running/walking/cycling/sitting



In [None]:
# Refresh the file metadata before it is loaded below.
# NOTE(review): presumably this (re)writes data/meta.csv — confirm in src.
update_meta_data()

In [None]:
# Load the per-file metadata table. `meta` is read as a module-level global by
# preprocess_sequential() and preprocess_data() (columns 'file', 'user', 'activity').
meta = pd.read_csv('data/meta.csv')
meta

In [None]:
def create_tensors(data, moving_window_seconds, hz, step_size):
    """Cut a labelled recording into fixed-length look-back windows.

    Every ``step_size``-th row, starting at row ``lookback`` and stopping before
    the last row, is used as an anchor. The window of an anchor consists of the
    ``lookback`` rows preceding it, subsampled to every ``step_size``-th row;
    the target is the label found at the anchor row itself.

    Args:
        data: DataFrame with a label column ``'y'``. All remaining columns are
            treated as features and are copied into a float array.
        moving_window_seconds: Length of the look-back window in seconds.
        hz: Number of rows per second in ``data``.
        step_size: Stride, in rows, between consecutive anchors and between
            the rows kept inside a window.

    Returns:
        Tuple ``(samples, targets)`` where ``samples`` is a float array of
        shape ``(n_windows, window_len, n_features)`` and ``targets`` is a
        list holding one label per window.
    """
    # split into labels and features, both as numpy arrays
    labels = data['y'].to_numpy()
    features = data.drop(columns=['y']).to_numpy()

    lookback = moving_window_seconds * hz

    # Anchor rows; the upper bound (len - 1) is exclusive, so the very last
    # row of the recording is never used as an anchor.
    anchors = np.arange(lookback, len(features) - 1, step_size)

    # Offsets of the rows kept inside one window, relative to the window start.
    # Sizing the buffer from these offsets (a ceiling division) rather than
    # int(lookback / step_size) keeps it in sync with the indexed rows when
    # lookback is not a multiple of step_size.
    offsets = np.arange(0, lookback, step_size)

    samples = np.zeros((len(anchors), len(offsets), features.shape[-1]))
    targets = []

    print(samples.shape)

    # fill one window and one target per anchor row
    for j, anchor in enumerate(anchors):
        samples[j] = features[anchor - lookback + offsets]
        targets.append(labels[anchor])

    return samples, targets

In [None]:
def concat_tensor(total, new_part):
    """Append ``new_part`` to the running array ``total`` along the first axis.

    ``total`` may be ``None`` (nothing accumulated yet), in which case
    ``new_part`` itself is returned unchanged.
    """
    # first chunk: nothing to concatenate with yet
    if total is None:
        return new_part

    return np.concatenate((total, new_part))

In [None]:
def preprocess_sequential(moving_window_seconds, hz, step_size, test_proportion = 0.2):
    """Build windowed train/test tensors from every recording listed in ``meta``.

    Reads the module-level ``meta`` frame (columns ``'file'``, ``'user'``,
    ``'activity'``), loads each file with the reader matching its user, splits
    it with ``split_df`` and turns every train/test piece into look-back
    windows via ``create_tensors``.

    Args:
        moving_window_seconds: Look-back window length in seconds.
        hz: Sampling rate passed on to ``split_df`` and ``create_tensors``.
        step_size: Row stride passed on to ``create_tensors``.
        test_proportion: Test share passed on to ``split_df``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The ``x_*`` values are
        the concatenated sample arrays (``None`` if nothing was processed),
        the ``y_*`` values are flat lists of labels.

    Raises:
        ValueError: If a row of ``meta`` names a user other than 'nz' or 'jg'.
    """
    x_train = None
    x_test = None
    y_train = []
    y_test = []

    for file, user, activity in zip(meta['file'], meta['user'], meta['activity']):
        if user == 'nz':
            df = read_nz_file(file, activity)
            df = df.drop(columns=['datetime'])

        elif user == 'jg':
            df = read_jg_file(file, activity)

        else:
            # Without this guard `df` would be unbound for the first file or,
            # worse, silently carry over the previous file's data.
            raise ValueError(
                f'Unknown user "{user}" for file "{file}"; expected "nz" or "jg"'
            )

        print(file, user, activity, df.shape)

        # TODO: create synthetic features

        # split into train-test
        my_train_files, my_test_files = split_df(
            df, hz = hz, test_proportion = test_proportion, moving_window_size = moving_window_seconds
        )

        # window every train/test pair and append it to the running totals
        for v_train, v_test in zip(my_train_files, my_test_files):
            x_train_next, y_train_next = create_tensors(v_train, moving_window_seconds, hz, step_size)
            x_test_next, y_test_next = create_tensors(v_test, moving_window_seconds, hz, step_size)

            x_train = concat_tensor(x_train, x_train_next)
            x_test = concat_tensor(x_test, x_test_next)
            y_train += y_train_next
            y_test += y_test_next

    return x_train, x_test, y_train, y_test

In [None]:
# Build the windowed tensors; arguments are named so the three 20s are not ambiguous.
X_train, X_test, y_train, y_test = preprocess_sequential(
    moving_window_seconds = 20,
    hz = 20,
    step_size = 20,
    test_proportion = 0.2,
)

In [None]:
# Sanity check: sample-array shape and label count for train, then test.
print(X_train.shape, len(y_train), X_test.shape, len(y_test), sep='\n')

In [None]:

def preprocess_data(moving_window_seconds, hz, step_size, agg_func ,test_proportion = 0.2):
    '''
    Build aggregated train/test feature tables from every recording in ``meta``.

    Reads the module-level ``meta`` frame (columns 'file', 'user', 'activity'),
    loads each file with the reader matching its user, splits it with
    ``split_df``, applies ``agg_func`` to every train/test piece and collects
    the results with ``aggregate_files``.

    moving_window_seconds: window length in seconds, passed to ``split_df`` and
        to ``agg_func`` (as ``seconds``)
    hz: sampling rate, passed to ``split_df`` and to ``agg_func`` (as ``hz_old_data``)
    step_size: passed to ``agg_func``
    agg_func: aggregate function to apply eg add_moving_window or add_moving_window_2;
        called as ``agg_func(df, hz_old_data=..., seconds=..., step_size=...)``
    test_proportion: test share passed to ``split_df``

    Returns ``(X_train, X_test, y_train, y_test)``; the X frames hold every
    column except 'y', the y frames are single-column DataFrames with 'y'.

    Raises ValueError if a row of ``meta`` names a user other than 'nz' or 'jg'.
    '''
    # create empty data frames
    train = pd.DataFrame()
    test = pd.DataFrame()

    for file, user, activity in zip(meta['file'], meta['user'], meta['activity']):
        if user == 'nz':
            df = read_nz_file(file, activity)

        elif user == 'jg':
            df = read_jg_file(file, activity)

        else:
            # Without this guard `df` would be unbound for the first file or,
            # worse, silently carry over the previous file's data.
            raise ValueError(
                f'Unknown user "{user}" for file "{file}"; expected "nz" or "jg"'
            )

        print(file, user, activity, df.shape)

        # TODO: create synthetic features

        # split into train-test
        my_train_files, my_test_files = split_df(
            df, hz = hz, test_proportion = test_proportion, moving_window_size = moving_window_seconds
        )

        # aggregate data points (try moving average) transform to mean, sd, ...
        # The aggregated frames replace the raw pieces in their lists.
        for i, (v_train, v_test) in enumerate(zip(my_train_files, my_test_files)):
            v_train = agg_func(
                v_train, hz_old_data = hz, seconds = moving_window_seconds, step_size = step_size
            )
            my_train_files[i] = v_train

            v_test = agg_func(
                v_test, hz_old_data = hz, seconds = moving_window_seconds, step_size = step_size
            )
            my_test_files[i] = v_test

        # append to train and test
        train = aggregate_files(my_train_files, train)
        test = aggregate_files(my_test_files, test)

    # X - y split for train and test data, shuffle data!?
    y_train = train['y'].to_frame()
    X_train = train.drop(columns=['y'])
    y_test = test['y'].to_frame()
    X_test = test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test



In [None]:
def save_preprocessing(X_train, X_test, y_train, y_test, folder: str, settings: dict):
    """Persist one preprocessing run under ``./tmp/<folder>``.

    Writes the four frames as parquet files (``X_train``, ``X_test``,
    ``y_train``, ``y_test``) and dumps ``settings`` to ``metadata.yaml`` in
    the same directory.

    Args:
        X_train, X_test, y_train, y_test: Objects exposing ``to_parquet``
            (DataFrames in this notebook).
        folder: Name of the sub-directory of ``./tmp`` to write into.
        settings: Configuration of the run; serialised with ``yaml.dump``.
    """
    target_dir = f'./tmp/{folder}'

    # makedirs also creates a missing ./tmp parent and tolerates an existing
    # directory, so this works on a fresh checkout and on re-runs alike.
    os.makedirs(target_dir, exist_ok=True)

    X_train.to_parquet(f'{target_dir}/X_train.parquet')
    X_test.to_parquet(f'{target_dir}/X_test.parquet')
    y_train.to_parquet(f'{target_dir}/y_train.parquet')
    y_test.to_parquet(f'{target_dir}/y_test.parquet')

    with open(f'{target_dir}/metadata.yaml', 'w') as file:
        yaml.dump(settings, file)

    print(f'Saved parquet files to "./tmp/{folder}"')

In [None]:
directory = 'euclid_20hz_20sec'

# Configuration of this preprocessing run; it is also written to
# metadata.yaml next to the parquet files by save_preprocessing().
settings = {
    'MOVING_WINDOW_SIZE': 20,
    'HZ': 20,
    'STEP_SIZE': 20,
    'TEST_PROPORTION': 0.2,
    'AGGREGATION': "normal",
    'FEATURES': "all (mean & std)",
    'PREPROCESSING': directory
}

X_train, X_test, y_train, y_test = preprocess_data(
    moving_window_seconds = settings['MOVING_WINDOW_SIZE'],
    hz = settings['HZ'],
    step_size = settings['STEP_SIZE'],
    # agg_func has no default; leaving it out raised a TypeError.
    # NOTE(review): add_moving_window assumed from AGGREGATION == "normal" — confirm.
    agg_func = add_moving_window,
    test_proportion = settings['TEST_PROPORTION']
)

save_preprocessing(X_train, X_test, y_train, y_test, directory, settings)