In [None]:
import os
import csv
import yaml
import wandb
import pickle
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.utils import to_categorical
from keras.layers import Dense
from keras.models import Sequential
from src import (
    read_nz_file, read_jg_file, update_meta_data, split_df, aggregate_files, add_moving_window, add_moving_window_2,
    preprocess_data, save_preprocessing, one_hot_encode, preprocess_sequential, save_sequential_preprocessing)
from sklearn.model_selection import train_test_split
from wandb.keras import WandbCallback

processed data - intermediate step

> raw data

> clean data

> preprocess data : store it in DB (better compared to file format) 

    - data lake
    
    or
    
    - SQL-based DB model (nice to have, but not required if we decide to save the data as a Feather file)

> 

# 1. Load and clean raw data

Files from the SensorLog iOS app have over 70 columns in total and a precision of 12 decimal places. The output file is over 135 MB, which is too large for GitHub. GitHub restricts the file size, so these SensorLog files must be cleaned before they are uploaded.

Below is a summary of the steps, which are performed only for iOS files:

- read raw data as csv files
- remove unnecessary columns (captured in list 'remove_cols' below)
- round to 6 decimal places to reduce the size of files
- output dataframe as csv
- upload the csv to GitHub

# 2. Meta data

Information about the individual data files is captured in the `meta` dataframe below:
- file name
- user (nz or jg)
- activity (running/cycling/walking/sitting)
- pocket (which pocket the phone was in during the activity)
- position_x
- position_y


# 3. View data

Sensor activity data is captured from two different apps:
- SensorLog (iOS) by user 'nz'
- AndroSensor (Android) by user 'jg'

**Response**: 'Activity' with 4 classes: running/walking/cycling/sitting



In [None]:
# Run the project's metadata update step before reading the table.
# NOTE(review): presumably this (re)writes data/meta.csv — confirm in `src`.
update_meta_data()

# Load the per-file metadata table; the preprocessing cells take it as `meta`.
meta = pd.read_csv(filepath_or_buffer='data/meta.csv')

# Bare last expression so the notebook renders the frame.
meta

In [None]:
# Sequential preprocessing: build the train/test split for one configuration
# and persist it together with the settings that produced it.
#
# NOTE: a sweep over moving_window_size in [1, 2] and hz in [1, 2, 5, 10, 20]
# was sketched here as a commented-out loop; only the single configuration
# below is actually run.

settings = dict(
    MOVING_WINDOW_SIZE=2,
    HZ=5,
    STEP_SIZE=5,
    TEST_PROPORTION=0.5,
    AGGREGATION="sequential",
    FEATURES="min, max, EucDist",
)

# Output directory name encodes the sampling rate and the window length.
directory = f"sequential_index_{settings['HZ']}hz_{settings['MOVING_WINDOW_SIZE']}sec"
print(directory)

(
    X_train, X_test,
    y_train, y_test,
    train_indexes, test_indexes,
) = preprocess_sequential(
    moving_window_seconds=settings['MOVING_WINDOW_SIZE'],
    hz=settings['HZ'],
    step_size=settings['STEP_SIZE'],
    meta=meta,
    test_proportion=settings['TEST_PROPORTION'],
    select_train_files='index',
)

# Store the split, the train/test indexes and the settings under `directory`.
save_sequential_preprocessing(
    X_train, X_test,
    y_train, y_test,
    train_indexes, test_indexes,
    directory, settings,
)

In [None]:
# Non-sequential ("normal" aggregation) preprocessing: build the train/test
# split for one configuration and persist it together with its settings.
#
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the
# sequential preprocessing cell earlier in the notebook also assigns; whichever
# cell ran last determines what those names hold.

settings = {
    'MOVING_WINDOW_SIZE': 2,
    'HZ': 5,
    'STEP_SIZE': 5,
    'TEST_PROPORTION': 0.2,
    'AGGREGATION': "normal",
    'FEATURES': "min, max, EucDist (mean & std)",
}

# Derive the output directory from the settings rather than hard-coding
# "5hz_2sec", so the name cannot drift when HZ or MOVING_WINDOW_SIZE is edited.
# With the values above this evaluates to 'dimension_independent_5hz_2sec'.
directory = f"dimension_independent_{settings['HZ']}hz_{settings['MOVING_WINDOW_SIZE']}sec"

# Record the directory name in the saved settings (kept as the last key).
settings['PREPROCESSING'] = directory

X_train, X_test, y_train, y_test = preprocess_data(
    moving_window_seconds=settings['MOVING_WINDOW_SIZE'],
    hz=settings['HZ'],
    step_size=settings['STEP_SIZE'],
    meta=meta,
    test_proportion=settings['TEST_PROPORTION'],
    # Passed as a string, not the imported add_moving_window_2 callable —
    # presumably resolved by name inside preprocess_data; TODO confirm.
    agg_func='add_moving_window_2',
    # Other values noted by the author: 'all', 'index', 'position_x'.
    select_train_files='user',
)

# Store the split and the settings under `directory`.
save_preprocessing(X_train, X_test, y_train, y_test, directory, settings)