In [None]:
#pip install plotly

In [None]:
import csv
import numpy as np
import pandas as pd

import glob
import os
import time
import re

import seaborn as sns
import matplotlib.pyplot as plt


processed data - intermediate step

> raw data

> clean data

> preprocess data : store it in DB (better compared to file format) 

    - data lake
    
    or
    
    - SQL-based DB model (nice to have, but not required if we decide to save the data as a Feather file)

> 

## 1. Load and clean raw data

Files from the SensorLog iOS app have over 70 columns in total and a precision of 12 decimal places. The output file is over 135 MB, which is too large for GitHub. Because GitHub restricts the file size, the files from SensorLog must be cleaned before they are uploaded.

Below is a summary of the steps, which are applied only to the iOS files:

- read raw data as csv files
- remove unnecessary columns (captured in list 'remove_cols' below)
- round to 6 decimal places to reduce the size of files
- output dataframe as csv
- upload the csv on GitHub

# 2. Meta data

The data on different data files is captured in meta dataframe below:
- file name
- user (nz or jg)
- activity (running/cycling/walking/sitting)
- pocket (which pocket the phone was in during the activity)
- position_x
- position_y


In [None]:
# Metadata for every recording, written as one record per file:
# (file, user, activity, pocket, position_x, position_y)
meta = pd.DataFrame(
    [
        ('walking_jg_1.csv', 'jg', 'walking', 'left pant pocket', 'screen towards body', 'upright'),
        ('walking_jg_2.csv', 'jg', 'walking', 'left pant pocket', 'screen towards body', 'upside down'),
        ('walking_jg_3.csv', 'jg', 'walking', 'left pant pocket', 'screen towards body', 'upside down'),
        ('running_jg_1.csv', 'jg', 'running', 'left pant pocket', 'screen towards body', 'upside down'),
        ('running_jg_2.csv', 'jg', 'running', 'left pant pocket', 'screen towards body', 'upright'),
        # added NZ 20221006
        ('running_nz_1.csv', 'nz', 'running', 'left jacket pocket', 'screen not towards body', 'upright'),
        ('walking_nz_1.csv', 'nz', 'walking', 'left jacket pocket', 'screen not towards body', 'upright'),
        ('walking_nz_2.csv', 'nz', 'walking', 'left jacket pocket', 'screen towards body', 'upright'),
        # added JG 20221006
        ('sitting_jg_1.csv', 'jg', 'sitting', 'left pant pocket', 'screen towards body', 'upside down'),
        ('sitting_jg_2.csv', 'jg', 'sitting', 'left pant pocket', 'screen not towards body', 'upside down'),
        # added NZ 20221010
        ('sitting_nz_3.csv', 'nz', 'sitting', 'left pant pocket', 'screen not towards body', 'upright'),
        ('walking_nz_4.csv', 'nz', 'walking', 'left pant pocket', 'screen towards body', 'upside down'),
        ('running_nz_3.csv', 'nz', 'running', 'left pant pocket', 'screen not towards body', 'upright'),
        # added JG 20221011
        ('running_jg_3.csv', 'jg', 'running', 'left pant pocket', 'screen not towards body', 'upside down'),
        ('running_jg_4.csv', 'jg', 'running', 'left pant pocket', 'screen not towards body', 'upright'),
        ('running_jg_5.csv', 'jg', 'running', 'left pant pocket', 'screen not towards body', 'upright'),
        # added NZ 20221012
        ('cycling_nz_3.csv', 'nz', 'cycling', 'left pant pocket', 'screen not towards body', 'upright'),
        # added JG 20221012
        ('cycling_jg_1.csv', 'jg', 'cycling', 'left pant pocket', 'screen towards body', 'upside down'),
    ],
    columns=['file', 'user', 'activity', 'pocket', 'position_x', 'position_y'],
)

# Display the metadata table as the cell output.
meta

# 3. View data

Sensor activity data is captured from 2 different Apps:
- SensorLog (iOS) by user 'nz'
- AndroSensor (Android) by user 'jg'

**Response**: 'Activity' with 4 classes: running/walking/cycling/sitting



In [None]:
def read_jg_file(file: str, activity: str, trim_rows: int = 1200, data_dir: str = 'data') -> pd.DataFrame:
    """Load one recording made by user 'jg' and harmonise its columns.

    The file is read as a ';'-separated CSV whose column names are on the
    second line. The first and last ``trim_rows`` rows are discarded, the
    m/s² and degree channels are converted to G and radians, and the columns
    are renamed to the same names that ``read_nz_file`` produces.

    Parameters
    ----------
    file : str
        File name inside ``data_dir``.
    activity : str
        Label written to the ``'y'`` column of every row.
    trim_rows : int, default 1200
        Number of rows dropped from the start and from the end.
    data_dir : str, default 'data'
        Directory that contains ``file``.

    Returns
    -------
    pd.DataFrame
        Columns: datetime, accelerometer/gyroscope/magnetometer/gravity X-Y-Z,
        orientation Z-Y-X, ``time_since_start(ms)`` and ``y``. The frame is
        empty when the file has no more than ``2 * trim_rows`` rows. The
        original row index is kept.

    Raises
    ------
    ValueError
        If ``trim_rows`` is negative.
    KeyError
        If an expected source column is missing from the file.
    """
    if trim_rows < 0:
        raise ValueError(f'trim_rows must be >= 0, got {trim_rows}')

    standard_gravity = 9.80665  # m/s² per G

    raw = pd.read_csv(f'{data_dir}/{file}', sep=';', header=1)

    # Explicit bounds instead of raw[trim:-trim]: a "-0" stop would select nothing.
    # .copy() makes the trimmed frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    stop = max(len(raw) - trim_rows, trim_rows)
    df = raw.iloc[trim_rows:stop].copy()

    # NOTE(review): this timestamp stays timezone-naive, whereas read_nz_file
    # returns UTC-aware datetimes - confirm before comparing the two.
    df['datetime'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'], format='%Y-%m-%d %H:%M:%S:%f')

    # activity label
    df['y'] = activity

    # m/s² -> G. The negative divisor also flips the sign
    # (presumably to match the iOS axis convention - TODO confirm).
    for axis in 'XYZ':
        df[f'gravity_{axis}(G)'] = df[f'GRAVITY {axis} (m/s²)'] / -standard_gravity
        df[f'accelerometer_{axis}(G)'] = df[f'ACCELEROMETER {axis} (m/s²)'] / -standard_gravity

    # degrees -> radians (pi / 180); the previous "/ 60" was not a valid conversion.
    orientation_sources = {
        'X': 'ORIENTATION X (pitch °)',
        'Y': 'ORIENTATION Y (roll °)',
        'Z': 'ORIENTATION Z (azimuth °)',
    }
    for axis, source in orientation_sources.items():
        df[f'orientation_{axis}(rad)'] = np.deg2rad(df[source])

    # NOTE(review): the magnetometer keys are identical to the iOS names used
    # in read_nz_file; confirm the Android export really uses them. rename()
    # ignores missing keys, so a mismatch surfaces as a KeyError at the column
    # selection below.
    my_cols = {
        'GYROSCOPE X (rad/s)': 'gyroscope_X(rad/s)',
        'GYROSCOPE Y (rad/s)': 'gyroscope_Y(rad/s)',
        'GYROSCOPE Z (rad/s)': 'gyroscope_Z(rad/s)',
        'magnetometerX(µT)': 'magnetometer_X(microT)',
        'magnetometerY(µT)': 'magnetometer_Y(microT)',
        'magnetometerZ(µT)': 'magnetometer_Z(microT)',
        'Time since start in ms ': 'time_since_start(ms)',
    }

    # rename columns
    df = df.rename(columns=my_cols)

    # Keep the derived columns too (they used to be computed and then dropped),
    # so the output carries the same sensor columns as read_nz_file.
    cols = [
        'datetime',
        'accelerometer_X(G)', 'accelerometer_Y(G)', 'accelerometer_Z(G)',
        'gyroscope_X(rad/s)', 'gyroscope_Y(rad/s)', 'gyroscope_Z(rad/s)',
        'magnetometer_X(microT)', 'magnetometer_Y(microT)', 'magnetometer_Z(microT)',
        'gravity_X(G)', 'gravity_Y(G)', 'gravity_Z(G)',
        'orientation_Z(rad)', 'orientation_Y(rad)', 'orientation_X(rad)',
        'time_since_start(ms)',
        'y',
    ]

    return df[cols]

In [None]:
def read_nz_file(file: str, activity: str, trim_rows: int = 1200, data_dir: str = 'data') -> pd.DataFrame:
    """Load one recording made by user 'nz' and harmonise its columns.

    The file is read as a comma-separated CSV. The first and last
    ``trim_rows`` rows are discarded, the logging timestamp is parsed as a
    UTC-aware datetime, and the sensor columns are renamed to the names
    shared with ``read_jg_file``.

    Parameters
    ----------
    file : str
        File name inside ``data_dir``.
    activity : str
        Label written to the ``'y'`` column of every row.
    trim_rows : int, default 1200
        Number of rows dropped from the start and from the end.
    data_dir : str, default 'data'
        Directory that contains ``file``.

    Returns
    -------
    pd.DataFrame
        Columns: datetime, accelerometer/gyroscope/magnetometer/gravity X-Y-Z,
        orientation Z-Y-X and ``y``. The frame is empty when the file has no
        more than ``2 * trim_rows`` rows. The original row index is kept.

    Raises
    ------
    ValueError
        If ``trim_rows`` is negative.
    KeyError
        If an expected source column is missing from the file.
    """
    if trim_rows < 0:
        raise ValueError(f'trim_rows must be >= 0, got {trim_rows}')

    raw = pd.read_csv(f'{data_dir}/{file}')

    # Explicit bounds instead of raw[trim:-trim]: a "-0" stop would select nothing.
    # .copy() makes the trimmed frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    stop = max(len(raw) - trim_rows, trim_rows)
    df = raw.iloc[trim_rows:stop].copy()

    # Parse after loading instead of via read_csv(date_parser=...), which is
    # deprecated in pandas 2.x; utc=True normalises the timezone offsets.
    df['loggingTime(txt)'] = pd.to_datetime(df['loggingTime(txt)'], utc=True)

    # activity label
    df['y'] = activity

    my_cols = {
        'loggingTime(txt)': 'datetime',
        'accelerometerAccelerationX(G)': 'accelerometer_X(G)',
        'accelerometerAccelerationY(G)': 'accelerometer_Y(G)',
        'accelerometerAccelerationZ(G)': 'accelerometer_Z(G)',
        'gyroRotationX(rad/s)': 'gyroscope_X(rad/s)',
        'gyroRotationY(rad/s)': 'gyroscope_Y(rad/s)',
        'gyroRotationZ(rad/s)': 'gyroscope_Z(rad/s)',
        'magnetometerX(µT)': 'magnetometer_X(microT)',
        'magnetometerY(µT)': 'magnetometer_Y(microT)',
        'magnetometerZ(µT)': 'magnetometer_Z(microT)',
        'motionGravityX(G)': 'gravity_X(G)',
        'motionGravityY(G)': 'gravity_Y(G)',
        'motionGravityZ(G)': 'gravity_Z(G)',
        'motionYaw(rad)': 'orientation_Z(rad)',
        'motionRoll(rad)': 'orientation_Y(rad)',
        'motionPitch(rad)': 'orientation_X(rad)',
    }

    # rename columns
    df = df.rename(columns=my_cols)

    # Keep only the harmonised columns plus the label, in mapping order.
    cols = list(my_cols.values()) + ['y']

    return df[cols]

In [None]:
# Accumulators for the train / test portions of every recording
# (nothing is appended to them yet - see the TODO steps in the loop).
all_train = pd.DataFrame()
all_test = pd.DataFrame()

# Each user recorded with a different app, so each needs its own reader.
for file, user, activity in zip(meta['file'], meta['user'], meta['activity']):
    print(file, user, activity)
    if user == 'nz':
        df = read_nz_file(file, activity)
    elif user == 'jg':
        df = read_jg_file(file, activity)
    else:
        # Fail loudly instead of silently reusing the previous iteration's df.
        raise ValueError(f'No reader defined for user {user!r} (file {file!r})')

    # TODO: split into train-test

    # TODO: preprocessing with aggregation and added y label for both files

    # TODO: append to train and test


# X - y split for test data