## Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from influxdb import InfluxDBClient, DataFrameClient

## Load Data

In [2]:
# Connect to the InfluxDB server and pull the complete `devicemotion`
# measurement into a DataFrame.
# Connection settings are read from environment variables so real credentials
# never need to be committed with the notebook; each fallback is the value this
# cell used before, so behaviour is unchanged when nothing is set.
influxClient = DataFrameClient(
    host=os.environ.get("INFLUXDB_HOST", "css20.dmz.teco.edu"),
    port=int(os.environ.get("INFLUXDB_PORT", "80")),
    database=os.environ.get("INFLUXDB_DATABASE", "browser"),
    username=os.environ.get("INFLUXDB_USER", "user"),
    password=os.environ.get("INFLUXDB_PASSWORD", "pass"),
)
# The query result is indexed by measurement name; keep the one frame we need.
data = influxClient.query("select * FROM devicemotion")["devicemotion"]

In [3]:
# List the columns of the loaded devicemotion frame.
data.columns

Index(['acceleration.x', 'acceleration.y', 'acceleration.z',
       'accelerationIncludingGravity.x', 'accelerationIncludingGravity.y',
       'accelerationIncludingGravity.z', 'label', 'mobile',
       'rotationRate.alpha', 'rotationRate.beta', 'rotationRate.gamma',
       'subject', 'useragent'],
      dtype='object')

In [4]:
# Preview the first two rows of the loaded frame.
data.head(2)

Unnamed: 0,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,label,mobile,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,subject,useragent
2020-05-18 15:38:31.330200+00:00,0.0514,-0.4303,1.4837,0.038307,5.439626,9.825804,testing,UnknownPhone,-13.115,-43.004999,0.488,10b5c,Mozilla/5.0 (Linux; Android 7.0; FRD-L19) Appl...
2020-05-18 15:38:31.348400+00:00,0.6009,-0.1249,1.0312,0.172383,5.477933,9.174581,testing,UnknownPhone,-2.623,-6.527,-2.379,10b5c,Mozilla/5.0 (Linux; Android 7.0; FRD-L19) Appl...


In [5]:
# (rows, columns) of the frame before any cleaning.
data.shape

(346487, 13)

## Data Cleaning

In [6]:
# Keep only rows indexed from 2020-07-01 onwards (label-based slice on the
# index), then discard every row that contains a missing value.
data = (
    data
    .loc['2020-07-01':]
    .dropna()
)
data.shape

(179951, 13)

## Preprocessing

In [7]:
# Distinct label values and subject ids present in the cleaned data.
labels = data['label'].unique()
subjects = data['subject'].unique()

# Nested lookup: grouped_data[subject][label] -> rows of that subject carrying
# that label. Every (subject, label) pair gets an entry, so a combination with
# no rows maps to an empty frame.
grouped_data = {}
for subject in subjects:
    subject_data = data.loc[data['subject'] == subject]
    grouped_data[subject] = {
        label: subject_data.loc[subject_data["label"] == label]
        for label in labels
    }

## Windowing

In [8]:
def minmax(data):
    """Return the spread of ``data``: its largest value minus its smallest.

    ``data`` may be anything ``np.max`` and ``np.min`` accept. The function is
    passed as a custom reducer in the aggregation spec of this notebook.
    """
    highest = np.max(data)
    lowest = np.min(data)
    return highest - lowest

In [9]:
# Aggregation spec: every float64 column is reduced with the built-in
# statistics named below plus the custom `minmax` range function.
numeric_columns = {
    col: ['max', 'mean', 'min', "std", "var", minmax]
    for col, dtype in data.dtypes.items()
    if dtype == 'float64'
}

In [11]:
# aggregatedDict[subject][label] -> summary statistics per 1-second window.
aggregatedDict = {}
for subject in subjects:
    per_label = aggregatedDict[subject] = {}
    for label in labels:
        segment = grouped_data[subject][label]
        # No rows for this subject/label combination: leave the key out.
        if segment.empty:
            continue
        # Bucket the rows into 1-second bins on the index, reduce each numeric
        # column with the configured statistics, and drop any window in which
        # at least one statistic is NaN.
        per_label[label] = segment.resample('1s').agg(numeric_columns).dropna()

## Data Transformation

In [19]:
# groups for Leave-One-Subject-Out-CV: one subject id per row of `data`
groups = []

# Transform data into an ungrouped and flat table.
# The per-window frames are collected first and concatenated once:
# `DataFrame.append` was removed in pandas 2.0, and appending inside the loop
# re-copied the whole table on every iteration.
window_frames = []
for subject in subjects:
    for label in labels:
        if label in aggregatedDict[subject]:
            # `assign` returns a labelled copy, so the frames stored in
            # aggregatedDict are no longer modified by this cell.
            curr_data = aggregatedDict[subject][label].assign(label=label)
            window_frames.append(curr_data)
            groups.extend(len(curr_data) * [subject])

# pd.concat raises on an empty list, so fall back to an empty frame
# (which is what the append-based loop produced in that case).
data = pd.concat(window_frames) if window_frames else pd.DataFrame()

## Train-Test Split (Leave-One-Subject-Out)

In [None]:
from sklearn.model_selection import LeaveOneGroupOut

# Build the split inputs from the flattened table instead of toy arrays.
# Previously this cell rebound `groups` to a hard-coded example, which threw
# away the per-row subject ids collected while flattening the data.
X = data.drop(columns="label").to_numpy()   # aggregated statistics per window
y = data["label"].to_numpy().ravel()        # label of each window
groups = np.asarray(groups)                 # subject id of each window
assert len(X) == len(y) == len(groups), "X, y and groups must be row-aligned"

logo = LeaveOneGroupOut()
# One split per distinct group, i.e. per subject; 'groups' is always required.
logo.get_n_splits(X, y, groups)