In [1]:
import os
import sys
import glob
import copy
import json
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
# import ipywidgets as widgets
# import bqplot.pyplot as bqplt
# from tqdm.notebook import tqdm
from IPython.core.interactiveshell import InteractiveShell
import warnings
# SettingWithCopyWarning lived in pandas.core.common up to pandas 1.4 and is
# exposed from pandas.errors in later releases; try the current location first
# so this cell survives Restart & Run All on either side of that move.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build does not define the warning at all: nothing to silence.
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
# import tensorflow as tf
# Which expression results IPython echoes for a cell. Accepted values:
# 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'.
InteractiveShell.ast_node_interactivity = "last"
# matplotlib configuration: thin dotted black grid lines and a 12 pt base font.
mpl.rcParams.update({
    'grid.color': 'k',
    'grid.linestyle': ':',
    'grid.linewidth': 0.5,
    'font.size': 12,
})
# plt.style.use(['dark_background'])
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)


Column 1: acceleration from the chest sensor (X axis)
Column 2: acceleration from the chest sensor (Y axis)
Column 3: acceleration from the chest sensor (Z axis)
Column 4: electrocardiogram signal (lead 1) 
Column 5: electrocardiogram signal (lead 2)
Column 6: acceleration from the left-ankle sensor (X axis)
Column 7: acceleration from the left-ankle sensor (Y axis)
Column 8: acceleration from the left-ankle sensor (Z axis)
Column 9: gyro from the left-ankle sensor (X axis)
Column 10: gyro from the left-ankle sensor (Y axis)
Column 11: gyro from the left-ankle sensor (Z axis)
Column 12: magnetometer from the left-ankle sensor (X axis)
Column 13: magnetometer from the left-ankle sensor (Y axis)
Column 14: magnetometer from the left-ankle sensor (Z axis)
Column 15: acceleration from the right-lower-arm sensor (X axis)
Column 16: acceleration from the right-lower-arm sensor (Y axis)
Column 17: acceleration from the right-lower-arm sensor (Z axis)
Column 18: gyro from the right-lower-arm sensor (X axis)
Column 19: gyro from the right-lower-arm sensor (Y axis)
Column 20: gyro from the right-lower-arm sensor (Z axis)
Column 21: magnetometer from the right-lower-arm sensor (X axis)
Column 22: magnetometer from the right-lower-arm sensor (Y axis)
Column 23: magnetometer from the right-lower-arm sensor (Z axis)
Column 24: Label (0 for the null class)

In [2]:
# Column names for the 24 tab-separated fields of each log file, in file order:
# chest accelerometer (x, y, z), the two chest ECG leads, then accelerometer /
# gyroscope / magnetometer (x, y, z each) for the left ankle and the right
# lower arm, and finally the activity label.
# NOTE(review): 'ecd_chest_*' names the electrocardiogram leads; the spelling is
# kept as-is because these strings become column and directory names downstream.
features = (
    [f'acc_chest_{axis}' for axis in 'xyz']
    + ['ecd_chest_1', 'ecd_chest_2']
    + [f'{sensor}_{part}_{axis}'
       for part in ('lankle', 'rlarm')
       for sensor in ('acc', 'gyro', 'mag')
       for axis in 'xyz']
    + ['label']
)


class MHealthDataset:
    """Load per-subject ``*.log`` files and split them by activity and sensor view.

    Each log file is read as a tab-separated table whose columns are named by
    ``self.features``; rows are grouped by the ``label`` column so that every
    (activity, subject) pair becomes its own DataFrame.
    """

    def __init__(self, data_path, activities=None, train_rate=0.5, nb_views=5):
        """Store the configuration; no file is read until ``load_data`` is called.

        Args:
            data_path: Directory that contains the ``*.log`` files.
            activities: Iterable of integer activity labels to keep. Must be
                set before ``load_data`` is called.
            train_rate: Stored on the instance; not used by ``load_data``,
                which slices a fixed number of rows instead.
            nb_views: Stored on the instance; ``split_views`` always builds
                three views regardless of this value.
        """
        self.data_path = data_path
        self.nb_views = nb_views
        self.features = features
        self.train_rate = train_rate
        self.activities = activities

    def load_data(self, train_split=2096):
        """Read every log file and return per-(activity, subject) train/test frames.

        Args:
            train_split: Number of leading rows kept from each activity
                recording (default 2096, the previously hard-coded value).

        Returns:
            ``(train_ap_dfs, test_ap_dfs)``: two dicts keyed ``"aNN_pM"``, where
            ``NN`` is the activity label and ``M`` is the 1-based position of
            the file in the sorted file list. Both are empty when no ``*.log``
            file is found.

        Raises:
            ValueError: If ``self.activities`` is ``None``, or a requested
                activity has no rows in one of the files.
        """
        if self.activities is None:
            raise ValueError(
                "activities must be an iterable of activity labels, e.g. range(1, 12)")
        wanted = list(self.activities)

        # NOTE(review): sorted() is lexicographic, so a name ending in "10" sorts
        # before one ending in "2"; the "pM" index then differs from the subject
        # number embedded in the file name -- confirm against the actual names.
        p_paths = sorted(glob.glob(self.data_path+"/*.log"))

        aps_dfs = {}
        for p, p_path in enumerate(p_paths):
            p_df = pd.read_csv(p_path, delimiter='\t', names=self.features, header=None)
            # Look groups up by label value rather than by list position, so a
            # non-contiguous or partially missing activity set can never shift
            # one activity's rows under another activity's key.
            by_label = {label: group for label, group in p_df.groupby('label')}
            for act in wanted:
                if act not in by_label:
                    raise ValueError(f"activity {act} has no rows in {p_path}")
                aps_dfs[f"a{act:02d}_p{p+1}"] = by_label[act]

        train_ap_dfs = {}
        test_ap_dfs = {}
        # NOTE(review): train and test are built from the SAME leading rows, as
        # in the original code (a disjoint split had been commented out). This
        # is preserved here; confirm that the overlap is intended.
        for key, ap_df in aps_dfs.items():
            train_ap_dfs[key] = ap_df[:train_split].reset_index(drop=True)
            test_ap_dfs[key] = ap_df[:train_split].reset_index(drop=True)
        return train_ap_dfs, test_ap_dfs

    def split_views(self, ap_dfs):
        """Split every frame into one column subset per body location.

        Args:
            ap_dfs: Dict of DataFrames as returned by ``load_data``.

        Returns:
            Dict with keys ``view_1`` (chest), ``view_2`` (lankle) and
            ``view_3`` (rlarm); each maps the keys of ``ap_dfs`` to the frame
            restricted to the feature names containing that location string.
            The ``label`` column matches none of them and is left out.
        """
        views = {}
        for v, part in enumerate(['chest', 'lankle', 'rlarm']):
            view_features = [feat for feat in self.features if part in feat]
            views[f'view_{v+1}'] = {key: df[view_features]
                                    for key, df in ap_dfs.items()}
        return views

# Load activities 1..11 for every subject, then split each frame into the
# chest / left-ankle / right-lower-arm views.
data_path = '../raw_datasets/mhealth'
activities = [act for act in range(1, 12)]
clusters = [f"a{act:02d}" for act in activities]
dataset = MHealthDataset(data_path, activities=activities)
train_ap_dfs, test_ap_dfs = dataset.load_data()
train_views_dfs, test_views_dfs = (
    dataset.split_views(ap_dfs) for ap_dfs in (train_ap_dfs, test_ap_dfs)
)
# Preview: left-ankle view of activity 11 for the first loaded file.
train_views_dfs['view_2']['a11_p1'].head()

Unnamed: 0,acc_lankle_x,acc_lankle_y,acc_lankle_z,gyro_lankle_x,gyro_lankle_y,gyro_lankle_z,mag_lankle_x,mag_lankle_y,mag_lankle_z
0,-3.9508,-19.154,12.294,0.5974,-0.74109,-0.38114,118.88,-71.831,14.358
1,9.4186,-11.642,-15.696,0.5974,-0.74109,-0.38114,20.622,-169.75,34.618
2,19.521,-6.9617,-12.007,0.5974,-0.74109,-0.38114,-72.995,40.179,44.492
3,-2.5352,-9.3031,-1.5947,0.51763,-0.78612,-0.42436,-121.06,27.449,70.256
4,0.75023,-19.147,11.998,0.51763,-0.78612,-0.42436,-135.99,-145.8,68.536


In [3]:
# Sanity check: (rows, columns) of one activity/subject frame in view_2 (left ankle).
train_views_dfs['view_2']['a11_p1'].shape

(2096, 9)

In [4]:
stored_dir = '../preprocessed_datasets/mhealth'
# Train dataset: write one single-column CSV per (view, column, activity/subject)
# to <stored_dir>/raw/<view>/<column>/<key>.csv.
for view, view_dfs in train_views_dfs.items():
    view_path = stored_dir+f"/raw/{view}"
    # exist_ok=True replaces the check-then-create pattern, matching the
    # per-column call below, so the cell is safe to re-run.
    os.makedirs(view_path, exist_ok=True)
    for ap, df in view_dfs.items():
        for col in df.columns:
            path = f"{view_path}/{col}"
            os.makedirs(path, exist_ok=True)
            # The frame's index is written as the first, unnamed CSV column.
            df[col].to_csv(f"{path}/{ap}.csv", header=[col])

In [None]:
%run ../src/anomaly_generations.py

dataset_name = 'mhealth_timestep_same_subject_random_view'
sample = 10
nb_views = 3
today = datetime.now().strftime("%Y%m%d")
for anomaly_rate in [5]:
    dir_path = f"../preprocessed_datasets/datasets_{today}/{dataset_name}/sample{sample}/anomaly_rate_{anomaly_rate}_views_{nb_views}"
    swapped_test_views_dfs, ground_truths = swap_time_steps(copy.deepcopy(
        test_views_dfs), clusters=clusters, anomaly_rate=anomaly_rate*0.01)

    # Save to files
    print("Saving files...")
    for view, view_dfs in train_views_dfs.items():
        view_path = dir_path+f"/train/{view}"
        if not os.path.exists(view_path):
            os.makedirs(view_path)
        for ap, df in view_dfs.items():
            if not 'a11' in ap:
                df.to_csv(f"{view_path}/{ap}.csv", index=False)
    for view, view_dfs in swapped_test_views_dfs.items():
        view_path = dir_path+f"/test/{view}"
        if not os.path.exists(view_path):
            os.makedirs(view_path)
        for ap, df in view_dfs.items():
            if not 'a11' in ap:
                df.to_csv(f"{view_path}/{ap}.csv", index=False)
    for ap, gt in ground_truths.items():
        if not 'a11' in ap:
            gt.to_csv(dir_path+f"/test/{ap}.csv", index=False)
    print('Done.')


Generating anomalies by replacing one subsequence...


100%|██████████| 110/110 [00:00<00:00, 3495.54it/s]

Saving files...





Done.
Generating anomalies by replacing one subsequence...


100%|██████████| 110/110 [00:00<00:00, 3899.77it/s]

Saving files...





Done.
