# DSA ETL

In [None]:
import os
import sys
import glob
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
# import ipywidgets as widgets
# import bqplot.pyplot as bqplt
# from tqdm.notebook import tqdm
from IPython.core.interactiveshell import InteractiveShell
import warnings

# SettingWithCopyWarning is importable from ``pandas.errors`` on pandas >= 1.5
# and from ``pandas.core.common`` on older releases. Try both so this cell
# does not die with ImportError depending on the installed pandas version.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # NOTE(review): presumably a pandas build that no longer ships this
        # warning class; in that case there is nothing to silence.
        SettingWithCopyWarning = None
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
# import tensorflow as tf
# ['all', 'last', 'last_expr', 'none', 'last_expr_or_assign']
InteractiveShell.ast_node_interactivity = "last"
# matplotlib configuration
mpl.rcParams['grid.color'] = 'k'
mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['grid.linewidth'] = 0.5
mpl.rcParams['font.size'] = 12
# plt.style.use(['dark_background'])
# Show complete frames when a cell displays a DataFrame (no row, column or
# cell-width truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Load DSA dataset

In [None]:
%run ../src/dsa_loader.py

# Directory handed to the loader as the location of the raw DSA files.
raw_data_dir = "../raw_datasets/dsa"
# Activity codes "a01" .. "a19" (two digits, zero-padded).
clusters = [f"a{i:02d}" for i in range(1, 19+1)]
# DSADataset is not defined in this notebook; it is expected to come from
# ../src/dsa_loader.py, which the %run line at the top of this cell executes.
dsa_dataset = DSADataset(raw_data_dir, activities=clusters)
train_object_dfs, test_object_dfs = dsa_dataset.load_data()
# Later cells index these results as <views_dfs>[view][instance].
train_views_dfs = dsa_dataset.split_views(train_object_dfs)
test_views_dfs = dsa_dataset.split_views(test_object_dfs)
stored_dir = '../preprocessed_datasets/dsa'
# Only the training views are passed to save_into_features here.
dsa_dataset.save_into_features(stored_dir, train_views_dfs)

In [None]:
# Preview the first rows of one test frame (view 'view_1', instance 'a09_p1').
test_views_dfs['view_1']['a09_p1'].head()

In [None]:
import json

def get_scores(view, feature, instances):
    """Load the saved anomaly scores of one feature as a DataFrame.

    Reads ``<feature>_saved_scores.json`` from the ``gen_data/<view>``
    folder, takes element ``[1]`` of every stored record as that record's
    score sequence, and returns the sequences as columns.

    Args:
        view: Name of the view sub-directory to read from.
        feature: Feature name used as the file-name prefix.
        instances: Labels for the columns, one per stored record, in the
            same order as the records in the file.

    Returns:
        A DataFrame with one column per entry of ``instances`` (labels are
        formatted as strings) and one row per score position.
    """
    score_path = f"../preprocessed_datasets/dsa/gen_data/{view}/{feature}_saved_scores.json"
    with open(score_path, 'r') as handle:
        records = json.load(handle)

    # One row per record before transposing; the shape is echoed as a quick
    # sanity check.
    score_matrix = np.array([record[1] for record in records])
    print(score_matrix.shape)

    column_labels = [f'{label}' for label in instances]
    return pd.DataFrame(score_matrix.T, columns=column_labels)

def plot_instance(instance, view, feature, instances):
    """Plot one training signal together with its anomaly scores.

    The upper axis shows ``train_views_dfs[view][instance][feature]`` with
    the 100 highest-scoring positions marked in red; the lower axis shows
    the anomaly-score series of the same instance.

    Args:
        instance: Key of the instance to plot (also a score column name).
        view: View the signal and the scores belong to.
        feature: Column of the instance frame to plot.
        instances: Column labels forwarded to ``get_scores``.
    """
    scores = get_scores(view, feature, instances)
    _, (signal_ax, score_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 7*2))

    signal = train_views_dfs[view][instance][feature]
    signal_ax.plot(signal)

    # Index labels of the 100 largest scores, highest first.
    top_indices = scores[instance].sort_values(ascending=False)[:100].index
    signal_ax.scatter(top_indices, signal[top_indices], c='r')

    score_ax.plot(scores[instance])
    plt.show()

def _blank_and_interpolate(frame, feature, keys):
    """Set ``frame[feature]`` to NaN at ``keys``, then linearly interpolate ``frame`` in place."""
    # Work on an explicit copy of the column and assign it back. Writing
    # through ``frame[feature][key] = ...`` (chained indexing) is not
    # guaranteed to reach ``frame`` and can silently do nothing.
    column = frame[feature].copy()
    # One key at a time, so every key goes through the same scalar
    # ``Series`` item assignment the chained form used.
    for key in keys:
        column[key] = np.nan
    frame[feature] = column
    # NOTE: We use linear method since it is the only method that supports MultiIndexes
    frame.interpolate(method='linear', inplace=True)


def replace_anomaly_values(view, feature, instances):
    """Replace the highest-scoring points of a feature by interpolated values.

    For every instance, the index labels of the 20 largest anomaly scores
    (as returned by ``get_scores``) are set to NaN in the ``feature`` column
    of both ``train_views_dfs[view][instance]`` and
    ``test_views_dfs[view][instance]``; each of those frames is then
    linearly interpolated in place.

    Args:
        view: View whose frames are modified.
        feature: Column to clean; also selects the score file to load.
        instances: Instance keys to process; also used as the column labels
            of the loaded score table.

    Returns:
        None. The module-level ``train_views_dfs`` and ``test_views_dfs``
        frames are mutated.
    """
    anomaly_scores = get_scores(view, feature, instances)
    for instance in instances:
        predict_indices = anomaly_scores[instance].sort_values(ascending=False)[:20].index
        # The same positions are blanked in the training and the test frame.
        _blank_and_interpolate(train_views_dfs[view][instance], feature, predict_indices)
        _blank_and_interpolate(test_views_dfs[view][instance], feature, predict_indices)
        


# instance = 'a01_p10'
# view = 'view_2'
# feature = 'mag_lankle_y'
# instances = sorted(list(train_views_dfs['view_1'].keys()))
# plot_instance(instance, view, feature, instances)
# replace_anomaly_values(view, feature, instances)
# plot_instance(instance, view, feature, instances)
# for view, view_dfs in train_views_dfs.items():
#     instances = sorted(list(train_views_dfs[view].keys()))
#     for feature in view_dfs[instances[0]].columns:
#         replace_anomaly_values(view, feature, instances)

In [None]:
# Cut every frame at row 3750: training frames keep the slice [:3750], test
# frames keep the slice [3750:]. Each frame is rebuilt from its raw values,
# so it gets a fresh default row index. Instance names are taken from
# 'view_1' for every view.
for view in train_views_dfs:
    for instance in train_views_dfs['view_1']:
        full_train = train_views_dfs[view][instance]
        train_views_dfs[view][instance] = pd.DataFrame(full_train[:3750].values, columns=full_train.columns)
        full_test = test_views_dfs[view][instance]
        test_views_dfs[view][instance] = pd.DataFrame(full_test[3750:].values, columns=full_test.columns)
# Show the resulting shapes for one instance as a sanity check.
print(train_views_dfs['view_1']['a01_p1'].shape)
print(test_views_dfs['view_1']['a01_p1'].shape)

## Generate Anomalies

In [None]:
%run ../src/anomaly_generations.py

dataset_name = 'dsa_timestep_same_subject_random_view'
n_samples = 10
nb_views = 5

# For every (sample, anomaly rate) pair: swap time steps in a deep copy of the
# test views, then write the untouched training views, the swapped test views
# and the ground-truth tables under one output directory.
# NOTE(review): with n_samples = 10 the range below starts past its end, so
# the loop body never runs and nothing is written -- confirm the intended
# first sample index before relying on this cell.
for sample in range(11, n_samples+1):
    for anomaly_rate in [5, 10, 15, 20]:
        dir_path = f"../preprocessed_datasets/{dataset_name}/sample{sample}/anomaly_rate_{anomaly_rate}_views_{nb_views}"
        # The rate is given in percent here and passed on as a fraction.
        swapped_test_views_dfs, ground_truths = swap_time_steps(copy.deepcopy(test_views_dfs), clusters=clusters, anomaly_rate=anomaly_rate*0.01)
        # Save to files
        print("Saving files...")
        for split, views in (("train", train_views_dfs), ("test", swapped_test_views_dfs)):
            for view, frames in views.items():
                out_dir = f"{dir_path}/{split}/{view}"
                os.makedirs(out_dir, exist_ok=True)
                for name, frame in frames.items():
                    frame.to_csv(f"{out_dir}/{name}.csv", index=False)
        # Ground truths go directly into the test folder, next to the views.
        for name, truth in ground_truths.items():
            truth.to_csv(f"{dir_path}/test/{name}.csv", index=False)
        print('Done.')