# Anomaly detection

## Import libraries

In [None]:
import time
import gc
import warnings
warnings.filterwarnings('ignore')

'''Main'''
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import glob

'''Data Viz'''
import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.express as px

'''Data Prep and Model Evaluation'''
from sklearn import preprocessing as pp
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

'''Algorithms'''

'''Tensorflow and Keras'''
import tensorflow as tf
from tensorflow import keras
# K = keras.backend

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout
from keras.layers import BatchNormalization, Input, Lambda
from keras import regularizers
from keras.losses import mse, binary_crossentropy

### Check GPU

In [None]:
# from tensorflow.python.client import device_lib
# device_lib.list_local_devices()

## Load data

In [None]:
# Event id -> display name; every name is currently an empty placeholder.
event_dir = {
    0: "", 1: "", 2: "", 3: "", 4: "", 5: "",
    6: "", 7: "", 8: "", 9: "", 10: "", 11: "",
}
# The same twelve placeholders in list form.
event_list = [
    "", "", "", "", "",
    "", "", "", "", "", "", "",
]
exercise_list = ["No motion", "motion"]
# One colour per entry of event_list.
colors = [
    "red", "blue", "orange", "green", "black", "tan", "gray", "purple",
    "cyan", "yellow", "pink", "magenta",
]
# Event ids that are filtered out of the "normal" data further down.
labels_to_remove = [0, 2, 3, 4, 6]

In [None]:
def remove_zero(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row whose ``event`` value is 0 and renumber the rows.

    Args:
        df: Frame that contains an ``event`` column.

    Returns:
        A new frame without the ``event == 0`` rows, indexed 0..n-1.
        The frame passed in is left untouched.
    """
    kept = df[df["event"] != 0]
    # drop=True discards the old index instead of turning it into a column,
    # so this also works when the index is named or when the frame already
    # has a column called "index".
    return kept.reset_index(drop=True)

### Train data (TODO: move the loading code into a function)

Several data files are used; the left-hand and right-hand data are combined into one dataframe.\
The code cell below loads the data and displays the result.

In [None]:
# Column names for the header-less CSV files of each hand.
left_columns_name = ["L_accX", "L_accY", "L_accZ", "L_bpm", "L_temp", "event"]
right_columns_name = ["R_accX", "R_accY", "R_accZ", "R_bpm", "R_temp", "event"]

# file names as list
# glob.glob() returns paths in arbitrary order, so both lists are sorted to
# keep the i-th left file paired with the i-th right file in zip() below.
# NOTE(review): this assumes left*/right* names sort in the same order - confirm.
left_files = sorted(glob.glob("../../../../../data/data_[1][0-2]/left*.csv"))
right_files = sorted(glob.glob("../../../../../data/data_[1][0-2]/right*.csv"))


df_list = []

# read every file one by one
for left, right in zip(left_files, right_files):
    # read both hands data
    left_df = pd.read_csv(left, header=None, names=left_columns_name)
    right_df = pd.read_csv(right, header=None, names=right_columns_name)

    # drop "event" column not to duplicate
    right_df = right_df.drop("event", axis=1)

    # put the two hands side by side (column-wise)
    df_train = pd.concat([left_df, right_df], axis=1)
    df_list.append(df_train)

# concatenate dataframes in vertical direction
df_train = pd.concat(df_list, axis=0, ignore_index=True)
del df_list, left_df, right_df
gc.collect()
# discard the rows whose event is 0
df_train = remove_zero(df_train)

df_train

### Test data

Same procedure as train data, but different files are loaded.

In [None]:
# file names as list
# glob.glob() returns paths in arbitrary order, so both lists are sorted to
# keep the i-th left file paired with the i-th right file in zip() below.
# NOTE(review): this assumes left*/right* names sort in the same order - confirm.
left_files = sorted(glob.glob("../../../../../data/data_[1][3-5]/left*.csv"))
right_files = sorted(glob.glob("../../../../../data/data_[1][3-5]/right*.csv"))

df_list = []

# read every file one by one
for left, right in zip(left_files, right_files):
    # read both hands data
    left_df = pd.read_csv(left, header=None, names=left_columns_name)
    right_df = pd.read_csv(right, header=None, names=right_columns_name)

    # drop "event" column not to duplicate
    right_df = right_df.drop("event", axis=1)

    # put the two hands side by side (column-wise)
    df_test = pd.concat([left_df, right_df], axis=1)
    df_list.append(df_test)

# concatenate dataframes in vertical direction
df_test = pd.concat(df_list, axis=0, ignore_index=True)
del df_list, left_df, right_df
gc.collect()
# discard the rows whose event is 0
df_test = remove_zero(df_test)
df_test

## Pre-processing

### Label encoding, 0: No exercise, 1: Exercise

In [None]:
# Encoder used below to turn the class strings returned by custom_encoding()
# into integer labels.
le = LabelEncoder()

# firstly convert event labels into two types: 0, 1 
def custom_encoding(x):
    """Return "B" when the event label is 2, 3, 4 or 6, otherwise "A"."""
    return "B" if x in (2, 3, 4, 6) else "A"

# encoding those 2 types
# Fit once on the fixed class set so that "A" -> 0 and "B" -> 1 in both
# frames. Refitting on each frame separately would remap the labels whenever
# one of the two classes happens to be absent from that frame.
le.fit(["A", "B"])

data_encoded = le.transform([custom_encoding(x) for x in df_train["event"]])
df_train["motion"] = data_encoded

data_encoded = le.transform([custom_encoding(x) for x in df_test["event"]])
df_test["motion"] = data_encoded

### Scaling (min-max normalization)

**From here on, df_train and df_test hold min-max scaled feature values.**

In [None]:
# The two label columns are left unscaled; every other column is a feature.
# Taking the names from the frame itself (instead of slicing the first ten
# column names) keeps each scaled column under its own name: "event" sits in
# the middle of the frame, so a positional slice mislabels the right-hand
# columns and lets the label assignment overwrite scaled sensor data.
label_columns = ["event", "motion"]
feature_columns = [c for c in df_train.columns if c not in label_columns]

# train: the scaler learns each feature's min/max from the training data only
scaler = MinMaxScaler()
labels = df_train[label_columns]
df_train = pd.DataFrame(
    scaler.fit_transform(df_train[feature_columns]),
    columns=feature_columns,
    index=labels.index,
)
df_train[label_columns] = labels
df_train.describe()

# test: reuse the scaler fitted on the training data so both sets share one
# scale; refitting on the test set would map the same raw value to different
# inputs for the model. Test values may therefore fall slightly outside [0, 1].
labels = df_test[label_columns]
df_test = pd.DataFrame(
    scaler.transform(df_test[feature_columns]),
    columns=feature_columns,
    index=labels.index,
)
df_test[label_columns] = labels
df_test.describe()

Store anomaly data

In [None]:
# Rows whose event is listed in labels_to_remove are set aside as anomalies.
# reset_index() without drop keeps the previous row labels in an "index" column.
train_anomaly_mask = df_train["event"].isin(labels_to_remove)
anomaly_train = df_train[train_anomaly_mask].reset_index()

test_anomaly_mask = df_test["event"].isin(labels_to_remove)
anomaly_test = df_test[test_anomaly_mask].reset_index()

### Remove labels 0, 2, 3, 4 and 6 from the data: treat them as anomalies

X is the NO-exercise data (X is the normal data).\
X_train and X_test contain no rows with `motion` = 1.

In [None]:
# train: keep only the rows whose event is NOT in labels_to_remove, renumbered from 0
train_is_normal = ~df_train["event"].isin(labels_to_remove)
X_train = df_train[train_is_normal].reset_index().drop("index", axis=1)
print("labels:", X_train["event"].unique())

# test: same filter applied to the test frame
test_is_normal = ~df_test["event"].isin(labels_to_remove)
X_test = df_test[test_is_normal].reset_index().drop("index", axis=1)
print("labels:", X_test["event"].unique())

### Ratio of activities of original

In [None]:
# Share of rows per event id in the full training frame, ordered by event id.
l = df_train["event"].value_counts()
n_rows = len(df_train["event"])
l.sort_index() / n_rows

In [None]:
# Share of rows per motion class in the full training frame.
# Printed labels are Japanese: "no exercise" for class 0, "exercise" for class 1.
l = df_train["motion"].value_counts()
n_rows = len(df_train["motion"])
no_motion_ratio = round(l[0] / n_rows, 3)
motion_ratio = round(l[1] / n_rows, 3)
print("運動なし ratio: " + str(no_motion_ratio))
print("運動あり ratio: " + str(motion_ratio))

### Ratio of activities after removing anomaly

In [None]:
# Share of rows per event id after the anomaly events were removed.
l = X_train["event"].value_counts()
n_rows = len(X_train["event"])
l.sort_index() / n_rows

In [None]:
# Share of class 0 rows in the filtered train and test frames.
# The printed Japanese label means "no exercise".
l = X_train["motion"].value_counts()
train_ratio = round(l[0] / len(X_train["motion"]), 3)
print("Train data")
print("運動なし ratio: " + str(train_ratio))

l = X_test["motion"].value_counts()
test_ratio = round(l[0] / len(X_test["motion"]), 3)
print("Test data")
print("運動なし ratio: " + str(test_ratio))

### sliding window

Build a new 2D array of window-sized chunks.\
The data is split into chunks of `window_size` samples, so the result is shaped like\
[ [32 samples], [next 32 samples], [next 32 samples], ...]

In [None]:
# Number of consecutive samples in one window.
window_size = 32
# Distance between the starts of two consecutive windows; taking it equal to
# window_size means the windows do not overlap.
step_size = window_size

In [None]:
def create_sequences(df: "pd.Series | pd.DataFrame", window=None, step=None) -> np.ndarray:
    """Cut ``df`` into fixed-length windows along its rows.

    Window ``k`` holds rows ``k*step`` .. ``k*step + window - 1``. A trailing
    remainder shorter than ``window`` is dropped.

    Args:
        df: Series or DataFrame to slice; rows are taken by position.
        window: Rows per window. Defaults to the module-level ``window_size``.
        step: Offset between window starts. Defaults to the module-level
            ``step_size``.

    Returns:
        Array of shape ``(n_windows, window)`` for a Series, or
        ``(n_windows, window, n_columns)`` for a DataFrame. ``n_windows`` is 0
        when the input is shorter than one window.

    Raises:
        ValueError: If ``window`` or ``step`` is not positive.
    """
    # Fall back to the notebook-wide settings when no explicit size is given.
    if window is None:
        window = window_size
    if step is None:
        step = step_size
    if window <= 0 or step <= 0:
        raise ValueError("window and step must be positive integers")

    values = df.to_numpy()
    starts = range(0, len(values) - window + 1, step)
    if not starts:
        # Keep the window axis even with no windows, so callers can still
        # read .shape[1].
        return np.empty((0, window) + values.shape[1:], dtype=values.dtype)
    return np.stack([values[start:start + window] for start in starts])

**here is problem**

In [None]:
# Cut the left-hand accX signal of the normal (non-anomalous) rows into windows.
# NOTE(review): X_train/X_test were built by concatenating several files and
# then removing rows, so one window can span samples that were not adjacent
# in the recording - confirm whether that is acceptable.
# train
normal_train = create_sequences(X_train["L_accX"])
print(normal_train)

# test
normal_test = create_sequences(X_test["L_accX"])
# normal_test = create_sequences(df_test["L_accX"])
normal_test

## AutoEncoder

Use the motion label (0 or 1) to detect motion.\
0 is treated as normal and 1 as anomalous.

In [None]:
# Display the shape of the training windows array.
normal_train.shape

In [None]:
# Samples per window. The autoencoder reconstructs its own input, so the input
# and the output layer must both have this width.
n_inputs = normal_train.shape[1]

# input layer
# shape has to be a tuple: (n_inputs,) with the trailing comma. (n_inputs)
# is just an int, which not every Keras version accepts.
input_layer = Input(shape=(n_inputs,))
# encoding layer
encoding = Dense(27, activation="relu")(input_layer)

# decoding layer
# The sigmoid output lies in (0, 1), matching the min-max scaled training input.
output_layer = Dense(n_inputs, activation="sigmoid")(encoding)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

model.summary()

In [None]:
# Draw the layer graph of the model, including the tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# learning
# The autoencoder is trained to reproduce its input, so the windows are
# passed as both input and target; 20 % of them are held out for validation.
epochs = 7
validation_split = 0.20

start = time.time()
history = model.fit(
    normal_train,
    normal_train,
    epochs=epochs,
    verbose=1,
    validation_split=validation_split,
)

# wall-clock duration of the fit, in seconds
end = time.time() - start
print(f"\n{round(end, 2)} sec taken")


In [None]:
# Print the layer summary of the model again.
model.summary()

In [None]:
# Draw the layer graph again, this time with activations and without shapes or layer names.
keras.utils.plot_model(model, show_shapes=False, show_layer_activations=True, show_layer_names=False, )

## Show results

In [None]:
def visualize_loss(history, title):
    """Plot training MSE, training MAE and validation MSE against the epoch.

    Args:
        history: Object whose ``history`` dict has the keys ``"loss"``,
            ``"val_loss"`` and ``"mae"`` (as returned by ``model.fit``).
        title: Title shown above the figure.
    """
    logs = history.history
    train_mse = logs["loss"]
    valid_mse = logs["val_loss"]
    train_mae = logs["mae"]
    x_axis = range(len(train_mse))

    # (values, line style, legend label) in drawing order
    curves = (
        (train_mse, "b", "Training loss MSE"),
        (train_mae, "g", "Training loss MAE"),
        (valid_mse, "r", "Validation loss MSE"),
    )

    plt.figure()
    for values, line_style, legend_label in curves:
        plt.plot(x_axis, values, line_style, label=legend_label)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


visualize_loss(history, "Training and Validation Loss")

In [None]:
from sklearn.metrics import mean_squared_error

# prediction and confirmation
# Reconstruct every training window with the trained autoencoder.
predict_data = model.predict(normal_train)

# calculate all mse: one reconstruction MSE per window (arguments in the
# documented (y_true, y_pred) order)
all_data_mse = [
    mean_squared_error(original, reconstructed)
    for original, reconstructed in zip(normal_train, predict_data)
]

# mse as histogram
plt.figure(figsize=(12, 8))
# The label gives plt.legend() below an entry to show; without any labelled
# artist the legend call draws nothing.
plt.hist(all_data_mse, bins=100, color="blue", alpha=0.5, label="train windows")
plt.title("MSE hist")
plt.xlabel("MSE")
plt.ylabel("freq")
plt.legend(fontsize=12)

In [None]:
# np.round(predict_data, 3)

Flatten the predicted data to 1D

In [None]:
# Display the shape of the reconstructed training windows.
predict_data.shape

In [None]:
# Concatenate the reconstructed windows, in order, into one flat list of samples.
combined_data = [sample for window in predict_data for sample in window]
combined_data

In [None]:
import plotly.graph_objects as go

# create_sequences() drops the incomplete last window, so the flattened
# reconstruction can be shorter than X_train. The predicted trace is plotted
# against exactly as many x positions as it has samples.
n_predicted = len(combined_data)
# NOTE(review): the division by 16 presumably converts the sample index to
# seconds (16 samples per second?) - confirm.
train_seconds = X_train.index / 16
predicted_seconds = X_train.index[:n_predicted] / 16

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_seconds, y=X_train["L_accX"], name="train"))
fig.add_trace(go.Scatter(x=predicted_seconds, y=combined_data, name="predicted"))
fig.update_yaxes(tickformat=".1f", title_text="accX (G)")
fig.update_xaxes(tickformat='d', title_text="second")

fig.update_layout(legend=dict(title_font_family="Times New Roman",
                                font=dict(size= 20)
))

fig.show()

In [None]:
# anomaly train is anomaly data from training dataframe
# The windows are rebuilt from df_train rather than from anomaly_train itself:
# this cell rebinds anomaly_train from a DataFrame to an array, so reading
# anomaly_train["L_accX"] would fail when the cell is run a second time.
anomaly_signal = df_train.loc[df_train["event"].isin(labels_to_remove), "L_accX"]
anomaly_train = create_sequences(anomaly_signal.reset_index(drop=True))

In [None]:
# Number of anomaly windows vs. number of normal windows.
len(anomaly_train), len(normal_train)

In [None]:
# Display the anomaly windows.
anomaly_train

In [None]:
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

def get_errors(input):
    """Return the reconstruction error of the autoencoder for each row.

    Runs ``model.predict`` on ``input``, prints the shapes of the input and
    of the reconstruction, and sums the absolute difference along axis 1.
    """
    reconstructed = model.predict(input)
    print("input shape", input.shape)
    print("output shape", reconstructed.shape)
    absolute_diff = np.abs(input - reconstructed)
    return np.sum(absolute_diff, axis=1)

# Reconstruction errors of the normal and of the anomaly training windows.
x_normal_errors = get_errors(normal_train)
x_abnomal_errors = get_errors(anomaly_train)

# Overlay both error distributions on one axis (anomalies drawn first).
ax = None
for errors, legend_label in (
    (x_abnomal_errors, "Exercise"),
    (x_normal_errors, "NO exercise"),
):
    ax = sns.distplot(errors, ax=ax, bins=20, label=legend_label)
ax.set_xlabel("error")
# plt.xlim([0, 2])
plt.legend()

In [None]:
import plotly.graph_objects as go

# Line plot of the anomaly-window errors in window order.
fig = go.Figure(data=[go.Scatter(y=x_abnomal_errors)])
fig.show()

Apply to test data

In [None]:
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

def get_errors(input):
    """Return the reconstruction error of the autoencoder for each row.

    Runs ``model.predict`` on ``input``, prints the shapes of the input and
    of the reconstruction, and sums the absolute difference along axis 1.
    """
    reconstructed = model.predict(input)
    print("input shape", input.shape)
    print("output shape", reconstructed.shape)
    absolute_diff = np.abs(input - reconstructed)
    return np.sum(absolute_diff, axis=1)

# Anomaly rows of the test frame, cut into windows of the left-hand accX signal.
test_anomaly_rows = df_test[df_test["event"].isin(labels_to_remove)].reset_index()
anomaly_test = create_sequences(test_anomaly_rows["L_accX"])

# From here on x_normal_errors holds the errors of the TEST windows.
x_normal_errors = get_errors(normal_test)
x_anomaly_errors = get_errors(anomaly_test)

# Overlay both error distributions on one axis (anomalies drawn first).
ax = None
for errors, legend_label in (
    (x_anomaly_errors, "Exercise"),
    (x_normal_errors, "NO exercise"),
):
    ax = sns.distplot(errors, ax=ax, bins=20, label=legend_label)
ax.set_xlabel("error")
# plt.xlim([0, 2])
plt.legend()

X_test と df_testはどっちをテストで使う？

In [None]:
# Compare sizes: X_test is df_test without the rows whose event is in labels_to_remove.
X_test.shape, df_test.shape

## Classify based on the error

NO運動をNO運動と分類した割合

In [None]:
# A window counts as "normal" when its error does not exceed the threshold.
threshold = 1.0
n_normal = len(x_normal_errors)
count_normal = np.count_nonzero(x_normal_errors <= threshold)
false_positives = n_normal - count_normal

print(f"threshhold: {threshold}")
print(f"normal length: {n_normal}")
print(f"TN: {count_normal}")
print(f"FP: {false_positives}")
print(f"ratio: {count_normal/n_normal}")

運動ありを運動ありと分類した割合

In [None]:
# A window counts as "anomaly" when its error exceeds the threshold.
n_anomaly = len(x_anomaly_errors)
count_anomaly = np.count_nonzero(x_anomaly_errors > threshold)
false_negatives = n_anomaly - count_anomaly

print(f"threshhold: {threshold}")
print(f"anomaly length: {n_anomaly}")
print(f"TP: {count_anomaly}")
print(f"FN: {false_negatives}")
print(f"raito: {count_anomaly/n_anomaly}")

In [None]:
# Accuracy = correctly classified windows (TN + TP) / all windows.
n_correct = count_normal + count_anomaly
n_windows = len(x_normal_errors) + len(x_anomaly_errors)
print(f"Accuracy: {n_correct / n_windows}")

---

In [None]:
# Number of test rows whose motion label is 1.
(df_test["motion"] == 1).sum()

In [None]:
# Windows over the whole test signal (normal and anomaly rows together).
test = create_sequences(df_test["L_accX"])

import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

def get_errors(input):
    """Return the reconstruction error of the autoencoder for each row.

    Runs ``model.predict`` on ``input``, prints the shapes of the input and
    of the reconstruction, and returns the absolute difference summed along
    axis 1 (one value per row).

    This definition previously returned the un-summed difference matrix,
    which the thresholding code cannot compare element by element in an
    ``if`` statement.
    """
    output = model.predict(input)
    print("input shape", input.shape)
    print("output shape", output.shape)
    sub = np.abs(input - output)
    errors = np.sum(sub, axis=1)
    return errors

# Reconstruction errors of all test windows and their distribution.
test_errors = get_errors(test)
# The label gives plt.legend() below an entry to show; without any labelled
# artist the legend call draws nothing.
ax = sns.distplot(test_errors, bins=20, label="test windows")
ax.set_xlabel("error")
# plt.xlim([0, 2])
plt.legend()

In [None]:
# Count of errors above 0.1; the value is discarded because this is not the
# last expression of the cell.
(test_errors > 0.1).sum()

# NOTE: this rebinds the notebook-wide threshold used by the cells above.
threshold = 0.5

# One error value per window is required. If test_errors still holds the
# un-summed per-sample differences (2-D), reduce it the way get_errors does.
window_errors = np.asarray(test_errors)
if window_errors.ndim > 1:
    window_errors = window_errors.sum(axis=1)

# 0 = error below the threshold (normal), 1 = otherwise (anomaly).
res = np.where(window_errors < threshold, 0, 1).tolist()
res

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() compares one discrete class label per sample. The raw
# windows and their reconstructions are continuous 2-D arrays, which it
# rejects, so both are reduced to a 0/1 label per window first.
pred = model.predict(test)

# Predicted class: 0 when the summed absolute reconstruction error of the
# window is below the threshold, 1 otherwise.
window_errors = np.abs(test - pred).sum(axis=1)
y_pred = np.where(window_errors < threshold, 0, 1)

# True class per window.
# NOTE(review): a window is labelled 1 when at least half of its samples have
# motion == 1 - this rule is an assumption, confirm the intended one.
motion_windows = create_sequences(df_test["motion"])
y_true = np.where(motion_windows.mean(axis=1) >= 0.5, 1, 0)

print(confusion_matrix(y_true, y_pred))

# Free space

In [None]:
# Scratch cell: plot the training anomaly-window errors and show how many
# normal-window errors there are.
plt.plot(x_abnomal_errors)
len(x_normal_errors)

????

In [None]:
# Scratch cell: overlay three reconstructed training windows (rows 0, 10, 11).
for row in (0, 10, 11):
    plt.plot(predict_data[row])
plt.legend([1, 2, 3])
predict_data.shape

In [None]:
# Summary statistics of the training anomaly-window errors.
a= pd.DataFrame(x_abnomal_errors)
a.describe()

In [None]:
# Summary statistics of the errors currently stored in x_normal_errors.
a= pd.DataFrame(x_normal_errors)
a.describe()