In [1]:
import os
import pathlib
import numpy as np
import pandas as pd

In [12]:
# Specify the datatype that we saved in our binary files
dt = np.dtype([('x', np.double), ('y', np.double), ('z', np.double), ('timestamp', np.ulonglong)])

In [13]:
data_path = pathlib.Path(os.getcwd()).joinpath(r"./AccelerometerData")

In [14]:
# Get all file in the directory (it is important to note that the filename you have used will be the class names from now on)
_, _, filenames = next(os.walk(data_path))

In [16]:
# This function convert every item in the numpy array to a list instead of a tuple
def convert(item):
    return np.asarray(item)
# Iterate through the file and add a column to them corresponding to their class names
data = pd.DataFrame()
class_names = []
for file in filenames:
    print(file)
    filepath = data_path.joinpath(file)
    class_name = file.split('.')[0]
    class_names.append(class_name)
    class_data = np.fromfile(filepath, dtype=dt)
    # Fromfile as this weird way of loading data in so we need to convert
    # everything back to a list and reconvert to numpy array with the corresponding data type.
    class_dataframe = pd.DataFrame(class_data)
    class_dataframe.insert(0, "Mode", class_name)
    data = data.append(class_dataframe, ignore_index=True)
print(data)
print(class_names)

Error_1.bin
Error_2.bin
Normal.bin
Off.bin
            Mode       x       y       z      timestamp
0        Error_1  0.0546 -1.0842 -0.1560  1619640596650
1        Error_1 -0.0468 -1.0374 -0.0312  1619640596652
2        Error_1 -0.1326 -1.0452  0.0000  1619640596654
3        Error_1 -0.1326 -1.0062  0.0312  1619640596656
4        Error_1 -0.0624 -0.9282  0.0468  1619640596658
...          ...     ...     ...     ...            ...
1305814      Off  0.0234 -0.9984  0.0000  1619980693003
1305815      Off  0.0156 -1.0062 -0.0702  1619980693005
1305816      Off  0.0078 -0.9906 -0.0234  1619980693007
1305817      Off  0.0234 -0.9984 -0.0546  1619980693009
1305818      Off  0.0234 -0.9984 -0.0546  1619980693011

[1305819 rows x 5 columns]
['Error_1', 'Error_2', 'Normal', 'Off']


In [7]:
# Now that we have our dataframe, we can start augmenting it and patching the missing data
# TODO

In [9]:
# Now that our data is all well in a DataFrame we can start creating our X and y arrays
# We want to create a 25x3 image (the 3 is simply x, y, z data of each sample)
# from our input data so that we can give it to a CNN model
# These model are made to work really well on images. Our data might not seem like an image but trust me for a minute
# and you'll understand why we do it this way later.

# We'll begin by creating some helful functions

# Here are some library we will need to perform some of the modification
from scipy import stats
from tqdm import tqdm
# This function gives us the window of data to put in each of our samples
# each time you call it the window steps forward (by N step) through our data but it's width remains the same
def window_gen(data, window_size, step=1):
    start = 0
    while start < data.count():
        yield start, start + window_size
        start += step
        
# This function splits our data into 25x3 chunks and also create an array with the corresponding label
def split_data(data, window_size=10, step=10):
    segments = np.empty((0,window_size,3))
    labels= np.empty((0))
    for (start, end) in tqdm(window_gen(data['timestamp'],window_size, step)):
        x = data['x'][start:end]
        y = data['y'][start:end]
        z = data['z'][start:end]
        # check to make sure that if the window size is not a multiple of our data, 
        # we simply discard the end (because it won't have the right lenght)
        #if(len(data['timestamp'][start:end]) == window_size):
        try:
            segments = np.vstack([segments,np.dstack([x,y,z])])
            labels = np.append(labels,stats.mode(data['Mode'][start:end])[0][0])
        except:
            pass
    return segments, labels

In [10]:
X, y = split_data(data, window_size = 25, step=20)

65291it [20:32, 52.98it/s] 


In [9]:
# We also need to one hot encode our output labels for the training
y_categorical = np.asarray(pd.get_dummies(y), dtype=np.int8)
print(y_categorical)

[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 ...
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]]


In [10]:
# Save these arrays to avoid having to recompute this lengthy step
with open("training_data.npz", "wb") as output_file:
    np.savez(output_file, X=X, y=y, y_categorical=y_categorical)

In [2]:
# To reload the data from the file run these line:
X = np.array([])
y = np.array([])
y_categorical = np.array([])
with open("training_data.npz", "rb") as input_file:
    npzdata = np.load(input_file)
    X = npzdata['X']
    y = npzdata['y']
    y_categorical = npzdata['y_categorical']

In [3]:
# As an additionnal step we need to reshape our data to add an extra dimension for tensorflow (i.e the number of channel)
X_reshaped = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)

In [4]:
# Now that we have our data seperated in a data array and a label array we can make a training set and a validation set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_categorical, test_size=0.3)

In [5]:
# Some import required for Tensorflow2
from tensorflow import keras
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.models import Sequential

# Let's now define a function that will create our model (not train it, only create it)
def create_model(input_shape, nb_classes):
    model = Sequential()
    model.add(layers.Conv2D(4, (3,3), input_shape=(*input_shape, 1), padding='same', activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 1),padding='same'))
    model.add(layers.Conv2D(8, (5,3), input_shape=(*input_shape, 1), padding='same', activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 1),padding='same'))
    model.add(layers.Flatten())
    model.add(layers.Dense(25, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(nb_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
    

In [6]:
model = create_model((25, 3), len(np.unique(y)))

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 25, 3, 4)          40        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 3, 4)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 13, 3, 8)          488       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 7, 3, 8)           0         
_________________________________________________________________
flatten (Flatten)            (None, 168)               0         
_________________________________________________________________
dense (Dense)                (None, 25)                4225      
_________________________________________________________________
dropout (Dropout)            (None, 25)                0

In [8]:
model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29b5b5b5c40>

In [9]:
y_predict = model.predict(X_test)

In [18]:
print(y_predict)

[[6.5145514e-08 3.4300822e-33 9.9999988e-01 2.0309684e-27]
 [4.2951089e-17 0.0000000e+00 1.5053535e-03 9.9849463e-01]
 [1.0000000e+00 1.6129544e-11 9.3287794e-16 0.0000000e+00]
 ...
 [4.7285201e-14 1.0000000e+00 5.8667609e-37 0.0000000e+00]
 [8.7341050e-09 2.0901463e-34 1.0000000e+00 4.9927849e-23]
 [3.5331534e-17 0.0000000e+00 1.4865758e-03 9.9851340e-01]]


In [21]:
# Convert prediction to class label and compare them to their expected values by creating a confusion matrix
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit(class_names)
# Convert our prediction and the expected y_test back to class namespace
y_predict_class = lb.inverse_transform(y_predict)
y_test_class = lb.inverse_transform(y_test)
print(predict_class)
print(y_test_class)

['Normal' 'Off' 'Error_1' ... 'Error_2' 'Normal' 'Off']
['Normal' 'Off' 'Error_1' ... 'Error_2' 'Normal' 'Off']


In [23]:
# Now print the confusion matrix using the predicted class and the actual class contained in y_test
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_class, y_predict_class))

[[4884    1    0    0]
 [   0 4857    0    0]
 [   1    0 4849    2]
 [   0    0    0 4993]]


In [25]:
# That's pretty good :) Let's now save our model and get converting (we need to convert it to c to run it on the STM32)
model.save("anomaly_detection.h5")