In [None]:
import torch
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

class AE(torch.nn.Module):
    """Fully connected autoencoder scored by per-sample reconstruction error.

    The network is symmetric: ``num_feat -> 32 -> 16 -> 8 -> 16 -> 32 -> num_feat``
    with ``LeakyReLU(0.1)`` between the linear layers and on the decoder output.

    Attributes:
        features: Feature names, indexable by position (e.g. ``DataFrame.columns``).
        num_feat: Number of input features (``features.shape[0]``).
        check_vector: Float tensor of length ``num_feat``. After training, an
            entry is 1.0 for a feature that was zero in every training sample
            and 0.0 otherwise. It is all zeros before training.
        losses: Batch losses accumulated over all calls to the training loop.
    """

    def __init__(self, features):
        """Build the encoder/decoder for ``features.shape[0]`` input features."""
        super().__init__()
        self.features = features
        self.num_feat = features.shape[0]
        self.check_vector = torch.zeros(self.num_feat)
        # Per-feature flag: has any training sample held a non-zero value?
        # A boolean mask cannot underflow the way a halving running mean does.
        self._seen_nonzero = torch.zeros(self.num_feat, dtype=torch.bool)
        self.losses = []

        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(self.num_feat, 32),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(32, 16),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(16, 8)
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(8, 16),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(16, 32),
            torch.nn.LeakyReLU(0.1),
            torch.nn.Linear(32, self.num_feat),
            torch.nn.LeakyReLU(0.1))

    def forward(self, x):
        """Encode ``x`` to 8 dimensions and decode it back to ``num_feat``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def train(self, train_loader=True, epochs=10, lr=1e-3, mode=None):
        """Fit the autoencoder, or toggle training mode like ``nn.Module.train``.

        This method shares its name with ``torch.nn.Module.train(mode)``, which
        ``eval()`` calls internally. To keep that API working, a boolean first
        argument (or an explicit ``mode=`` keyword) is forwarded to the base
        class instead of being treated as a data loader.

        Args:
            train_loader: Iterable of batches shaped ``(batch, num_feat)``, or
                a bool to set training/evaluation mode.
            epochs: Number of passes over ``train_loader``.
            lr: Adam learning rate.
            mode: Optional bool; when given, only the module mode is changed.

        Returns:
            ``self.losses`` (one float per batch) after fitting, or ``self``
            when only the module mode was changed.
        """
        if mode is not None:
            return super().train(mode)
        if isinstance(train_loader, bool):
            return super().train(train_loader)

        loss_function = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=1e-8)

        for _ in range(epochs):
            for sample in train_loader:
                reconstructed = self(sample)
                loss = loss_function(reconstructed, sample)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Remember every feature that is non-zero anywhere in the batch.
                batch_nonzero = (sample != 0).reshape(-1, self.num_feat).any(dim=0)
                self._seen_nonzero |= batch_nonzero
                self.losses.append(loss.item())

        # 1.0 where the feature never carried a value during training.
        self.check_vector = (~self._seen_nonzero).to(torch.float32)
        return self.losses

    def check_tensor(self, sample):
        """Check one sample for activity on features never seen in training.

        Args:
            sample: 1-D tensor of length ``num_feat``.

        Returns:
            ``(False, 1)`` when no such feature is active, otherwise
            ``(True, names)`` with the names of the offending features.
        """
        check = torch.mul(self.check_vector, sample)
        # Test element-wise: summing would let opposite signs cancel out.
        flagged = torch.nonzero(check).flatten().tolist()
        if not flagged:
            return False, 1
        return True, [self.features[i] for i in flagged]

    def test(self, test_tensor, mode):
        """Score samples by reconstruction error and optionally label them.

        Args:
            test_tensor: 2-D tensor ``(n_samples, num_feat)`` or a sequence of
                1-D sample tensors.
            mode: If truthy, return only the per-sample losses.

        Returns:
            If ``mode`` is truthy, a NumPy array with the per-sample MSE.
            Otherwise ``(label, list_combine_error)`` where ``label[i]`` is
            2 (a feature never seen in training is active), 1 (loss above
            mean + 3 * std of the losses in this call) or 0, and
            ``list_combine_error`` holds ``[index, feature_names]`` for every
            sample labelled 2.
        """
        if torch.is_tensor(test_tensor):
            batch = test_tensor
        else:
            batch = torch.stack(list(test_tensor))

        # One batched forward pass without autograd bookkeeping.
        with torch.no_grad():
            reconstructed = self(batch)
            losses = torch.mean((reconstructed - batch) ** 2, dim=1).cpu().numpy()

        if mode:
            return losses

        threshold = np.mean(losses) + 3 * np.std(losses)
        label = []
        list_combine_error = []
        for i, loss in enumerate(losses):
            check, combine_error = self.check_tensor(batch[i])
            if check:
                label.append(2)
                list_combine_error.append([i, combine_error])
            elif loss > threshold:
                label.append(1)
            else:
                label.append(0)
        return label, list_combine_error

def principal_combine(shap_value, percent):
    """Pick the features that account for ``percent`` of the total |SHAP| mass.

    Features are ranked by absolute SHAP value (largest first) and taken until
    their cumulative absolute value reaches ``percent`` times the total.

    Args:
        shap_value: 1-D array-like of SHAP values for one sample.
        percent: Fraction of the total absolute SHAP value to cover.

    Returns:
        NumPy array of feature indices, most important first. All indices are
        returned when the target cannot be reached (e.g. ``percent`` > 1), and
        an empty array is returned for empty input.
    """
    magnitude = np.abs(np.asarray(shap_value, dtype=float))
    if magnitude.size == 0:
        return np.array([], dtype=int)

    # Rank on the absolute values of the argument itself, so the ordering
    # matches the cumulative sum below.
    order = np.argsort(-magnitude, kind="stable")
    cumulative = np.cumsum(magnitude[order])
    target = percent * cumulative[-1]

    # First position whose running total reaches the target, as a count.
    count = int(np.searchsorted(cumulative, target, side="left")) + 1
    return order[:min(count, magnitude.size)]

def _read_csv_dir(directory):
    """Read every file in ``directory`` as CSV, in sorted file-name order."""
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the unshuffled train/test split) reproducible.
    return [pd.read_csv(os.path.join(directory, name))
            for name in sorted(os.listdir(directory))]


def _min_max_scale(train_frame, test_frame):
    """Scale both frames with the per-column min/max of ``train_frame``.

    Columns that are constant in the training data are only shifted (their
    range is treated as 1) to avoid a division by zero.

    Returns:
        ``(train_scaled, test_scaled)`` as DataFrames with a fresh RangeIndex
        and the columns of ``train_frame``.
    """
    train_values = train_frame.to_numpy(dtype=float)
    test_values = test_frame.to_numpy(dtype=float)
    low = train_values.min(axis=0)
    span = train_values.max(axis=0) - low
    span[span == 0] = 1.0
    columns = train_frame.columns
    train_scaled = pd.DataFrame((train_values - low) / span, columns=columns)
    test_scaled = pd.DataFrame((test_values - low) / span, columns=columns)
    return train_scaled, test_scaled


def main(data_dirs=("/kaggle/input/data-17-02-2023-60s",
                    "/kaggle/input/test-data-21-02-2023")):
    """Train the autoencoder, flag anomalous rows and explain them with SHAP.

    Steps: load and concatenate the CSV files of every directory in
    ``data_dirs``; hold out the last 2% of rows (no shuffling) as the test
    set; min-max scale with training statistics; train ``AE``; plot training
    and test losses; label the test rows; run a SHAP ``KernelExplainer`` on
    rows labelled 1; plot loss per test row coloured by label.

    Args:
        data_dirs: Directories whose files are all read as CSV.

    Raises:
        FileNotFoundError: If the directories contain no files.
    """
    import shap  # imported first so a missing package fails before training

    frames = [frame for directory in data_dirs for frame in _read_csv_dir(directory)]
    if not frames:
        raise FileNotFoundError("No input files found in: " + ", ".join(data_dirs))
    dataset = pd.concat(frames).fillna(0)
    dataset = dataset.drop(['EVENT_TIME', 'Unnamed: 0'], axis=1)

    # Unshuffled hold-out: the last 2% of rows (rounded up) form the test set.
    n_test = int(np.ceil(0.02 * len(dataset)))
    n_train = len(dataset) - n_test
    features = dataset.columns
    train, test = _min_max_scale(dataset.iloc[:n_train], dataset.iloc[n_train:])
    train_tensor = torch.tensor(train.to_numpy(), dtype=torch.float32)
    test_tensor = torch.tensor(test.to_numpy(), dtype=torch.float32)
    train_loader = torch.utils.data.DataLoader(dataset=train_tensor, batch_size=16, shuffle=True)

    plt.style.use('bmh')

    start = time.time()
    model = AE(features)
    losses_train = model.train(train_loader, 10, 0.001)
    print("Training time: ", time.time() - start, "s")
    # Each plot gets its own figure so the curves do not overlap.
    _, ax_train = plt.subplots()
    ax_train.plot(losses_train)
    ax_train.set_xlabel('Sample')
    ax_train.set_ylabel('Loss training')
    plt.show()

    start = time.time()
    losses_test = model.test(test_tensor, True)
    print("Testing time: ", time.time() - start, "s")
    _, ax_test = plt.subplots()
    ax_test.plot(losses_test)
    ax_test.set_xlabel('Sample')
    ax_test.set_ylabel('Loss testing')
    plt.show()

    start = time.time()
    # Rows labelled 2 carry a value on a feature never seen during training.
    label, list_combine_error = model.test(test_tensor, False)
    for com_err in list_combine_error:
        print("Time error: ", com_err[0], ", Root cause: ", com_err[1])

    def reconstruction_loss(rows):
        """Adapter for SHAP: array of rows -> per-row reconstruction loss."""
        return model.test(torch.as_tensor(np.asarray(rows), dtype=torch.float32), True)

    shap.initjs()
    # NOTE(review): the whole training frame is used as SHAP background data,
    # which can be very slow; consider a subsample if runtime is a problem.
    explainer = shap.KernelExplainer(reconstruction_loss, train)

    # (0-based test row index, SHAP values) for every row labelled 1.
    explained = []
    for i, row_label in enumerate(label):
        if row_label == 1:
            explained.append((i, explainer.shap_values(test.iloc[i, :], nsamples=500)))

    for row, values in explained:
        feat = [features[j] for j in principal_combine(values, 0.95)]
        print("Time error: ", row, ", Root cause: ", feat)

    if explained:
        # Show the force plot for the first explained row, using that row's
        # own feature values; matplotlib=True renders it from inside a function.
        row, values = explained[0]
        shap.force_plot(explainer.expected_value, values, test.iloc[row, :], matplotlib=True)
    print("Detect root cause time: ", time.time() - start, "s")

    # Loss per test row, coloured by label.
    y = pd.DataFrame({'stt': range(len(label)), 'loss': losses_test, 'label': label})
    _, ax_label = plt.subplots()
    for value in np.unique(np.array(label)):
        rows = y[y['label'] == value]
        ax_label.scatter(rows['stt'], rows['loss'], label=value)
    ax_label.set_xlabel('Sample')
    ax_label.set_ylabel('Loss')
    ax_label.legend()
    plt.show()
    
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

https://www.geeksforgeeks.org/implementing-an-autoencoder-in-pytorch/
https://discuss.pytorch.org/t/autoencoder-testing-encoder-output/29988
https://www.vielina.com/Uploads/Articles/Docs/downloads/nguyenthithanhnga/Tom%20tat%20LA..pdf

https://visualstudiomagazine.com/Articles/2021/04/13/Autoencoder-Anomaly-Detection.aspx?Page=2
https://learn.microsoft.com/en-us/windows/ai/windows-ml/what-is-a-machine-learning-model
https://nttuan8.com/bai-1-tensor/

In [None]:
# import seaborn as sns
# # Draw the swarm plot (note: the code below actually draws a histogram)
# plt.figure(figsize=(16, 8))
# sns.histplot(data = losses, bins=1000)
# plt.xlabel('Scale', fontsize=16)
# plt.ylabel('cm', fontsize=16)
# plt.title("Histogram of Sepal.Width", fontsize=18)

https://phamdinhkhanh.github.io/deepai-book/ch_appendix/appendix_matplotlib.html