In [None]:
import numpy as np
import pandas as pd
import os
from os import path
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.cluster import KMeans
from pyentrp import entropy as ent
import sklearn.metrics

In [None]:
# Root of the training workspace. The default is the original hard-coded
# location; set the GAS_ANOMALY_PROJECT_ROOT environment variable to run the
# notebook from a different checkout without editing this cell.
project_root = os.environ.get(
    'GAS_ANOMALY_PROJECT_ROOT',
    '/home/project/gas_anormaly_detection/restaurant/1training/',
)

In [None]:
# Input dataset directory and the directory where this stage reads/writes
# its intermediate .npy files.
data_root = path.join(project_root, 'Dataset')
save_data_root = path.join(project_root, '2pattern_extraction/save_data')
if not path.exists(save_data_root):
    # makedirs also creates the intermediate '2pattern_extraction' directory,
    # which os.mkdir would refuse to do (FileNotFoundError).
    os.makedirs(save_data_root, exist_ok=True)
    print("make a new dir.")

In [None]:
# Load the input samples from 'norm_sample_intra.npy' in save_data_root.
# NOTE(review): later cells take each row and slice it into 7 windows of 24
# values, so rows presumably hold at least 168 readings — confirm.
train_samples = np.load(path.join(save_data_root, 'norm_sample_intra.npy'))

In [None]:
# max normalization: divide every sample (row) by its own maximum value.
row_max = train_samples.max(axis=1, keepdims=True)
# A row whose maximum is 0 would divide by zero and inject NaN/inf into the
# features used for clustering; dividing by 1 instead leaves it unchanged.
safe_row_max = np.where(row_max == 0, 1, row_max)
train_samples_normlization = train_samples / safe_row_max
    

## Transform to daily entropy

In [None]:
# Build one feature vector per sample: the normalized permutation entropy
# (order 3, delay 1) of each of its 7 consecutive 24-value windows.
entropy_train_samples = np.array([
    np.array([
        ent.permutation_entropy(
            sample_row[day * 24:day * 24 + 24],
            order=3,
            delay=1,
            normalize=True,
        )
        for day in range(7)
    ])
    for sample_row in train_samples_normlization
])


## KMeans clustering

In [None]:
# Silhouette score of a KMeans clustering for every candidate K in [5, 20).
k_candidates = range(5, 20)
sc = []
for k in k_candidates:
    labels_k = KMeans(n_clusters=k, random_state=0).fit(entropy_train_samples).labels_
    sc.append(
        sklearn.metrics.silhouette_score(entropy_train_samples, labels_k, metric='euclidean')
    )

# Plot score against K to support the choice of cluster count.
plt.rcParams['figure.figsize'] = (24.0, 8.0)
plt.plot(k_candidates, sc, marker="o")
plt.xlabel("K")
plt.ylabel("sc")
plt.xlim([4, 20])
plt.show()

In [None]:
# Number of clusters for the final model. Later cells iterate over
# range(clusters_num) to visit every cluster id, so it must match the model.
clusters_num = 10
# NOTE(review): despite "ed_dtw" in the name, this is a plain KMeans estimator
# fitted on the entropy feature vectors.
estimator_ed_dtw = KMeans(n_clusters=clusters_num,random_state=0)
estimator_ed_dtw.fit(entropy_train_samples)

In [None]:
label_pred = estimator_ed_dtw.labels_
cluster_centers = estimator_ed_dtw.cluster_centers_

# Per-cluster bookkeeping, keyed by cluster label:
#   cluster_sample_index    -> indices of the samples assigned to the cluster
#   intra_cluster_distances -> each member's Euclidean distance to the centroid
#   cluster_distance_mean   -> running SUM of those distances (averaged later)
#   cluster_sample_num      -> number of members
cluster_sample_index = {}
intra_cluster_distances = {}
cluster_distance_mean = {}
cluster_sample_num = {}

for idx, (lab, vec) in enumerate(zip(label_pred, entropy_train_samples)):
    dist = np.linalg.norm(vec - cluster_centers[lab])

    # First member seen for this cluster: create its entries.
    if lab not in cluster_sample_num:
        cluster_sample_num[lab] = 1
        intra_cluster_distances[lab] = [dist]
        cluster_sample_index[lab] = [idx]
        cluster_distance_mean[lab] = dist
        continue

    # Cluster already known: update its entries in place.
    cluster_sample_num[lab] += 1
    intra_cluster_distances[lab].append(dist)
    cluster_sample_index[lab].append(idx)
    cluster_distance_mean[lab] = cluster_distance_mean[lab] + dist


        

In [None]:
# Convert the per-cluster distance sums into mean distances.
# The mean is recomputed from the stored per-sample distances rather than by
# dividing the current value in place, so re-running this cell cannot divide
# an already averaged value a second time.
for key, distances in intra_cluster_distances.items():
    cluster_distance_mean[key] = sum(distances) / cluster_sample_num[key]

In [None]:
# Show (cluster label, mean distance to centroid) pairs, largest mean first.
print(sorted(cluster_distance_mean.items(), key=lambda item:item[1], reverse=True))

### cluster-level consistency

In [None]:
# Mean centroid distance of every cluster, ordered by cluster id 0..clusters_num-1.
cluster_distance_mean_np = [cluster_distance_mean[cluster_id] for cluster_id in range(clusters_num)]
    

In [None]:
# A cluster is "bad" when its mean centroid distance exceeds the mean of all
# OTHER clusters' means by more than two of their standard deviations.
bad_clusters = []
for cluster_id in range(clusters_num):
    other_means = np.delete(cluster_distance_mean_np, cluster_id)
    threshold = np.mean(other_means) + 2 * np.std(other_means)
    if cluster_distance_mean_np[cluster_id] > threshold:
        bad_clusters.append(cluster_id)
    
    

In [None]:
# Every sample that belongs to a bad cluster is flagged as an anomaly.
anomaly_index = [
    sample_idx
    for bad_label in bad_clusters
    for sample_idx in cluster_sample_index[bad_label]
]

### instance-level consistency

In [None]:
# Within every remaining (non-bad) cluster, flag the samples whose distance to
# the centroid exceeds that cluster's mean distance by more than two standard
# deviations.
for label in range(clusters_num):
    if label in bad_clusters:
        continue
    sample_index = np.array(cluster_sample_index[label])
    cluster_distances = np.array(intra_cluster_distances[label])
    threshold = np.mean(cluster_distances) + 2 * np.std(cluster_distances)
    anomaly_index.extend(sample_index[cluster_distances > threshold])

# This cell appends to a list built by an earlier cell, so running it twice
# would add the same indices again and duplicate rows in the saved anomaly
# set. De-duplicate while keeping first-seen order.
anomaly_index = list(dict.fromkeys(anomaly_index))
print(len(anomaly_index))

In [None]:
# Split the original (un-normalized) samples into flagged anomalies and the
# remaining samples.
abnorm_sample_np = train_samples[anomaly_index]

# Membership is tested against a set: scanning the anomaly list for every
# sample index would make this step quadratic.
anomaly_lookup = set(anomaly_index)
nomal_index = [val for val in range(len(train_samples)) if val not in anomaly_lookup]
train_sample_np = train_samples[nomal_index]

print(abnorm_sample_np.shape)
print(train_sample_np.shape)


In [None]:
# Persist both subsets next to the input file in save_data_root.
for out_name, out_array in (
    ('abnorm_sample_inter.npy', abnorm_sample_np),
    ('train_sample_inter.npy', train_sample_np),
):
    np.save(path.join(save_data_root, out_name), out_array)