In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, plot_confusion_matrix
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
from dagmm_INSE_6180 import DAGMM

In [2]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
#     if tight_layout:
#         plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution, bbox_inches='tight')

# Experiement 1

In [3]:
load_path = './datasets/chiller/df_dataset/'

def load_data(load_path, filename):
    csv_load_path = os.path.join(load_path, filename)
    return pd.read_csv(csv_load_path)

data_1 = load_data(load_path, 'chiller10.csv')
data_11, data_12 = train_test_split(data_1, test_size=0.2, random_state=0)
X_12, y_12  = data_12.iloc[:, :-1], data_12.iloc[:, -1] 
columns = X_12.columns
# choices = np.random.choice(X_12.size, 2*X_12.shape[0], replace=False)
# X_12_flatted = X_12.to_numpy().ravel()
# X_12_flatted[choices] = 500
choices = np.random.choice(X_12.size, 2*X_12.shape[0], replace=False)
X_12_flatted = X_12.to_numpy().ravel()
X_12_flatted[choices] = 500
X_12 = pd.DataFrame(X_12_flatted.reshape(X_12.shape))
X_12.columns = columns
data_12 = X_12.copy()
data_12['label'] = np.asarray(y_12, dtype=int)
data = data_11.append(data_12)
data = data.drop('Unnamed: 0', axis='columns')
data_trn, data_tst = train_test_split(data, test_size=0.5, random_state=0)


In [5]:
## Dataset 1 - Origninal data

X_trn, y_trn = data_trn.iloc[:, :-1], data_trn.iloc[:, -1]
X_tst, y_tst = data_tst.iloc[:, :-1], data_tst.iloc[:, -1]
# Scaling data
scaler_d1 = StandardScaler().fit(X_trn)
X_trn_scaled, X_tst_scaled = scaler_d1.transform(X_trn), scaler_d1.transform(X_tst)

In [6]:
# Dataset 2 - Cleaning data using multiclass DAGMM
model_dagmms = []
data_rate = 80
data_trn_clean_d2 = pd.DataFrame()
energy_thresholds = []

# Offline phase
n = len(data_trn['label'].unique())
for i in range(n):
    model_dagmm = DAGMM(comp_hiddens=[50, 30, 20, 10], comp_activation="elu",
                      est_hiddens=[15, 3], est_activation="elu", est_dropout_ratio=0.2,
                      n_epochs=5, batch_size=128, normalize=True)
    data_trn_c, X_trn_c = data_trn[y_trn==i], X_trn[y_trn==i]
    model_dagmm.build(X_trn_c)
    model_dagmm.fit(X_trn_c)
    energy = model_dagmm.predict(X_trn_c)
    energy_threshold = np.percentile(energy, data_rate)
    data_trn_clean_d2 = data_trn_clean_d2.append(data_trn_c[energy <= energy_threshold])
    model_dagmms.append(model_dagmm)
    energy_thresholds.append(energy_threshold)

X_trn_clean_d2, y_trn_clean_d2 = data_trn_clean_d2.iloc[:, :-1], data_trn_clean_d2.iloc[:, -1]

# Online phase
energies = []
for i in range(len(model_dagmms)):
    energy = model_dagmms[i].predict(X_tst)
    energies.append(energy)

energies_np = np.asanyarray(energies)

energy_thresholds_np = np.asarray(energy_thresholds).reshape(-1, 1)
votes = np.where(energies_np > energy_thresholds_np, 1, 0).T
votes_sum = np.sum(votes, axis=1)
idx_outlier = np.where(votes_sum >= 5)
idx = np.where(votes_sum <= 5) # normal data index
data_tst_clean_d2 = data_tst.iloc[idx]
X_tst_clean_d2, y_tst_clean_d2 = data_tst_clean_d2.iloc[:, :-1], data_tst_clean_d2.iloc[:, -1]

# Scaling data
scaler_d2 = StandardScaler().fit(X_trn_clean_d2)
X_trn_clean_scaled_d2, X_tst_clean_scaled_d2 = scaler_d2.transform(X_trn_clean_d2), scaler_d2.transform(X_tst_clean_d2)



Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5
Epoch 1/5




Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 3
Epoch 5/5
Best Epoch: 5
Epoch 1/5




Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 3
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 4
Epoch 1/5




Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 4
Epoch 1/5
 128/2635 [=>............................] - mean_loss: 222.8342



Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 4


In [13]:
# # Dataset 1 - Original dataset
# X_trn_, X_tst_ = X_trn, X_tst
# X_trn_scaled_, X_tst_scaled_ = X_trn_scaled, X_tst_scaled
# y_trn_, y_tst_ = y_trn, y_tst

# Dataset 2 - Cleaning data using multiclass DAGMM
X_trn_, X_tst_ = X_trn_clean_d2, X_tst_clean_d2
X_trn_scaled_, X_tst_scaled_ = X_trn_clean_scaled_d2, X_tst_clean_scaled_d2
y_trn_, y_tst_ = y_trn_clean_d2, y_tst_clean_d2

X_trn_.shape, X_tst_.shape

((16610, 65), (16015, 65))

In [14]:
## 1. KNN MODEL

print("KNN MODEL:")

k = [i for i in range(2,10)]
p = [j for j in range(1,3)]
param_grid = [{'n_neighbors': k, 'p': p}]
knn_grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)

knn_grid_search.fit(X_trn_scaled_, y_trn_)
knn_clf = knn_grid_search.best_estimator_
print(knn_clf)

print("Train Accuracy:", 100*knn_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*knn_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = knn_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


## 2. SVM MODEL

print("SVM MODEL:")

C = [2**i for i in range(0, 4)]
gamma = [2**j for j in range(-2,2)]
param_grid = [{'C': C, 'gamma': gamma}]
svm_grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)
svm_grid_search.fit(X_trn_scaled_, y_trn_)
svm_clf = svm_grid_search.best_estimator_
print(svm_clf)

print("Train Accuracy:", 100*svm_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*svm_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = svm_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")

## 3. RANDOM FOREST MODEL

print("RANDOM FOREST MODEL:")

n_estimators_ = [int(x) for x in np.linspace(100, 550, 10)]

param_grid = {'n_estimators':n_estimators_}
rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=3,
                              scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                              verbose=1)
rf_grid_search.fit(X_trn_, y_trn_)

rf_clf = rf_grid_search.best_estimator_

print(rf_clf)

print("Train Accuracy:", 100*rf_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*rf_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = rf_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 4. LOGISTIC REGRESSION MODEL

# C= np.logspace(-4,4,9)
# solver=['newton-cg']

# param_grid = [{'solver': solver, 'C': C}]
# lr_grid_search = GridSearchCV(LogisticRegression(max_iter=100),
#                            param_grid=param_grid, cv=3, scoring='f1_weighted',
#                            n_jobs=-1, return_train_score=True, verbose=1)
# lr_grid_search.fit(X_trn_scaled_, y_trn_)
# lr_clf = lr_grid_search.best_estimator_
# print(lr_clf)

# print("Train Accuracy:", 100*lr_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*lr_clf.score(X_tst_scaled_, y_tst_), chr(37))

## 5. ADABOOST MODEL

print("ADABOOST MODEL:")

param_grid = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.001, 0.01, 0.1]
}
ad_grid_search = GridSearchCV(AdaBoostClassifier(), param_grid = param_grid,
                                cv=3, scoring='f1_weighted', n_jobs=-1, return_train_score=True, verbose=1
                                )
ad_grid_search.fit(X_trn_, y_trn_)

ad_clf = ad_grid_search.best_estimator_

print(ad_clf)

print("Train Accuracy:", 100*ad_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*ad_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = ad_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 6. GAUSIAN NAIVE BAYSESSIAN

# params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
# gnb_grid_search = GridSearchCV(GaussianNB(), param_grid=params_NB, cv=3, scoring='accuracy',return_train_score=True)
# gnb_grid_search.fit(X_trn_scaled_, y_trn_)

# gnb_clf = gnb_grid_search.best_estimator_

# print(gnb_clf)

# print("Train Accuracy:", 100*gnb_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*gnb_clf.score(X_tst_scaled_, y_tst_), chr(37))


KNN MODEL:
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   46.5s finished


KNeighborsClassifier(n_neighbors=3, p=1)
Train Accuracy: 99.42805538832029 %
Test Accuracy: 97.70839837652201 %
Confusion matrix: 
 [[2054    7    2   26    5    6    0    0]
 [  25 1769    0    3    3    0    0    0]
 [  22    0 2079    0    0    0    0    0]
 [  44    4    0 2003   35   10    0    0]
 [   8    0    0   49 1917   23    0    0]
 [  13    0    0   29   13 1909    0    0]
 [   9    0    0    3    3    4 1853    1]
 [   7    0    0    2    4    0    7 2064]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9413    0.9781    0.9594      2100
           1     0.9938    0.9828    0.9883      1800
           2     0.9990    0.9895    0.9943      2101
           3     0.9470    0.9556    0.9513      2096
           4     0.9682    0.9599    0.9640      1997
           5     0.9780    0.9720    0.9750      1964
           6     0.9962    0.9893    0.9928      1873
           7     0.9995    0.9904    0.9949      2084

    accura

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.4min finished


SVC(C=8, gamma=0.25)
Train Accuracy: 99.99397953040338 %
Test Accuracy: 98.4264751795192 %
Confusion matrix: 
 [[2057    0    0   23   15    5    0    0]
 [  11 1767    0    0   22    0    0    0]
 [   0    0 2095    0    6    0    0    0]
 [  23    0    0 2021   42    9    1    0]
 [   6    0    0   18 1967    6    0    0]
 [   3    0    0    6   13 1942    0    0]
 [   1    0    0    0   13    0 1855    4]
 [   2    0    0    0   22    0    1 2059]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9781    0.9795    0.9788      2100
           1     1.0000    0.9817    0.9907      1800
           2     1.0000    0.9971    0.9986      2101
           3     0.9773    0.9642    0.9707      2096
           4     0.9367    0.9850    0.9602      1997
           5     0.9898    0.9888    0.9893      1964
           6     0.9989    0.9904    0.9946      1873
           7     0.9981    0.9880    0.9930      2084

    accuracy                   

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   35.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   35.5s finished


RandomForestClassifier(n_estimators=350)
Train Accuracy: 100.0 %
Test Accuracy: 99.87511707773962 %
Confusion matrix: 
 [[2097    0    0    3    0    0    0    0]
 [   0 1800    0    0    0    0    0    0]
 [   0    0 2101    0    0    0    0    0]
 [  12    0    0 2081    0    2    1    0]
 [   0    0    0    0 1997    0    0    0]
 [   0    0    0    0    0 1964    0    0]
 [   0    0    0    0    0    0 1873    0]
 [   0    0    0    0    2    0    0 2082]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9943    0.9986    0.9964      2100
           1     1.0000    1.0000    1.0000      1800
           2     1.0000    1.0000    1.0000      2101
           3     0.9986    0.9928    0.9957      2096
           4     0.9990    1.0000    0.9995      1997
           5     0.9990    1.0000    0.9995      1964
           6     0.9995    1.0000    0.9997      1873
           7     1.0000    0.9990    0.9995      2084

    accuracy          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:   34.3s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   38.8s finished


AdaBoostClassifier(learning_rate=0.1, n_estimators=300)
Train Accuracy: 67.76038531005419 %
Test Accuracy: 67.16203559163284 %
Confusion matrix: 
 [[ 321    0    0 1617  129   33    0    0]
 [   0 1800    0    0    0    0    0    0]
 [   0    0 2101    0    0    0    0    0]
 [  67    0    0  909 1030   90    0    0]
 [   0    0    0  104 1893    0    0    0]
 [ 237    0    0  559 1148   20    0    0]
 [   2    0    0    1  166    0 1690   14]
 [   0    0    0    0   62    0    0 2022]]
Classification report: 
               precision    recall  f1-score   support

           0     0.5120    0.1529    0.2354      2100
           1     1.0000    1.0000    1.0000      1800
           2     1.0000    1.0000    1.0000      2101
           3     0.2850    0.4337    0.3439      2096
           4     0.4275    0.9479    0.5893      1997
           5     0.1399    0.0102    0.0190      1964
           6     1.0000    0.9023    0.9486      1873
           7     0.9931    0.9702    0.9816      2