In [1]:
# Import common libraries

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, plot_confusion_matrix
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
from dagmm_INSE_6180 import DAGMM

In [2]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
#     if tight_layout:
#         plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution, bbox_inches='tight')

# Experiement 1

In [3]:
load_path = './datasets/chiller/df_dataset/' # path to the dataset

def load_data(load_path, filename):
    csv_load_path = os.path.join(load_path, filename)
    return pd.read_csv(csv_load_path)

data_1 = load_data(load_path, 'chiller10.csv') # chiller data of severity level 1

# split data so that 20% data will be used to create outliers
data_11, data_12 = train_test_split(data_1, test_size=0.2, random_state=0) 
X_12, y_12  = data_12.iloc[:, :-1], data_12.iloc[:, -1] 
columns = X_12.columns

# generate artificial outliers
choices = np.random.choice(X_12.size, 2*X_12.shape[0], replace=False)
X_12_flatted = X_12.to_numpy().ravel()
X_12_flatted[choices] = 500
X_12 = pd.DataFrame(X_12_flatted.reshape(X_12.shape))
X_12.columns = columns
data_12 = X_12.copy()
data_12['label'] = np.asarray(y_12, dtype=int)
data = data_11.append(data_12)
data = data.drop('Unnamed: 0', axis='columns')
data_trn, data_tst = train_test_split(data, test_size=0.5, random_state=0) # Final training data and test data for experiments


In [4]:
## Dataset 1 - Origninal data (Using data with outliers)

X_trn, y_trn = data_trn.iloc[:, :-1], data_trn.iloc[:, -1]
X_tst, y_tst = data_tst.iloc[:, :-1], data_tst.iloc[:, -1]
# Scaling data
scaler_d1 = StandardScaler().fit(X_trn)
X_trn_scaled, X_tst_scaled = scaler_d1.transform(X_trn), scaler_d1.transform(X_tst)

In [5]:
# Dataset 2 - Cleaning data using S-DAGMM (Detecting and removing outliers)
model_dagmms = []
data_rate = 80 # consider 20% data is outliers
data_trn_clean_d2 = pd.DataFrame()
energy_thresholds = []

# Offline phase
n = len(data_trn['label'].unique()) 

# create and train 8 individual DAGMMs according to 8 different data labels (normal and seven different fault types)
for i in range(n): 
    model_dagmm = DAGMM(comp_hiddens=[50, 30, 20, 10], comp_activation="elu",
                      est_hiddens=[15, 3], est_activation="elu", est_dropout_ratio=0.2,
                      n_epochs=5, batch_size=32, normalize=True)
    data_trn_c, X_trn_c = data_trn[y_trn==i], X_trn[y_trn==i] # select the data according to the label
    model_dagmm.build(X_trn_c)
    model_dagmm.fit(X_trn_c)
    energy = model_dagmm.predict(X_trn_c) # calculate the energy value
    energy_threshold = np.percentile(energy, data_rate) # get the energy threshold to seperate inliers and outliers
    data_trn_clean_d2 = data_trn_clean_d2.append(data_trn_c[energy <= energy_threshold]) # filter outliers
    model_dagmms.append(model_dagmm) # save DAGMM models for online phase
    energy_thresholds.append(energy_threshold) # save energy threshold for online phase

X_trn_clean_d2, y_trn_clean_d2 = data_trn_clean_d2.iloc[:, :-1], data_trn_clean_d2.iloc[:, -1]

# Online phase
energies = []

# Using trained DAGMMs to calculate the energy of testing samples

for i in range(len(model_dagmms)):
    energy = model_dagmms[i].predict(X_tst)
    energies.append(energy)

energies_np = np.asanyarray(energies)

energy_thresholds_np = np.asarray(energy_thresholds).reshape(-1, 1) # Using energy_thresholds calculated in the offline phase
votes = np.where(energies_np > energy_thresholds_np, 1, 0).T
votes_sum = np.sum(votes, axis=1)

# Using voting scheme to determine outliers (If five in eight individual DAGMMs determine
# a sample as a outlier, S-DAGMM considers that sample as a outlier)
idx_outlier = np.where(votes_sum >= 5) 
idx = np.where(votes_sum <= 5) # inlier data index
data_tst_clean_d2 = data_tst.iloc[idx]
X_tst_clean_d2, y_tst_clean_d2 = data_tst_clean_d2.iloc[:, :-1], data_tst_clean_d2.iloc[:, -1]

# Scaling data (some base classifiers need scaled data for training)
scaler_d2 = StandardScaler().fit(X_trn_clean_d2)
X_trn_clean_scaled_d2, X_tst_clean_scaled_d2 = scaler_d2.transform(X_trn_clean_d2), scaler_d2.transform(X_tst_clean_d2)



Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 2
Epoch 4/5
Best Epoch: 2
Epoch 5/5
Best Epoch: 5
Epoch 1/5
  32/2568 [>.............................] - mean_loss: 215.5642



Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 4




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 2
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 3
Epoch 5/5
Best Epoch: 5




Epoch 1/5
Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5
Epoch 1/5




Best Epoch: 1
Epoch 2/5
Best Epoch: 1
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 5
Epoch 1/5
  32/2591 [>.............................] - mean_loss: 379.3294



Best Epoch: 1
Epoch 2/5
Best Epoch: 2
Epoch 3/5
Best Epoch: 3
Epoch 4/5
Best Epoch: 4
Epoch 5/5
Best Epoch: 4


In [6]:
X_trn_c.shape

(2591, 65)

In [7]:
# Dataset 1 - Original dataset (Using outlier-contaminated data for training and testing classifiers)
X_trn_, X_tst_ = X_trn, X_tst
X_trn_scaled_, X_tst_scaled_ = X_trn_scaled, X_tst_scaled
y_trn_, y_tst_ = y_trn, y_tst

X_trn_.shape, X_tst_.shape

((20764, 65), (20764, 65))

In [8]:
## 1. KNN MODEL

print("KNN MODEL:")

k = [i for i in range(2,10)]
p = [j for j in range(1,3)]
param_grid = [{'n_neighbors': k, 'p': p}]
knn_grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)

knn_grid_search.fit(X_trn_scaled_, y_trn_)
knn_clf = knn_grid_search.best_estimator_
print(knn_clf)

print("Train Accuracy:", 100*knn_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*knn_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = knn_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


## 2. SVM MODEL

print("SVM MODEL:")

C = [2**i for i in range(0, 4)]
gamma = [2**j for j in range(-2,2)]
param_grid = [{'C': C, 'gamma': gamma}]
svm_grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)
svm_grid_search.fit(X_trn_scaled_, y_trn_)
svm_clf = svm_grid_search.best_estimator_
print(svm_clf)

print("Train Accuracy:", 100*svm_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*svm_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = svm_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")

## 3. RANDOM FOREST MODEL

print("RANDOM FOREST MODEL:")

n_estimators_ = [int(x) for x in np.linspace(100, 550, 10)]

param_grid = {'n_estimators':n_estimators_}
rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=3,
                              scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                              verbose=1)
rf_grid_search.fit(X_trn_, y_trn_)

rf_clf = rf_grid_search.best_estimator_

print(rf_clf)

print("Train Accuracy:", 100*rf_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*rf_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = rf_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 4. LOGISTIC REGRESSION MODEL

# C= np.logspace(-4,4,9)
# solver=['newton-cg']

# param_grid = [{'solver': solver, 'C': C}]
# lr_grid_search = GridSearchCV(LogisticRegression(max_iter=100),
#                            param_grid=param_grid, cv=3, scoring='f1_weighted',
#                            n_jobs=-1, return_train_score=True, verbose=1)
# lr_grid_search.fit(X_trn_scaled_, y_trn_)
# lr_clf = lr_grid_search.best_estimator_
# print(lr_clf)

# print("Train Accuracy:", 100*lr_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*lr_clf.score(X_tst_scaled_, y_tst_), chr(37))

## 5. ADABOOST MODEL

print("ADABOOST MODEL:")

param_grid = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.001, 0.01, 0.1]
}
ad_grid_search = GridSearchCV(AdaBoostClassifier(), param_grid = param_grid,
                                cv=3, scoring='f1_weighted', n_jobs=-1, return_train_score=True, verbose=1
                                )
ad_grid_search.fit(X_trn_, y_trn_)

ad_clf = ad_grid_search.best_estimator_

print(ad_clf)

print("Train Accuracy:", 100*ad_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*ad_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = ad_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 6. GAUSIAN NAIVE BAYSESSIAN

# params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
# gnb_grid_search = GridSearchCV(GaussianNB(), param_grid=params_NB, cv=3, scoring='accuracy',return_train_score=True)
# gnb_grid_search.fit(X_trn_scaled_, y_trn_)

# gnb_clf = gnb_grid_search.best_estimator_

# print(gnb_clf)

# print("Train Accuracy:", 100*gnb_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*gnb_clf.score(X_tst_scaled_, y_tst_), chr(37))


KNN MODEL:
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.2min finished


KNeighborsClassifier(n_neighbors=9, p=1)
Train Accuracy: 90.10787902138317 %
Test Accuracy: 86.19244846850319 %
Confusion matrix: 
 [[2277   37   41  108   57   54   18   15]
 [  93 2347   22   51   49   34    5   19]
 [ 135   22 2352   44   27   15   10   18]
 [ 136   39   43 2124  128   63   27   24]
 [  91   30   26  164 2150   93   16   29]
 [  81   30   38  152  114 2122   13   25]
 [  73   27   24   60   42   28 2243   59]
 [  67   24   27   60   65   33   42 2282]]
Classification report: 
               precision    recall  f1-score   support

           0     0.7711    0.8734    0.8191      2607
           1     0.9182    0.8958    0.9069      2620
           2     0.9141    0.8967    0.9053      2623
           3     0.7687    0.8220    0.7945      2584
           4     0.8169    0.8272    0.8220      2599
           5     0.8690    0.8241    0.8459      2575
           6     0.9448    0.8775    0.9099      2556
           7     0.9235    0.8777    0.9000      2600

    accura

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.0min finished


SVC(C=8, gamma=0.5)
Train Accuracy: 99.79291080716625 %
Test Accuracy: 84.56944712001541 %
Confusion matrix: 
 [[2421    9   16   58   35   39   13   16]
 [ 331 2151   12   19   14   48   19   26]
 [ 277   10 2242   16   17   26   16   19]
 [ 306    7   20 2129   32   39   26   25]
 [ 306    8   15   43 2142   50   12   23]
 [ 298    6   12   35   50 2144   15   15]
 [ 301    3   15   19   15   27 2147   29]
 [ 281    7   16   20   31   31   30 2184]]
Classification report: 
               precision    recall  f1-score   support

           0     0.5355    0.9287    0.6793      2607
           1     0.9773    0.8210    0.8923      2620
           2     0.9549    0.8547    0.9020      2623
           3     0.9102    0.8239    0.8649      2584
           4     0.9170    0.8242    0.8681      2599
           5     0.8918    0.8326    0.8612      2575
           6     0.9425    0.8400    0.8883      2556
           7     0.9345    0.8400    0.8847      2600

    accuracy                   

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   49.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   49.8s finished


RandomForestClassifier(n_estimators=550)
Train Accuracy: 100.0 %
Test Accuracy: 99.81699094586784 %
Confusion matrix: 
 [[2596    0    0    1    7    1    2    0]
 [   0 2619    0    0    1    0    0    0]
 [   0    0 2622    0    1    0    0    0]
 [   3    0    0 2567   13    1    0    0]
 [   1    0    0    3 2593    2    0    0]
 [   0    0    0    0    1 2573    0    1]
 [   0    0    0    0    0    0 2556    0]
 [   0    0    0    0    0    0    0 2600]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9985    0.9958    0.9971      2607
           1     1.0000    0.9996    0.9998      2620
           2     1.0000    0.9996    0.9998      2623
           3     0.9984    0.9934    0.9959      2584
           4     0.9912    0.9977    0.9944      2599
           5     0.9984    0.9992    0.9988      2575
           6     0.9992    1.0000    0.9996      2556
           7     0.9996    1.0000    0.9998      2600

    accuracy          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:   37.1s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   42.3s finished


AdaBoostClassifier(learning_rate=0.1, n_estimators=200)
Train Accuracy: 65.56058562897323 %
Test Accuracy: 65.52205740705065 %
Confusion matrix: 
 [[  69    0    0 2210  304   18    4    2]
 [   0 2608    0    4    4    1    0    3]
 [   0    0 2614    0    2    7    0    0]
 [   4    0    0  797 1754   19    1    9]
 [   0    0    0   38 2528   17    0   16]
 [   9    0    0  888 1655   13    0   10]
 [   1    0    0    9   29    3 2467   47]
 [   0    0    0    0   83    1    7 2509]]
Classification report: 
               precision    recall  f1-score   support

           0     0.8313    0.0265    0.0513      2607
           1     1.0000    0.9954    0.9977      2620
           2     1.0000    0.9966    0.9983      2623
           3     0.2020    0.3084    0.2441      2584
           4     0.3975    0.9727    0.5644      2599
           5     0.1646    0.0050    0.0098      2575
           6     0.9952    0.9652    0.9799      2556
           7     0.9665    0.9650    0.9657      2

In [9]:
# Dataset 2 - Cleaning data using S-DAGMM (Using data cleaned by S-DAGMM to train and test classifiers)
X_trn_, X_tst_ = X_trn_clean_d2, X_tst_clean_d2
X_trn_scaled_, X_tst_scaled_ = X_trn_clean_scaled_d2, X_tst_clean_scaled_d2
y_trn_, y_tst_ = y_trn_clean_d2, y_tst_clean_d2

In [10]:
## 1. KNN MODEL

print("KNN MODEL:")

k = [i for i in range(2,10)]
p = [j for j in range(1,3)]
param_grid = [{'n_neighbors': k, 'p': p}]
knn_grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)

knn_grid_search.fit(X_trn_scaled_, y_trn_)
knn_clf = knn_grid_search.best_estimator_
print(knn_clf)

print("Train Accuracy:", 100*knn_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*knn_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = knn_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


## 2. SVM MODEL

print("SVM MODEL:")

C = [2**i for i in range(0, 4)]
gamma = [2**j for j in range(-2,2)]
param_grid = [{'C': C, 'gamma': gamma}]
svm_grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=3,
                           scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                           verbose=1)
svm_grid_search.fit(X_trn_scaled_, y_trn_)
svm_clf = svm_grid_search.best_estimator_
print(svm_clf)

print("Train Accuracy:", 100*svm_clf.score(X_trn_scaled_, y_trn_), chr(37))
print("Test Accuracy:", 100*svm_clf.score(X_tst_scaled_, y_tst_), chr(37))

y_tst_pred_ = svm_clf.predict(X_tst_scaled_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")

## 3. RANDOM FOREST MODEL

print("RANDOM FOREST MODEL:")

n_estimators_ = [int(x) for x in np.linspace(100, 550, 10)]

param_grid = {'n_estimators':n_estimators_}
rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=3,
                              scoring='f1_weighted', n_jobs=-1, return_train_score=True,
                              verbose=1)
rf_grid_search.fit(X_trn_, y_trn_)

rf_clf = rf_grid_search.best_estimator_

print(rf_clf)

print("Train Accuracy:", 100*rf_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*rf_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = rf_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 4. LOGISTIC REGRESSION MODEL

# C= np.logspace(-4,4,9)
# solver=['newton-cg']

# param_grid = [{'solver': solver, 'C': C}]
# lr_grid_search = GridSearchCV(LogisticRegression(max_iter=100),
#                            param_grid=param_grid, cv=3, scoring='f1_weighted',
#                            n_jobs=-1, return_train_score=True, verbose=1)
# lr_grid_search.fit(X_trn_scaled_, y_trn_)
# lr_clf = lr_grid_search.best_estimator_
# print(lr_clf)

# print("Train Accuracy:", 100*lr_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*lr_clf.score(X_tst_scaled_, y_tst_), chr(37))

## 5. ADABOOST MODEL

print("ADABOOST MODEL:")

param_grid = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.001, 0.01, 0.1]
}
ad_grid_search = GridSearchCV(AdaBoostClassifier(), param_grid = param_grid,
                                cv=3, scoring='f1_weighted', n_jobs=-1, return_train_score=True, verbose=1
                                )
ad_grid_search.fit(X_trn_, y_trn_)

ad_clf = ad_grid_search.best_estimator_

print(ad_clf)

print("Train Accuracy:", 100*ad_clf.score(X_trn_, y_trn_), chr(37))
print("Test Accuracy:", 100*ad_clf.score(X_tst_, y_tst_), chr(37))

y_tst_pred_ = ad_clf.predict(X_tst_)
print("====================================================================")
print("Confusion matrix: \n", confusion_matrix(y_tst_, y_tst_pred_))
print("====================================================================")
print("Classification report: \n", classification_report(y_tst_, y_tst_pred_, digits=4))
print("====================================================================")


# ## 6. GAUSIAN NAIVE BAYSESSIAN

# params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
# gnb_grid_search = GridSearchCV(GaussianNB(), param_grid=params_NB, cv=3, scoring='accuracy',return_train_score=True)
# gnb_grid_search.fit(X_trn_scaled_, y_trn_)

# gnb_clf = gnb_grid_search.best_estimator_

# print(gnb_clf)

# print("Train Accuracy:", 100*gnb_clf.score(X_trn_scaled_, y_trn_), chr(37))
# print("Test Accuracy:", 100*gnb_clf.score(X_tst_scaled_, y_tst_), chr(37))

KNN MODEL:
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.1min finished


KNeighborsClassifier(n_neighbors=3, p=1)
Train Accuracy: 99.40397350993378 %
Test Accuracy: 97.1746677840297 %
Confusion matrix: 
 [[2054    4   22   22    4    7    0    0]
 [  21 1882    0    4    1    0    0    0]
 [  91    1 2058    6    1    0    0    0]
 [  40    1    1 2028   32   13    0    0]
 [  10    0    1   46 2023   24    0    0]
 [  18    0    0   31   22 2037    0    0]
 [  13    1    2   11    2    7 2034    1]
 [   3    0    1    1    3    0    4 2118]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9129    0.9721    0.9416      2113
           1     0.9963    0.9864    0.9913      1908
           2     0.9871    0.9541    0.9703      2157
           3     0.9437    0.9589    0.9512      2115
           4     0.9689    0.9615    0.9652      2104
           5     0.9756    0.9663    0.9709      2108
           6     0.9980    0.9821    0.9900      2071
           7     0.9995    0.9944    0.9969      2130

    accurac

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.2min finished


SVC(C=8, gamma=0.25)
Train Accuracy: 99.98795906080674 %
Test Accuracy: 98.55740452532025 %
Confusion matrix: 
 [[2068    0   12   22    2    8    1    0]
 [   1 1891   16    0    0    0    0    0]
 [   0    0 2157    0    0    0    0    0]
 [  11    0    8 2068   13   13    2    0]
 [   6    0   13   21 2055    8    1    0]
 [   2    0    7    8   16 2075    0    0]
 [   1    0   28    0    0    0 2035    7]
 [   0    0    9    0    2    0    3 2116]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9899    0.9787    0.9843      2113
           1     1.0000    0.9911    0.9955      1908
           2     0.9587    1.0000    0.9789      2157
           3     0.9759    0.9778    0.9769      2115
           4     0.9842    0.9767    0.9804      2104
           5     0.9862    0.9843    0.9853      2108
           6     0.9966    0.9826    0.9895      2071
           7     0.9967    0.9934    0.9951      2130

    accuracy                  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   37.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   37.0s finished


RandomForestClassifier(n_estimators=250)
Train Accuracy: 100.0 %
Test Accuracy: 99.86831078654376 %
Confusion matrix: 
 [[2100    0    0   11    0    2    0    0]
 [   0 1908    0    0    0    0    0    0]
 [   0    0 2157    0    0    0    0    0]
 [   3    0    0 2110    0    2    0    0]
 [   0    0    0    0 2100    4    0    0]
 [   0    0    0    0    0 2108    0    0]
 [   0    0    0    0    0    0 2071    0]
 [   0    0    0    0    0    0    0 2130]]
Classification report: 
               precision    recall  f1-score   support

           0     0.9986    0.9938    0.9962      2113
           1     1.0000    1.0000    1.0000      1908
           2     1.0000    1.0000    1.0000      2157
           3     0.9948    0.9976    0.9962      2115
           4     1.0000    0.9981    0.9990      2104
           5     0.9962    1.0000    0.9981      2108
           6     1.0000    1.0000    1.0000      2071
           7     1.0000    1.0000    1.0000      2130

    accuracy          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:   29.6s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   33.8s finished


AdaBoostClassifier(learning_rate=0.01, n_estimators=300)
Train Accuracy: 67.13425647200482 %
Test Accuracy: 66.97593678917754 %
Confusion matrix: 
 [[ 580    0    0 1334  198    1    0    0]
 [   0 1908    0    0    0    0    0    0]
 [   0    0 2157    0    0    0    0    0]
 [ 109    0    0 1173  830    2    0    1]
 [   1    0    0  102 1987    2    0   12]
 [   4    0    0 1056 1044    4    0    0]
 [   1    0    0    0    0    1 1305  764]
 [   6    0    0    0   49    0    0 2075]]
Classification report: 
               precision    recall  f1-score   support

           0     0.8274    0.2745    0.4122      2113
           1     1.0000    1.0000    1.0000      1908
           2     1.0000    1.0000    1.0000      2157
           3     0.3201    0.5546    0.4059      2115
           4     0.4837    0.9444    0.6397      2104
           5     0.4000    0.0019    0.0038      2108
           6     1.0000    0.6301    0.7731      2071
           7     0.7276    0.9742    0.8330      