In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tqdm.keras import TqdmCallback
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,), name="Input")
    encoded = Dense(
        4,
        activation="relu",
        activity_regularizer=regularizers.l1(10e-5),
        name="Encoding_1",
    )(input_layer)
    latent = Dense(2, activation="relu", name="Latent")(encoded)
    decoded = Dense(4, activation="relu", name="Decoding_2")(latent)
    output_layer = Dense(input_dim, activation="linear", name="Output")(decoded)
    autoencoder = Model(input_layer, output_layer)

    return autoencoder



def predict(model, data, threshold):

    reconstructions = model.predict(data)

    loss = tf.keras.losses.mae(reconstructions, data)

    return (loss.numpy() < threshold).astype(int)



def print_stats(predictions, labels):

    accuracy = accuracy_score(labels, predictions)

    precision = precision_score(labels, predictions)

    recall = recall_score(labels, predictions)

    f1 = f1_score(labels, predictions)

    print(f"Accuracy = {accuracy}")

    print(f"Precision = {precision}")

    print(f"Recall = {recall}")

    print(f"F1 score = {f1}")

    return accuracy, precision, recall, f1

### Session0

In [3]:
data = pd.read_csv("data.csv")

X = data.drop(columns=["Label"])
y = data["Label"]

In [4]:
data["Label"].value_counts()

Label
1    181880
0      1275
Name: count, dtype: int64

In [5]:
X = X[['365', '101', '86', '100', '130']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

In [7]:
y_test.value_counts()

Label
1    36365
0      266
Name: count, dtype: int64

In [8]:
X.shape[1]


5

In [9]:
n_features = X.shape[1]

In [10]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")
autoencoder.summary()

# Fit the model
history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
    callbacks=[TqdmCallback(), EarlyStopping(patience=3)],
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

100%|██████████| 10/10 [00:23<00:00,  2.38s/epoch, loss=0.00122, val_loss=0.00136]

[1m   1/4579[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:40[0m 74ms/step




[1m4579/4579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 769us/step
Threshold:  0.03293104734478961
[1m1145/1145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 792us/step
[1m1145/1145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 780us/step
Accuracy = 0.9987169337446425
Precision = 0.9998623651178155
Recall = 0.9988450433108759
F1 score = 0.9993534453111028
[[  261     5]
 [   42 36323]]


### Session 1

In [11]:
data = pd.read_csv("Session1/set1.csv")

X = data[["365", "101", "86", "100", "130"]]
y = data["Label"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
n_features = X.shape[1]

In [14]:

autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
        validation_split=0.15,
    )

    # Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

    # Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)


[1m4474/4474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 903us/step
Threshold:  0.03360458238462908
[1m1119/1119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695us/step
[1m1119/1119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 866us/step
Accuracy = 0.9987147606940292
Precision = 0.9998591231826891
Recall = 0.9988459806349921
F1 score = 0.999352295128133
[[  258     5]
 [   41 35487]]


In [15]:
train_data = pd.read_csv("Session3/set1.csv")
test_data = pd.read_csv("Session3/set2.csv")

In [16]:
train_data["Label"].value_counts()

Label
1    94037
0     1275
Name: count, dtype: int64

In [17]:
test_data["Label"].value_counts()

Label
1    87843
Name: count, dtype: int64

In [18]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,386,387,388,389,390,391,392,393,394,Label
0,3.900883e-07,4.324641e-07,0.405991,0.004828,3.333360e-07,1.949446e-01,2.420850e-01,0.511961,0.004676,1.703327e-02,...,0.020853,0.668037,0.027608,0.00000,0.326612,0.900893,0.029277,0.003587,0.0,1
1,3.900883e-07,4.324641e-07,0.405991,0.004828,3.333360e-07,2.444077e-01,2.657545e-01,0.506693,0.004241,3.295399e-03,...,0.020942,0.669267,0.027799,0.00000,0.289497,0.866148,0.029697,0.005701,0.0,1
2,7.119112e-07,7.848839e-07,0.405991,0.004828,6.083383e-07,2.747186e-01,4.158942e-01,0.520178,0.005817,3.341714e-03,...,0.019703,0.659058,0.022932,0.00000,0.320401,0.901713,0.029353,0.003946,0.0,1
3,7.655483e-07,8.436193e-07,0.405991,0.004828,6.541720e-07,3.889100e-01,6.353417e-01,0.521062,0.005882,2.440278e-05,...,0.022982,0.650530,0.018111,0.00000,0.418601,0.935507,0.028277,0.000965,0.0,1
4,4.871228e-06,5.339467e-06,0.405991,0.004828,4.162534e-06,0.000000e+00,0.000000e+00,0.500000,0.005964,0.000000e+00,...,0.004918,0.953516,0.488683,0.00017,0.998497,0.067765,0.006707,0.574281,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95307,3.161303e-02,2.884627e-01,0.504219,0.031784,1.742885e-03,4.334645e-08,1.852341e-07,0.710695,0.225524,1.638320e-08,...,0.000000,0.574298,0.000000,0.00000,0.015714,0.000634,0.029836,0.013857,0.0,1
95308,3.381838e-02,2.222273e-01,0.525749,0.052803,1.747972e-03,5.426113e-08,1.282664e-07,0.634595,0.087363,1.638320e-08,...,0.000000,0.574298,0.000000,0.00000,0.011134,0.010639,0.029387,0.013857,0.0,1
95309,2.285149e-02,2.159538e-01,0.519457,0.043363,1.713201e-03,3.522735e-08,1.119962e-07,0.674515,0.139519,1.638320e-08,...,0.000000,0.574298,0.000000,0.00000,0.015505,0.000933,0.029796,0.013857,0.0,1
95310,1.063805e-01,5.423212e-01,0.459420,0.012623,1.726206e-03,8.596697e-08,2.457140e-07,0.652903,0.116049,1.638320e-08,...,0.000000,0.574298,0.000000,0.00000,0.011043,0.009447,0.029208,0.013857,0.0,1


In [19]:
X_train = train_data[["365", "101", "86", "100", "130"]]
y_train = train_data["Label"]
X_test = test_data[["365", "101", "86", "100", "130"]]
y_test = test_data["Label"]

In [20]:
y_test.value_counts()

Label
1    87843
Name: count, dtype: int64

In [21]:
preds = pd.DataFrame(preds)

In [22]:
preds.value_counts()

1    35492
0      299
Name: count, dtype: int64

In [24]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

[1m2979/2979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 853us/step
Threshold:  0.04630022152436946
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 890us/step
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 862us/step
Accuracy = 0.9991006682376513
Precision = 1.0
Recall = 0.9991006682376513
F1 score = 0.9995501318284579
[[    0     0]
 [   79 87764]]


### Session 2

In [25]:
data = pd.read_csv("Session2/set1.csv")

X = data[["365", "101", "86", "100", "130"]]
y = data["Label"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
n_features = X.shape[1]

In [27]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

[1m2471/2471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 863us/step
Threshold:  0.046231011129557506
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 979us/step
Accuracy = 0.9985835694050992
Precision = 0.9993846153846154
Recall = 0.9991796554552912
F1 score = 0.9992821249102656
[[  252    12]
 [   16 19488]]


In [28]:
train_data = pd.read_csv("Session2/set1.csv")
test_data = pd.read_csv("Session2/set2.csv")

In [29]:
X_train = train_data[["365", "101", "86", "100", "130"]]
y_train = train_data["Label"]
X_test = test_data[["365", "101", "86", "100", "130"]]
y_test = test_data["Label"]

In [30]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

[1m3089/3089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 678us/step
Threshold:  0.044820388711354665
[1m2635/2635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 650us/step
[1m2635/2635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 621us/step
Accuracy = 0.9991698193764158
Precision = 1.0
Recall = 0.9991698193764158
F1 score = 0.9995847373166912
[[    0     0]
 [   70 84249]]


### Session 3

In [31]:
data = pd.read_csv("Session3/set1.csv")

X = data[["365", "101", "86", "100", "130"]]
y = data["Label"]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [33]:
n_features = X.shape[1]

In [34]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

[1m2383/2383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 706us/step
Threshold:  0.058166598476014034
[1m596/596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 714us/step
[1m596/596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step
Accuracy = 0.998583643707706
Precision = 0.9995213020584012
Recall = 0.999043062200957
F1 score = 0.9992821249102656
[[  244     9]
 [   18 18792]]


In [35]:
train_data = pd.read_csv("Session3/set1.csv")
test_data = pd.read_csv("Session3/set2.csv")
X_train = train_data[["365", "101", "86", "100", "130"]]
y_train = train_data["Label"]
X_test = test_data[["365", "101", "86", "100", "130"]]
y_test = test_data["Label"]

In [36]:
autoencoder = create_autoencoder(input_dim=n_features)
autoencoder.compile(optimizer="adadelta", loss="mse")

history = autoencoder.fit(
    X_train,
    X_train,
    batch_size=64,
    epochs=10,
    verbose=0,
    validation_split=0.15,
)

# Predict reconstruction errors for the training set
reconstructions = autoencoder.predict(X_train)
train_loss = tf.keras.losses.mae(reconstructions, X_train).numpy()
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

# Predict reconstruction errors for the test set
reconstructions = autoencoder.predict(X_test)
test_loss = tf.keras.losses.mae(reconstructions, X_test).numpy()
preds = predict(autoencoder, X_test, threshold)

accuracy, precision, recall, f1 = print_stats(preds, y_test)
conf_matrix = confusion_matrix(y_test, preds)
print(conf_matrix)

[1m2979/2979[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 617us/step
Threshold:  0.04724993063342148
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 607us/step
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 607us/step
Accuracy = 0.9991348200767278
Precision = 1.0
Recall = 0.9991348200767278
F1 score = 0.9995672228233017
[[    0     0]
 [   76 87767]]
