In [None]:
from sklearn.datasets import make_classification

In [None]:
import pandas as pd
import numpy as np

In [None]:
from collections import Counter

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Synthetic, heavily imbalanced binary classification problem:
# 100,000 samples x 32 features, all of them informative (no redundant or
# repeated columns). `weights` requests ~99.5% of samples in class 0 and
# ~0.5% in class 1, and the fixed `random_state` keeps the data reproducible.
X, y = make_classification(
    n_samples=100000,
    n_features=32,
    n_informative=32,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.995, 0.005],
    class_sep=0.5,
    random_state=0,
)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for testing. Stratify on y so the rare positive
# class keeps the same proportion in both splits; a purely random split of
# data this imbalanced can noticeably skew the minority share of the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Count the training labels once and look classes up by label. A Counter
# returns 0 for a missing label instead of raising IndexError, which the
# previous positional indexing into sorted items would have done.
train_class_counts = Counter(y_train)

print('The number of records in the training dataset is', X_train.shape[0])
print('The number of records in the test dataset is', X_test.shape[0])
print(f"The training dataset has {train_class_counts[0]} records for the majority class and {train_class_counts[1]} records for the minority class.")


In [None]:
# Preview only the first few rows instead of echoing the whole training matrix.
X_train[:5]

In [None]:
# Sanity-check the dimensions of the training matrix: (n_rows, n_features).
X_train.shape

In [None]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import classification_report

In [None]:
# Train the autoencoder on the majority ("normal", label 0) class only, so that
# minority-class samples are expected to reconstruct poorly at prediction time.
X_train_normal = X_train[y_train == 0]

# Derive the layer width from the data instead of hard-coding 32.
n_features = X_train.shape[1]

# Named `input_layer` so the Python builtin `input` is not shadowed.
input_layer = keras.layers.Input(shape=(n_features,))

# Encoder: compress n_features -> 16 -> 8 -> 4 (bottleneck).
encoder = Sequential([
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu')])(input_layer)

# Decoder: expand 4 -> 8 -> 16 -> n_features.
# The output layer is linear: the features are fed in unscaled and are not
# confined to (0, 1), so a sigmoid output could never reconstruct them and
# would inflate the reconstruction error for every sample.
decoder = Sequential([
    Dense(8, activation="relu"),
    Dense(16, activation="relu"),
    Dense(n_features, activation="linear")])(encoder)

autoencoder = keras.Model(inputs=input_layer, outputs=decoder)


In [None]:
# Print the layer-by-layer architecture and parameter counts of the autoencoder.
autoencoder.summary()

In [None]:
# Reconstruction objective: mean absolute error between input and output.
autoencoder.compile(optimizer='adam', loss='mae')

# The model learns to reproduce its own input, so inputs and targets are the
# same array. Validation uses a held-out 10% slice of the *normal training*
# data rather than the test set: the test set stays untouched until evaluation
# and the validation loss is measured on the same kind of data the model is
# trained to reconstruct. Keras takes the validation slice from the end of the
# arrays before shuffling.
history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=2,
    batch_size=64,
    validation_split=0.1,
    shuffle=True)


In [None]:
# Training vs. validation loss per epoch, on an explicit Axes so the figure
# carries its own title and axis labels.
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="Training Loss")
ax.plot(history.history["val_loss"], label="Validation Loss")
ax.set(title="Autoencoder training history", xlabel="Epoch", ylabel="Loss")
ax.legend();  # trailing semicolon suppresses the Legend repr in the cell output


In [None]:
# Reconstruct the test samples with the trained autoencoder.
prediction = autoencoder.predict(X_test)

# Per-sample reconstruction error: mean absolute error across the feature axis.
# Computed with NumPy so the result is a plain 1-D array (one value per test
# row) that NumPy and seaborn can consume directly, instead of a backend tensor.
prediction_loss = np.mean(np.abs(X_test - prediction), axis=1)

# Flag the 2% of test samples with the highest reconstruction error.
# NOTE(review): the cut-off is taken from the test-set loss distribution;
# consider deriving it from the training data instead — confirm intent.
loss_threshold = np.percentile(prediction_loss, 98)
print(f'The prediction loss threshold for 2% of outliers is {loss_threshold:.2f}')

# Distribution of reconstruction errors with the anomaly cut-off marked.
fig, ax = plt.subplots()
sns.histplot(prediction_loss, bins=30, alpha=0.8, ax=ax)
ax.axvline(x=loss_threshold, color='orange', label="Loss threshold")
ax.set(title="Test-set reconstruction error", xlabel="Reconstruction error (MAE)")
ax.legend();  # trailing semicolon suppresses the Legend repr in the cell output


In [None]:
# Label a sample as an anomaly (1) when its reconstruction error is not below
# the threshold, otherwise as normal (0). Vectorised with NumPy rather than a
# Python-level loop over every element; `np.where(loss < t, 0, 1)` keeps the
# exact semantics of the original `0 if i < t else 1`.
threshold_prediction = np.where(np.asarray(prediction_loss) < loss_threshold, 0, 1)

# Compare the thresholded predictions against the true test labels.
print(classification_report(y_test, threshold_prediction))
