In [1]:
# Read dataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/selected_data.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

df_correct = df.loc[df['target'] == 1].iloc[:,:-1]
df_incorrect = df.loc[df['target'] == 0].iloc[:,:-1]

print('Target 1 shape: ' + str(df_correct.shape))
print('Target 0 shape: ' + str(df_incorrect.shape))

Target 1 shape: (5055, 38)
Target 0 shape: (4959, 38)


In [2]:
# Standarize the data
from sklearn.preprocessing import StandardScaler

cols = df_correct.columns.values
df_correct = pd.DataFrame(StandardScaler().fit_transform(df_correct), columns=cols)
df_incorrect = pd.DataFrame(StandardScaler().fit_transform(df_incorrect), columns=cols)

In [3]:
from sklearn.decomposition import PCA
from models.model import ReconstructionErrorModel

# Fit reconstruction error model with correct data
model = ReconstructionErrorModel(df_correct, model=PCA(n_components=0.95))

# Calculate anomaly threshold with correct data
errors = model.predict(df_correct)
threshold = np.std(errors) * 3

In [4]:
# Calculate how many incorrect instances are detected as anomalies
errors = model.predict(df_incorrect)
df_incorrect['target'] = 0
for anomaly in np.where(errors > threshold):    
    df_incorrect.at[anomaly, 'target'] = 1

df_incorrect['target'].value_counts()

0    4565
1     394
Name: target, dtype: int64