In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset and standardize every feature column
df = pd.read_csv('data/selected_data1.csv')
# Keep only the columns whose name does not start with 'Unnamed'
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]

# Every column but the last is a feature; the last column is the label
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Rebuild the frame from the scaled features and re-attach the label as 'target'
scaled_values = StandardScaler().fit_transform(X)
df = pd.DataFrame(scaled_values, columns=X.columns.values)
df['target'] = y

# Split the rows into the target == 1 and target == 0 subsets
df_correct = df.loc[df['target'] == 1]
df_incorrect = df.loc[df['target'] == 0]
for subset in (df_correct, df_incorrect):
    print(subset['target'].value_counts())

# Report the shape of each subset
print()
for label, subset in (('Target 1 shape: ', df_correct), ('Target 0 shape: ', df_incorrect)):
    print(label + str(subset.shape))

1    5049
Name: target, dtype: int64
0    4951
Name: target, dtype: int64

Target 1 shape: (5049, 25)
Target 0 shape: (4951, 25)


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Baseline: 5-fold cross-validated accuracy of a random forest that predicts the
# last column of `df` from all the other columns.
# random_state pins the forest's bootstrap and feature sampling so the reported
# score is identical on every Restart & Run All; 0 matches the seed passed to
# train_test_split in this notebook.
clf = RandomForestClassifier(random_state=0)
scores = cross_val_score(clf, df.iloc[:,:-1], df.iloc[:,-1], cv=5)

# Mean accuracy over the folds, with a spread of two standard deviations
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.81 (+/- 0.01)


In [3]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# The training split is drawn only from the target == 1 rows; 20% of them are
# held out for testing, and every target == 0 row is used as test data.
df_train, df_test_correct = train_test_split(df_correct, random_state=0, test_size=0.2)
df_test_incorrect = df_incorrect

# Drop the trailing 'target' column from each split to get the feature matrices
X_train, X_test_correct, X_test_incorrect = (
    frame.iloc[:, :-1]
    for frame in (df_train, df_test_correct, df_test_incorrect)
)

# PCA

In [4]:
from sklearn.decomposition import PCA
from models.model import ReconstructionErrorModel

# Build the reconstruction-error model from the training features (target == 1
# rows only). A float n_components asks PCA to keep enough components to
# explain that fraction (here 90%) of the variance.
# NOTE(review): presumably the constructor fits the PCA on X_train - confirm in models.model
pca_estimator = PCA(n_components=0.9)
model = ReconstructionErrorModel(X_train, model=pca_estimator)

# Anomaly threshold: three standard deviations of the model's output on the training data
# NOTE(review): the mean of the errors is not added to the threshold - confirm this is intended
train_errors = model.predict(X_train)
threshold = 3 * np.std(train_errors)

# Report how many principal components were kept, and the threshold
n_components = len(model.get_model().explained_variance_)
print('%i Principal Components' % n_components)
print('Threshold ', threshold)

11 Principal Components
Threshold  0.23278923232987037


In [5]:
# Count how many instances of each test set exceed the anomaly threshold:
# first the held-out correct rows (any hit is a false alarm), then the
# incorrect rows (each hit is a detected anomaly).
# After the loop, `errors` and `anomalies` hold the values for the incorrect set.
for label, X_eval in (('correct', X_test_correct), ('incorrect', X_test_incorrect)):
    errors = model.predict(X_eval)
    # Positions of the instances whose model output is above the threshold
    anomalies = np.where(errors > threshold)[0]
    print('Anomalies with %s data %s' % (label, str(len(anomalies))))

Anomalies with correct data 18
Anomalies with incorrect data 990
