In [1]:
# Read dataset
import numpy as np
import pandas as pd

# Load the feature table, then trim it in a single chain:
#   1. drop every column whose header starts with "Unnamed"
#      (presumably a stray saved index — TODO confirm),
#   2. drop the last remaining column (its meaning is not visible here).
df = (
    pd.read_csv('data/selected_data.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .iloc[:, :-1]
)

In [2]:
# Standardize the data
from sklearn.preprocessing import StandardScaler

# Remember the column labels so they can be re-attached after scaling.
cols = df.columns.values

# Fit a StandardScaler on the whole frame and transform it in one go, then
# wrap the scaled values back into a DataFrame under the original labels.
scaled_values = StandardScaler().fit_transform(df)
df = pd.DataFrame(data=scaled_values, columns=cols)

In [3]:
# Calculate reconstruction errors
from sklearn.decomposition import PCA
from models.model import ReconstructionErrorModel

# PCA with a fractional n_components keeps as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)

# Hand the standardized frame and the PCA to the project's model wrapper, then
# ask it for the per-sample errors on that same frame.
# NOTE(review): presumably the constructor fits on `df` — confirm in models/model.py.
model = ReconstructionErrorModel(df, model=pca)
errors = model.predict(df)

In [4]:
# Classify anomalies
# Classify anomalies: a row is flagged (target = 1) when its reconstruction
# error exceeds three standard deviations of all errors; every other row is 0.
# NOTE(review): the cut-off is 3 * std with no mean offset — confirm this is
# intended rather than mean + 3 * std.
threshold = np.std(errors) * 3

# Build the label column in one vectorised step from a boolean mask.
# The previous loop iterated over the tuple returned by np.where and passed a
# whole index array to `df.at`, which is a scalar-only accessor and is rejected
# by newer pandas versions.
# Assumes `errors` holds one value per row of `df`, in row order — TODO confirm.
is_anomaly = np.asarray(errors) > threshold
df['target'] = is_anomaly.astype(np.int64)

df['target'].value_counts()

0    9977
1      37
Name: target, dtype: int64

### Prepare data

In [5]:
from sklearn.model_selection import cross_val_score, train_test_split

# Seed used for the train/test split below.
seed = 0

# Features are every column but the last; the last column (the 'target' flag
# appended in the anomaly-labelling cell) is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 33% of the rows for testing. Stratifying on y keeps the share of
# anomalies the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=seed,
)

### DecisionTreeClassifier

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Fit a depth-limited decision tree on the training split and predict the
# held-out split. The tree is seeded from the notebook-level `seed` (set in
# the data-preparation cell) instead of a separate hard-coded literal, so
# changing `seed` also reseeds this model.
dt_model = DecisionTreeClassifier(max_depth=10, random_state=seed)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

# F1 with sklearn's defaults, i.e. scored on the positive class (label 1).
f1_score(y_test, y_pred)

0.6153846153846153

### RandomForestClassifier

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fit a random forest (trees capped at depth 10) on the training split and
# predict the held-out split. The forest is seeded from the notebook-level
# `seed` (set in the data-preparation cell) instead of a separate hard-coded
# literal, so changing `seed` also reseeds this model.
rf_model = RandomForestClassifier(max_depth=10, random_state=seed)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# F1 with sklearn's defaults, i.e. scored on the positive class (label 1).
f1_score(y_test, y_pred)

0.5882352941176471

### SVC

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Fit a support-vector classifier with default hyper-parameters on the
# training split and predict the held-out split. `random_state` comes from the
# notebook-level `seed` (set in the data-preparation cell) instead of a
# separate hard-coded literal, matching the other stochastic steps.
svc_model = SVC(random_state=seed)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

# F1 with sklearn's defaults, i.e. scored on the positive class (label 1).
f1_score(y_test, y_pred)

0.2857142857142857