# Anomaly Detection 

## 0 - Imports

In [1]:
%%capture
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Proprietary 
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType

## 1 - Load Data 

In [2]:
# Read both datasets through a single connector and convert each to pandas:
# the processed original data and its augmented version.
connector = LocalConnector()
orig_df = connector.read_file('data_processed.csv', file_type=FileType.CSV).to_pandas()
aug_df = connector.read_file('data_augmented.csv', file_type=FileType.CSV).to_pandas()

## 2 - Classification 

### 2.1 - Split the Data 

In [None]:
# Hold out a test set from the original data. `random_state` is fixed so the
# split (and therefore the metrics reported further down) is reproducible
# from one run to the next.
x_train, x_test = train_test_split(orig_df, random_state=42)

# Build the augmented training set by removing the held-out rows by index
# label, so the augmented model is not trained on rows it is evaluated on.
# NOTE(review): assumes aug_df keeps the original rows under the same index
# labels as orig_df -- confirm against how data_augmented.csv was produced.
x_train_augmented = aug_df.drop(x_test.index)

### 2.2 - Model Training 

In [4]:
# One decision tree per training set: original data vs. augmented data.
# Both use the same fixed `random_state` so fitting is deterministic and the
# two models differ only in the data they were trained on.
orig_tree_clf = DecisionTreeClassifier(random_state=42)
aug_tree_clf = DecisionTreeClassifier(random_state=42)

# Fit each model: features are every column except 'Unusual', which is the target.
orig_tree_clf.fit(x_train.drop('Unusual', axis=1), x_train['Unusual'])
aug_tree_clf.fit(x_train_augmented.drop('Unusual', axis=1), x_train_augmented['Unusual'])

### 2.3 - Predict 

In [5]:
# Score the same held-out features (all columns except the 'Unusual' target)
# with both models.
test_features = x_test.drop(columns='Unusual')
preds = orig_tree_clf.predict(test_features)
aug_preds = aug_tree_clf.predict(test_features)

### 2.4 - Evaluate Predictions 

In [6]:
# Accuracy and F1 for each model, measured against the same ground-truth labels.
y_true = x_test['Unusual'].values
acc, f1 = accuracy_score(y_true, preds), f1_score(y_true, preds)
aug_acc, aug_f1 = accuracy_score(y_true, aug_preds), f1_score(y_true, aug_preds)

## 3 - Create Artifacts 

In [7]:
# Create the metrics artifact. The table with these metrics will be shown in
# the "Run Output" section of the "Runs".
metrics = {
    'metrics': [
        {'name': metric_name, 'numberValue': metric_value, 'format': 'PERCENTAGE'}
        for metric_name, metric_value in (
            ('accuracy-score', acc),
            ('f1-score', f1),
            ('augmented-accuracy-score', aug_acc),
            ('augmented-f1-score', aug_f1),
        )
    ]
}

# Write the artifact as JSON to the working directory.
with open("mlpipeline-metrics.json", 'w') as f:
    json.dump(metrics, f)