# Anomaly Detection 

## 0 - Imports

In [1]:
%%capture
import json
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Proprietary 
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType

## 1 - Load Data 

In [7]:
# Load the processed original records and the synthetically sampled records.
data_processed = pd.read_csv('data_processed.csv')
data_sampled = pd.read_csv('data_sampled.csv')

# Load the test set and discard the 'Unnamed: 0' column stored in its CSV.
x_test = pd.read_csv('test_set.csv')
x_test = x_test.drop(columns='Unnamed: 0')

# Stack the original rows on top of the sampled rows to form the augmented dataset.
aug_df = pd.concat([data_processed, data_sampled], axis=0)

## 2 - Classification 

### 2.1 - Split the Data 

In [8]:
# Build the training set by removing every row of aug_df whose index label
# also appears in x_test's index.
# NOTE(review): x_test is read from CSV without an index column, so its index
# is the default 0..len-1 range, and aug_df keeps the (duplicate) labels of
# both concatenated frames -- confirm these labels really identify the test rows.
x_train_augmented = aug_df.drop(index=x_test.index)

In [9]:
# 'Time' and 'CellName' are index columns; the data is sorted, so they are
# removed from both the training and the test set.
index_columns = ['Time', 'CellName']
x_train_augmented = x_train_augmented.drop(columns=index_columns)
x_test = x_test.drop(columns=index_columns)

### 2.2 - Model Training 

In [10]:
# Decision tree classifier for the augmented data, created with a fixed random_state.
aug_tree_clf = DecisionTreeClassifier(random_state=4)

# 'Unusual' is the target column; every other remaining column is a feature.
train_features = x_train_augmented.drop(columns='Unusual')
train_target = x_train_augmented['Unusual']
aug_tree_clf.fit(train_features, train_target)

DecisionTreeClassifier(random_state=4)

### 2.3 - Predict 

In [11]:
# Predict the 'Unusual' label for the test rows from their feature columns only.
test_features = x_test.drop(columns='Unusual')
aug_preds = aug_tree_clf.predict(test_features)

### 2.4 - Evaluate Predictions 

In [12]:
# Score the predictions against the true 'Unusual' labels of the test set:
# accuracy, F1, recall and precision.
y_true = x_test['Unusual'].values
aug_acc = accuracy_score(y_true, aug_preds)
aug_f1 = f1_score(y_true, aug_preds)
aug_recall = recall_score(y_true, aug_preds)
aug_precision = precision_score(y_true, aug_preds)

## 3 - Create Artifacts 

In [13]:
# Build the metrics artifact. The resulting table is shown in the "Run Output"
# section of the "Runs".
scores = (
    ('augmented-accuracy-score', aug_acc),
    ('augmented-f1-score', aug_f1),
    ('augmented-recall-score', aug_recall),
    ('augmented-precision-score', aug_precision),
)

# Every entry uses the same layout; only the name and the value differ.
metrics = {
    'metrics': [
        {'name': name, 'numberValue': value, 'format': 'PERCENTAGE'}
        for name, value in scores
    ]
}

# Write the artifact to disk as JSON.
with open("mlpipeline-metrics.json", 'w') as f:
    json.dump(metrics, f)

In [14]:
from sklearn.metrics import confusion_matrix

# Pin the label order so the matrix is always 2x2. Without `labels`, a test set
# whose true labels and predictions contain only one class yields a 1x1 matrix
# and the indexing below raises IndexError.
# NOTE(review): assumes 'Unusual' is encoded as 0 (normal) / 1 (unusual), which
# matches the default positive label 1 used by the binary F1/recall/precision
# scores computed earlier -- confirm against the dataset.
pos_neg = confusion_matrix(
    x_test['Unusual'].values, aug_preds, labels=[0, 1]
).ravel()

# Flattened row-major: rows are the true class, columns the predicted class.
true_normal, false_unusual, false_normal, true_unusual = pos_neg

# One (target, predicted, count) record per cell of the confusion matrix.
matrix = [
    ['normal', 'normal', true_normal],
    ['normal', 'unusual', false_unusual],
    ['unusual', 'normal', false_normal],
    ['unusual', 'unusual', true_unusual],
]

df = pd.DataFrame(matrix, columns=['target', 'predicted', 'count'])

# UI metadata describing the confusion matrix; the CSV payload is embedded
# inline (no header, no index) and its columns are described by "schema".
metadata = {
    "outputs": [
        {
            "type": "confusion_matrix",
            "format": "csv",
            "schema": [
                {
                    "name": "target",
                    "type": "CATEGORY"
                },
                {
                    "name": "predicted",
                    "type": "CATEGORY"
                },
                {
                    "name": "count",
                    "type": "NUMBER"
                }
            ],
            "source": df.to_csv(header=False, index=False),
            "storage": "inline",
            "labels": [
                "normal",
                "unusual"
            ]
        }
    ]
}

# Write the UI metadata to disk as JSON.
with open('mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)