# 5 - Train and Classify on original data 

## 0 - Imports

In [2]:
import json

import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType
from ydata.utils.formats import read_json

## 1 - Read Data

In [3]:
# Load 'data_processed.csv' and index every row by its (CellName, Time) pair.
orig_df = pd.read_csv('data_processed.csv').set_index(['CellName', 'Time'])

## 2 - Classification

### 2.1 - Split Data

In [4]:
# Partition the rows into a training portion and a held-out portion.
# Split sizes are left at train_test_split's defaults; the fixed random_state
# keeps the shuffle reproducible from run to run.
split_frames = train_test_split(orig_df, random_state=6)
x_train, x_test = split_frames

### 2.2 - Train and Predict 

In [5]:
# Fit a decision tree on the training rows, then predict the held-out rows.
# 'Unusual' is the target column; every remaining column is used as a feature.
train_features = x_train.drop(columns='Unusual')
train_target = x_train['Unusual']
test_features = x_test.drop(columns='Unusual')

orig_tree_clf = DecisionTreeClassifier(random_state=4)
orig_tree_clf.fit(train_features, train_target)
preds = orig_tree_clf.predict(test_features)

### 2.3 - Calculate Metrics

In [6]:
# Compare the predictions with the true 'Unusual' labels of the held-out rows.
true_labels = x_test['Unusual'].values

acc = accuracy_score(true_labels, preds)
f1 = f1_score(true_labels, preds)
recall = recall_score(true_labels, preds)
precision = precision_score(true_labels, preds)

## 3 - Create Artifact

In [7]:
# Build the metrics artifact. The resulting table is shown in the "Run Output"
# section of the "Runs".
# Each entry pairs the display name with the score computed earlier; the order
# here is the order in which the rows are written to the JSON file.
named_scores = [
    ('Accuracy-score', acc),
    ('F1-score', f1),
    ('Recall', recall),
    ('Precision', precision),
]

metrics = {
    'metrics': [
        {'name': label, 'numberValue': value, 'format': 'PERCENTAGE'}
        for label, value in named_scores
    ]
}

with open("mlpipeline-metrics.json", 'w') as out_file:
    json.dump(metrics, out_file)

In [8]:
# Build the confusion-matrix artifact and write it to
# 'mlpipeline-ui-metadata.json' as an inline CSV of (target, predicted, count).
# `confusion_matrix` is imported in the notebook's import cell.
class_labels = ["normal", "unusual"]

cm_counts = confusion_matrix(x_test['Unusual'].values, preds)

# The rows below address exactly four cells. Fail with a clear message if the
# held-out set did not produce a 2x2 matrix (e.g. only one class present),
# rather than hitting an IndexError further down.
if cm_counts.shape != (2, 2):
    raise ValueError(
        "expected a 2x2 confusion matrix for the binary 'Unusual' target, "
        f"got shape {cm_counts.shape}"
    )

# Flattened row by row: index 2*i + j holds the count for true class i and
# predicted class j.
# NOTE(review): assumes the lower-sorted value of 'Unusual' means "normal" and
# the higher one "unusual" -- confirm against the dataset's label encoding.
pos_neg = cm_counts.ravel()

matrix = [
    [target, predicted, pos_neg[2 * i + j]]
    for i, target in enumerate(class_labels)
    for j, predicted in enumerate(class_labels)
]

df = pd.DataFrame(matrix, columns=['target', 'predicted', 'count'])

# Column schema describing the header-less CSV stored under "source".
schema = [
    {"name": "target", "type": "CATEGORY"},
    {"name": "predicted", "type": "CATEGORY"},
    {"name": "count", "type": "NUMBER"},
]

metadata = {
    "outputs": [
        {
            "type": "confusion_matrix",
            "format": "csv",
            "schema": schema,
            "source": df.to_csv(header=False, index=False),
            "storage": "inline",
            "labels": class_labels,
        }
    ]
}

with open('mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)

## 4 - Store Data

In [9]:
# Pass the held-out test set on to the next pipeline via 'test_set.csv'.
#
# Work on a copy instead of mutating x_test in place: resetting the index
# in place makes this cell non-idempotent (a second run would add an extra
# column to the CSV) and leaves x_test with extra 'CellName'/'Time' columns
# if an earlier cell is re-run afterwards.
test_set = x_test.copy()

# NOTE(review): the index at this point is the ('CellName', 'Time') MultiIndex
# set when the data was loaded; assigning `.name` on it does not appear to
# label the fresh integer index written below -- confirm whether 'test_ind'
# was meant to be that column's header.
test_set.index.name = 'test_ind'

# Move the index levels back into ordinary columns; the new default integer
# index is written as the first CSV column (index=True).
test_set = test_set.reset_index()
test_set.to_csv('test_set.csv', index=True)