# Edge-ml example: Train a classifier from data stored on edge-ml

In [91]:
from edgeml import edgeml
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import Normalizer

#### Set some global variables

In [92]:
# Connection settings passed to edgeml.DatasetReceiver
BACKEND_URL = "https://beta.edge-ml.org" # You don't need to change this
READ_KEY = "86a0e3e2fdbe2835b49463fe99f62af8" # Enter your own key here
# NOTE(review): a literal key is committed above; presumably a demo key - confirm it is not a private credential.

# Set parameters as you wish
# Name of the labeling whose labels become the classification targets
LABELING_NAME = "ShakeRest"

# Windowing parameters read by window_data (both count rows of a dataset)
WINDOW_LEN = 20
WINDOW_OVERLAP = 10

#### Load the data from edge-ml

In [93]:
# Load the project's datasets through the edgeml client.
datasetRetriever = edgeml.DatasetReceiver(BACKEND_URL, READ_KEY)
datasetRetriever.loadData()
datasets = datasetRetriever.datasets

# NOTE(review): `labeligns` is spelled exactly as in the original code; presumably it
# matches the attribute name exposed by edgeml - confirm before renaming.
labelings = datasetRetriever.labeligns
print("Labelings in project: ", ",".join([x["name"] for x in labelings]))

# Map label id -> label definition across ALL labelings of the project.
# Building the map from the first labeling only would make lookups fail with a
# KeyError whenever the labeling selected by LABELING_NAME is not the first one,
# and would raise an IndexError for a project without any labeling.
labelingMap = {
    label["_id"]: label
    for labeling in labelings
    for label in labeling["labels"]
}

Labelings in project:  ShakeRest


#### Convert datasets to numpy array

In [94]:
def datasets_to_numpy(datasets, labeling_name=None, labeling_map=None):
    """Convert labeled datasets into per-dataset numpy arrays.

    For every dataset that carries the requested labeling, all of its time
    series are outer-merged on the "time" column, sorted by time, gap-filled
    (linear interpolation, then forward/backward fill) and annotated with a
    "label" column. Rows not covered by any label keep the empty string.

    Args:
        datasets: Iterable of dataset objects exposing ``timeSeries`` (items
            with a ``data`` DataFrame containing a "time" column) and
            ``labelings`` (items with ``name`` and ``labels``; each label has
            ``start``, ``end`` and ``type``).
        labeling_name: Name of the labeling to use. Defaults to the
            module-level ``LABELING_NAME``.
        labeling_map: Mapping from ``label.type`` to a dict with a "name"
            entry. Defaults to the module-level ``labelingMap``.

    Returns:
        A list with one 2-D array per converted dataset. The "time" column is
        dropped and the label name is the last column. Datasets without the
        requested labeling are skipped.
    """
    if labeling_name is None:
        labeling_name = LABELING_NAME
    if labeling_map is None:
        labeling_map = labelingMap

    res_datasets = []
    for dataset in datasets:
        # Look up the labeling first so unlabeled datasets are skipped before
        # any merge/interpolation work is done.
        labels = next(
            (labeling.labels for labeling in dataset.labelings if labeling.name == labeling_name),
            None,
        )
        if labels is None:
            continue

        merged_df = dataset.timeSeries[0].data
        for time_series in dataset.timeSeries[1:]:
            merged_df = pd.merge(merged_df, time_series.data, on="time", how="outer")
        # Not in place: with a single time series `merged_df` IS the caller's
        # DataFrame, and sorting it in place would silently reorder their data.
        merged_df = merged_df.sort_values(by="time").reset_index(drop=True)

        interpol_df = merged_df.interpolate(method="linear").ffill().bfill()
        # Integer-divide the raw int64 time values by 10**6.
        # NOTE(review): presumably "time" is datetime64[ns], making this epoch
        # milliseconds comparable to label.start / label.end - confirm.
        interpol_df["time"] = interpol_df["time"].values.astype(np.int64) // 10**6

        interpol_df["label"] = ""
        for label in labels:
            in_span = (interpol_df["time"] >= label.start) & (interpol_df["time"] <= label.end)
            interpol_df.loc[in_span, "label"] = labeling_map[label.type]["name"]

        res_datasets.append(interpol_df.drop(columns=["time"]).to_numpy())
    return res_datasets

#### Window the data

In [95]:
def window_data(datasets, window_len=None, window_overlap=None):
    """Cut every dataset into fixed-length windows and label each window.

    Args:
        datasets: Iterable of 2-D arrays (rows x columns) whose last column
            holds the per-row label.
        window_len: Number of rows per window. Defaults to the module-level
            ``WINDOW_LEN``.
        window_overlap: Number of rows shared by two consecutive windows, so
            the stride is ``window_len - window_overlap``. Defaults to the
            module-level ``WINDOW_OVERLAP``. With the default 20/10 setting
            the stride is 10 rows.

    Returns:
        A tuple ``(X, y)``: ``X`` has shape (windows, window_len, features)
        and contains every column except the label; ``y`` holds the most
        frequent label of each window (ties go to the label seen first).

    Raises:
        ValueError: If the window parameters are inconsistent or no dataset
            is long enough to yield a single window.
    """
    from collections import Counter

    if window_len is None:
        window_len = WINDOW_LEN
    if window_overlap is None:
        window_overlap = WINDOW_OVERLAP
    if window_len <= 0:
        raise ValueError("window_len must be a positive number of rows")
    if not 0 <= window_overlap < window_len:
        raise ValueError("window_overlap must satisfy 0 <= window_overlap < window_len")
    step = window_len - window_overlap

    windows = []
    for data in datasets:
        # "+ 1" keeps the last full window; without it a dataset of exactly
        # window_len rows would produce no window at all.
        for start in range(0, len(data) - window_len + 1, step):
            windows.append(data[start:start + window_len])

    if not windows:
        raise ValueError("no dataset contains at least window_len rows; cannot build any window")

    stacked = np.array(windows)
    # Reduce each window to its most frequent label. Counter breaks ties by
    # first occurrence, which is reproducible (set iteration order is not).
    window_labels = np.array(
        [Counter(labels).most_common(1)[0][0] for labels in stacked[:, :, -1]]
    )
    return stacked[:, :, :-1], window_labels

#### Normalize the data

In [96]:
def normalize(X_train, X_test, Y_train, Y_test):
    """Apply a Normalizer fitted on the training features to both splits.

    The target arrays are handed back untouched so the call site can unpack
    all four values in one statement.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` with transformed features.
    """
    scaler = Normalizer().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test, Y_train, Y_test

#### Classifier

In [97]:
def train_classifier(X_data, Y_data):
    """Fit a DecisionTreeClassifier with default settings and return it."""
    return DecisionTreeClassifier().fit(X_data, Y_data)

def evaluate_classifier(clf, X_data, Y_data):
    """Score a fitted classifier on the given data.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``; the last three use
        ``average='weighted'``.
    """
    predictions = clf.predict(X_data)
    weighted_scores = [
        metric(Y_data, predictions, average='weighted')
        for metric in (f1_score, precision_score, recall_score)
    ]
    return (accuracy_score(Y_data, predictions), *weighted_scores)



### Put it all together

In [98]:
# Keep the converted arrays under their own name: rebinding `datasets` would
# make this cell fail when re-run, because datasets_to_numpy would then
# receive numpy arrays instead of the loaded dataset objects.
numpy_datasets = datasets_to_numpy(datasets)
X_data, Y_data = window_data(numpy_datasets)
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

# Flatten the arrays for sklearn
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

X_train, X_test, Y_train, Y_test = normalize(X_train, X_test, Y_train, Y_test)
clf = train_classifier(X_train, Y_train)
# Unpack into `f1`, not `f1_score`: the latter is the sklearn function that
# evaluate_classifier calls, and overwriting it with a float breaks any
# later evaluation.
accuracy, f1, precision, recall = evaluate_classifier(clf, X_test, Y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))


Accuracy: 88.89%
F1 Score: 88.78%
Precision: 89.44%
Recall: 88.89%
