In [240]:
from edgeml import edgeml
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [241]:
BACKEND_URL = "https://beta.edge-ml.org"
READ_KEY = "86a0e3e2fdbe2835b49463fe99f62af8"


WINDOW_LEN = 20
WINDOW_OVERLAP = 10


datasetRetriever = edgeml.DatasetReceiver(BACKEND_URL, READ_KEY)
datasetRetriever.loadData()
datasets = datasetRetriever.datasets

# Used labels

In [242]:
labelings = datasetRetriever.labeligns
print("Labelings in project: ", ",".join([x["name"] for x in labelings]))

labelingId = labelings[0]["_id"]

datasetRetriever.labeligns[0]["labels"]
labelingMap = {x["_id"]: x for x in datasetRetriever.labeligns[0]["labels"]}

Labelings in project:  ShakeRest


## Convert datasets to numpy array

In [243]:
def datasets_to_numpy(datasets):
    res_datasets = []
    for dataset in datasets:
        merged_df = dataset.timeSeries[0].data
        for time_series in dataset.timeSeries[1:]:
            merged_df = pd.merge(merged_df, time_series.data, on="time", how="outer")
        merged_df.sort_values(by="time", inplace=True)
        merged_df.reset_index(drop=True, inplace=True)

        interpol_df = merged_df.interpolate(method="linear").fillna(method="ffill").fillna(method="bfill")
        interpol_df["time"] = interpol_df["time"].values.astype(np.int64) // 10**6

        labels = next((labeling.labels for labeling in dataset.labelings if labeling.name == "ShakeRest"), None)

        if labels is None:
            continue

        interpol_df["label"] = ""
        for label in labels:
            interpol_df.loc[(interpol_df["time"] >= label.start) & (interpol_df["time"] <= label.end), "label"] = labelingMap[label.type]["name"]

        interpol_df.drop(columns=["time"], inplace=True)
        res_datasets.append(interpol_df.to_numpy())
    return res_datasets

In [244]:
# Window the data

def window_data(datasets):
    windows = []
    for data in datasets:
        for i in range(0, len(data) - WINDOW_LEN, WINDOW_OVERLAP):
            windows.append(data[i:i+WINDOW_LEN])
        
    window_labels = np.array(windows)[:, :,-1]
        # Reduce to most likely label
    window_labels = np.array([max(set(x), key=list(x).count) for x in window_labels])
    return np.array(windows)[:, :,:-1], window_labels

In [245]:
# Train a decision tree model


def train_classifier(X_data, Y_data):
    clf = DecisionTreeClassifier()
    clf.fit(X_data.reshape(X_data.shape[0], -1), Y_data)
    return clf

def evaluate_classifier(clf, X_data, Y_data):
    y_pred = clf.predict(X_data.reshape(X_data.shape[0], -1))
    accuracy = accuracy_score(Y_data, y_pred)
    return accuracy



In [246]:
datasets = datasets_to_numpy(datasets)
X_data, Y_data = window_data(datasets)
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)
clf = train_classifier(X_train, Y_train)
accuracy = evaluate_classifier(clf, X_test, Y_test)
print("Accuracy: ", accuracy)

Accuracy:  0.8333333333333334


  interpol_df = merged_df.interpolate(method="linear").fillna(method="ffill").fillna(method="bfill")
  interpol_df = merged_df.interpolate(method="linear").fillna(method="ffill").fillna(method="bfill")
