In this notebook, we train a model using the orientation data of people during fitness exercises. The model tries to predict whether the person is in a vertical or a horizontal position.

In [1]:
import sys
import os
import subprocess
import zipfile
import numpy as np

from oodles import Framework
from oodles import Signal
from oodles import monitor
from oodles import ModelSignal, AnnotationMethod, Anomaly, DataDriftAlgo

from dataset import input_to_dataset_transformation, read_json, write_json, KpsDataset
from pushup_signal import pushup_signal, plot_all_cluster
from contextlib import redirect_stdout

import tensorflow as tf
import joblib
import json

Download dataset from remote

In [2]:
data_dir = "data"
remote_url = "https://oodles-dev-training-data.s3.amazonaws.com/data.zip"
orig_training_file = 'data/training_data.json'

# Download and prepare the dataset only when the `data` directory is missing;
# an existing directory is treated as an already-prepared copy.
if not os.path.exists(data_dir):
    try:
        # Pass an argument list (no shell) and write to an explicit target so the
        # archive extracted below is always the one that was just downloaded.
        subprocess.check_output(["wget", "-O", "data.zip", remote_url])
    except (OSError, subprocess.CalledProcessError) as err:
        # Stop here with a clear cause instead of printing and then failing
        # later on a missing or stale data.zip.
        raise RuntimeError("Could not load training data from " + remote_url) from err
    with zipfile.ZipFile("data.zip", 'r') as zip_ref:
        zip_ref.extractall("./")

    # Shuffle with a fixed seed and keep the first 1000 samples, overwriting the
    # extracted training file with this reduced subset.
    full_training_data = read_json(orig_training_file)
    np.random.seed(1)
    np.random.shuffle(full_training_data)
    reduced_training_data = full_training_data[0:1000]
    write_json(orig_training_file, reduced_training_data)

In [3]:
# Dataset files used for evaluation, monitoring and annotation
golden_testing_file = 'data/golden_testing_data.json'
real_world_test_cases = 'data/real_world_testing_data.json'
annotation_args = dict(master_file='data/master_annotation_data.json')

# Batch size shared by the real-world dataset loader and the Oodles config
inference_batch_size = 1

# Defining the edge-case signal
pushup_edge_case = Signal("Pushup", pushup_signal)

Next, we train our network, a deep neural network (DNN), on the training data.

In [4]:
from model_dnn import get_accuracy_dnn, train_model_dnn

# Train the baseline model, saved under the name 'version_0'.
# Reuse `orig_training_file` (defined in the download cell) instead of repeating
# the path literal, so the baseline and the Oodles config always point at the
# same training file.
train_model_dnn(orig_training_file, 'version_0')

Training on:  data/training_data.json  which has  1000  data-points
Trained model exists. Skipping training again.


Next, we get the model's accuracy on the golden testing dataset, which is again low due to misclassification of Pushup signals.

In [5]:
# Evaluate the baseline model 'version_0' on the golden testing file. As the
# cell's last expression, the returned value is displayed as the cell output.
get_accuracy_dnn(golden_testing_file, 'version_0')

2022-12-13 14:14:07.494944: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Evaluating on  15731  data-points


0.22382556735109022

Update the Oodles config with new training workflows and checks. Let's also add a check for edge-cases when model confidence is low (because why not!).

In [6]:
# Checks used to identify edge cases: data drift measured against the original
# training file, and concept drift using the DDM algorithm.
data_drift_check = {
    'type': Anomaly.DATA_DRIFT,
    'reference_dataset': orig_training_file,
    'cluster_plot_func': plot_all_cluster,
}
concept_drift_check = {
    'type': Anomaly.CONCEPT_DRIFT,
    'algorithm': DataDriftAlgo.DDM,
}

# Training pipeline: annotate the collected data and retrain the model
retraining_pipeline = {
    "data_transformation_func": input_to_dataset_transformation,
    "annotation_method": {"method": AnnotationMethod.MASTER_FILE, "args": annotation_args},
    "training_func": train_model_dnn,
    "fold_name": 'oodles_smart_data',
    "orig_training_file": orig_training_file,
}

# Evaluation pipeline: test the retrained model against the original model
evaluation_pipeline = {
    "inference_func": get_accuracy_dnn,
    "golden_testing_dataset": golden_testing_file,
    "metrics_to_check": ['accuracy'],
}

# Full Oodles configuration assembled from the pieces above
cfg = {
    "checks": [data_drift_check, concept_drift_check],
    "data_identifier": "id",
    "batch_size": inference_batch_size,
    "training_args": retraining_pipeline,
    "evaluation_args": evaluation_pipeline,
}

In [7]:
# Build the Oodles framework from the config; `monitor` attaches it to the
# prediction function defined below.
framework_dnn = Framework(cfg)


@monitor(framework_dnn)
def model_predict(model, inputs):
    """Run ``model.predict`` on ``inputs['data']`` with stdout sent to a log file.

    Anything printed to stdout during prediction is written to
    ``evaluation_logs.txt``. The file is opened in write mode on every call,
    so it only holds the output of the most recent call.
    """
    with open('evaluation_logs.txt', 'w') as log_file, redirect_stdout(log_file):
        predictions = model.predict(inputs['data'])
    return predictions

Deleting the folder:  oodles_smart_data


In [8]:
model_dir = 'trained_models_dnn/'
model_save_name = 'version_0'
real_world_dataset = KpsDataset(
    real_world_test_cases, batch_size=inference_batch_size, shuffle=False, augmentations=False, is_test=True
)
model = tf.keras.models.load_model(os.path.join(model_dir, model_save_name))
gt_data = read_json(annotation_args['master_file'])
# Kept so that any later cell relying on this name still finds it.
all_gt_ids = [x['id'] for x in gt_data]

# Map each id to its ground truth once, instead of scanning the id list for every
# sample. `setdefault` keeps the first record per id, matching what `list.index`
# would have selected.
# NOTE(review): assumes ids are hashable (e.g. str/int) — confirm against KpsDataset.
# An unknown id now raises KeyError (was ValueError).
gt_by_id = {}
for record in gt_data:
    gt_by_id.setdefault(record['id'], record['gt'])

for elem in real_world_dataset:
    batch = elem[0]

    # Do model prediction
    preds, idens = model_predict(model, {"data": batch["data"], "id": batch["id"]})

    # Attach ground truth
    this_elem_gt = [gt_by_id[x] for x in batch['id']]
    framework_dnn.attach_ground_truth({'id': idens, 'gt': np.array(this_elem_gt)})

    # Retrain only once
    if framework_dnn.version > 1:
        break

50  edge-cases collected out of  1855  inferred samples
100  edge-cases collected out of  3620  inferred samples
150  edge-cases collected out of  5274  inferred samples
200  edge-cases collected out of  6929  inferred samples
250  edge-cases collected out of  8654  inferred samples
Kicking off re-training
251 data-points selected out of 8676
Training on:  oodles_smart_data/1/training_dataset.json  which has  2255  data-points
Trained model exists. Skipping training again.
Model retraining done...
Generating comparison report...
Training on:  data/training_data.json  which has  1000  data-points
Trained model exists. Skipping training again.
Evaluating on  15731  data-points
Evaluating on  15731  data-points
---------------------------------------------
---------------------------------------------
Old model accuracy:  0.22382556735109022
Retrained model accuracy (ie 251 smartly collected data-points added):  0.6579365583878966
---------------------------------------------
------------