In [1]:
from deepview import DeepView
import matplotlib.pyplot as plt
import numpy as np
import time
# ---------------------------
import demo_utils as demo

%load_ext autoreload
%autoreload 2
%matplotlib qt

In [2]:
# matplotlib qt seems to be a bit buggy with notebooks, so we execute it multiple times
%matplotlib qt

## Getting data and models
 
Each section in this notebook can be run independently: at the beginning of each section, the corresponding model (e.g. torch/knn/decision tree) and the dataset are initialized. The reason for this is that running both torch and tensorflow simultaneously on the GPU may lead to problems.
This notebook tests the DeepView framework on different classifiers

 * ResNet-20 on CIFAR10
 * DecisionTree on MNIST
 * RandomForest on MNIST
 * KNN on MNIST

---

## DeepView Usage Instructions

 1. Create a wrapper function like ```pred_wrapper``` which receives a numpy array of samples and returns the corresponding class probabilities from the classifier as a numpy array
 2. Initialize DeepView-object and pass the created method to the constructor
 3. Run your code and call ```add_samples(samples, labels)``` at any time to add samples to the visualization together with the ground truth labels.
    * The ground truth labels will be visualized along with the predicted labels
    * The object will keep track of a maximum number of samples specified by ```max_samples``` and it will throw away the oldest samples first
 4. Call the ```show``` method to render the plot

The following parameters must be specified on initialization:

| <p align="left">Variable               | <p align="left">Meaning           |
|------------------------|-------------------|
| <p align="left">(!)```pred_wrapper```     | <p align="left">Wrapper function allowing DeepView to use your model. Expects a single argument, which should be a batch of samples to classify. Returns (valid / softmaxed) prediction probabilities for this batch of samples. |
| <p align="left">(!)```classes```          | <p align="left">Names of all different classes in the data. |
| <p align="left">(!)```max_samples```      | <p align="left">The maximum amount of samples that DeepView will keep track of. When more samples are added, the oldest samples are removed from DeepView. |
| <p align="left">(!)```batch_size```       | <p align="left">The batch size used for classification |
| <p align="left">(!)```data_shape```       | <p align="left">Shape of the input data (complete shape; excluding the batch dimension) |
| <p align="left">```resolution```       | <p align="left">x- and y- Resolution of the decision boundary plot. A high resolution will compute significantly longer than a lower resolution, as every point must be classified, default 100. |
| <p align="left">```cmap```             | <p align="left">Name of the colormap that should be used in the plots, default 'tab10'. |
| <p align="left">```interactive```      | <p align="left">When ```interactive``` is True, this method is non-blocking to allow plot updates. When ```interactive``` is False, this method is blocking to prevent termination of python scripts, default True. |
| <p align="left">```title```            | <p align="left">Title of the deepview-plot. |
| <p align="left">```data_viz```         | <p align="left">DeepView has a reactive plot that responds to mouse clicks and shows the corresponding data sample when it is clicked. You can pass a custom visualization function; if ```data_viz``` is None, DeepView will try to show each sample as an image, if possible. (optional, default None)  |
| <p align="left">```mapper```           | <p align="left">An object that maps samples from the data space to 2D space. Normally UMAP is used for this, but you can pass a custom mapper as well. (optional)  |
| <p align="left">```inv_mapper```       | <p align="left">An object that maps samples from the 2D space to the data space. Normally ```deepview.embeddings.InvMapper``` is used for this, but you can pass a custom inverse mapper as well. (optional)  |
| <p align="left">```kwargs```       | <p align="left">Configuration for the embeddings in case they are not specifically given in mapper and inv_mapper. Defaults to ```deepview.config.py```.  (optional)  |

## Demo with Torch model

In [2]:
import torch

# Pick the first CUDA GPU if one is available, otherwise fall back to the CPU.
# Set to 'cpu' or 'cuda:0' to choose the device manually.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# build the demo torch model for the chosen device
# (the helper prints a short summary of the created model, see output below)
torch_model = demo.create_torch_model(device)
# get the CIFAR-10 data; samples are later indexed as testset[i][0] (image) and testset[i][1] (label)
testset = demo.make_cifar_dataset()

print('\nUsing device:', device)

Created PyTorch model:	 ResNet
 * Dataset:		 CIFAR10
 * Best Test prec:	 91.78000183105469
Files already downloaded and verified

Using device: cuda:0


In [4]:
# softmax over the last axis, applied to the raw model output in pred_wrapper
softmax = torch.nn.Softmax(dim=-1)

def pred_wrapper(x):
    """Prediction wrapper that lets DeepView query ``torch_model``.

    Casts the incoming batch ``x`` to a float32 numpy array, moves it to
    ``device`` as a tensor, runs the model with gradient tracking disabled
    and returns the softmaxed model outputs as a numpy array on the CPU.
    """
    with torch.no_grad():
        batch = torch.from_numpy(np.array(x, dtype=np.float32)).to(device)
        class_probs = softmax(torch_model(batch))
        return class_probs.cpu().numpy()

def visualization(image, point2d, pred, label=None, title=None):
    """Show one sample as an image in a new matplotlib figure.

    ``image`` is transposed with axes ``[1, 2, 0]`` before plotting, i.e. a
    channel-first array is displayed channel-last. ``title`` becomes the axes
    title. ``point2d``, ``pred`` and ``label`` are accepted but not used.
    """
    fig, ax = plt.subplots()
    ax.imshow(image.transpose([1, 2, 0]))
    ax.set_title(title)

# the classes in the dataset to be used as labels in the plots
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# --- Deep View Parameters ----
batch_size = 512          # batch size used for classification
max_samples = 500         # oldest samples are dropped once more than this many were added
data_shape = (3, 32, 32)  # shape of one sample, excluding the batch dimension
n = 5                     # NOTE(review): meaning of n is not documented in this notebook - confirm against DeepView
lam = .65                 # lambda: weight of the Euclidean distance component
resolution = 100          # x/y resolution of the decision boundary plot
cmap = 'tab10'            # name of the matplotlib colormap
title = 'ResNet-20 - CIFAR10'

# data_viz=None: DeepView tries to show a clicked sample as an image by itself
# (the custom `visualization` function defined above is not passed here)
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, 
                    data_shape, n, lam, resolution, cmap, title=title, data_viz=None)

In [5]:
n_samples = 150
# random test-set indices (np.random.choice draws with replacement by default)
sample_ids = np.random.choice(len(testset), n_samples)
# look every chosen item up once, then split it into image array and label
drawn = [testset[idx] for idx in sample_ids]
X = np.array([item[0].numpy() for item in drawn])
Y = np.array([item[1] for item in drawn])

# time adding the samples plus rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()
elapsed = time.time() - t0

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, elapsed))
# 236.5 sec for 150 samples on laptop 4800U
# 26.5 sec for 150 samples on desktop with GPU 12400 RTX 3090

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 150 samples: 25.68 sec


## Add new samples to the visualization

In [6]:
n_new = 200

# random test-set indices (np.random.choice draws with replacement by default)
sample_ids = np.random.choice(len(testset), n_new)
# look every chosen item up once, then split it into image array and label
drawn = [testset[idx] for idx in sample_ids]
X = np.array([item[0].numpy() for item in drawn])
Y = np.array([item[1] for item in drawn])

# time adding the new samples to the existing ones plus re-rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()
elapsed = time.time() - t0

print('Time to add %d samples to visualization: %.2f sec' % (n_new, elapsed))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to add 200 samples to visualization: 62.68 sec


### Example output

As the plot is updatable, it is shown in a separate Qt window. With the CIFAR data and the model loaded above, the following plot was produced after 200 samples were added:

**Hyperparameters:**
n = 10
lam = 0.2
resolution = 100

![sample_plot](https://user-images.githubusercontent.com/30961397/72370639-fbab6f00-3702-11ea-98f4-0dc7335777fc.png)

## Tuning the $\lambda$-Hyperparameter

> The $\lambda$-Hyperparameter weights the Euclidean distance component.
> When the visualization doesn't show class-clusters, **try a smaller lambda** to put more emphasis on the discriminative distance component that considers the class.
> A smaller $\lambda$ will pull the datapoints further into their class-clusters.
> Therefore, a **too small $\lambda$** can lead to collapsed clusters that don't represent any structural properties of the datapoints. Of course this behaviour also depends on the data and how well the label corresponds to certain structural properties.

Due to the separate handling of Euclidean and class-discriminative distances, the $\lambda$ parameter can easily be adjusted. Distances don't need to be recomputed, only the embeddings and therefore also the plot of the decision boundary.

In [10]:
# change lambda on the existing DeepView instance and redraw; as the output
# below shows, only the embedding and the decision regions are recomputed
deepview.set_lambda(.7)
deepview.show()

Embedding samples ...
Computing decision regions ...


## Compare performance

For this test, DeepView was run on a GPU (GTX 2060 6GB).
Adding samples may be a bit more time-consuming than just running DeepView on the desired number of samples to be visualized. This is because the decision boundaries must be calculated twice with a similar time complexity. However, the step of adding 100 samples to 100 existing samples takes less time than computing it from scratch for 200 samples. This is because distances were already computed for half of the samples and can be reused.

| <p align="left">Scenario | Time |
| -------- | ---- |
| <p align="left">From scratch for 100 samples | 31.20 sec |
| <p align="left">Adding 100 samples (100 already added) | 66.89 sec |
| <p align="left">From scratch for 200 samples | 71.16 sec |
| <p align="left">200 samples when adding 100 samples in two steps | 98.19 sec |

In [7]:
# start from an empty visualization so the timing is "from scratch"
deepview.reset()

n_samples = 200
# random test-set indices (np.random.choice draws with replacement by default)
sample_ids = np.random.choice(len(testset), n_samples)
# look every chosen item up once, then split it into image array and label
drawn = [testset[idx] for idx in sample_ids]
X = np.array([item[0].numpy() for item in drawn])
Y = np.array([item[1] for item in drawn])

t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()
elapsed = time.time() - t0

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, elapsed))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 200 samples: 31.88 sec


# Evaluate 

These evaluations can be run with an initialized instance of DeepView.

In [8]:
from deepview.evaluate import evaluate_umap

# evaluate the current DeepView instance (it must already hold samples);
# evaluate_umap prints its result itself, see the output below
print('Evaluation of DeepView: %s\n' % deepview.title)
evaluate_umap(deepview)

Evaluation of DeepView: ResNet-20 - CIFAR10

error of a knn classifier in projecion space using classifier labels:  0.07


## Evaluate the Inverse Mapping

Evaluation of the inverse mapping (i.e. the mapping from 2D back into sample space) is done by first passing some training samples to DeepView. It will classify them with the given model, train the mappers (UMAP and inverse) on them, and embed them into 2D space.
A fraction of the embedded samples will be used to train the inverse mapper from the ground up. After reconstructing the same set of samples, they will be classified again. The predictions are compared against the prior predictions from DeepView and used to calculate the train accuracy.

The remaining samples are used as test samples; they were not used during training of the inverse mapper. They are mapped back into sample space as well and classified, and these classifications are used to calculate the test accuracy of the inverse mapper.

> **To run this cell**, run [Demo with Torch model](#Demo-with-Torch-model) first, as the evaluation is done on the CIFAR dataset

In [9]:
from deepview.evaluate import evaluate_inv_umap

# draw fresh samples for the evaluation;
# a fraction of these will serve as training set for the evaluation
# NOTE(review): this cell does not call deepview.reset() itself - presumably
# evaluate_inv_umap takes care of that; confirm. Also note that n_samples
# exceeds the max_samples=500 the DeepView instance was created with.
n_samples = 600
fraction = 0.7

sample_ids = np.random.choice(len(testset), n_samples)
X = np.array([ testset[i][0].numpy() for i in sample_ids ])
Y = np.array([ testset[i][1] for i in sample_ids ])

# returns the accuracy on the training fraction and on the remaining samples
train_acc, test_acc = evaluate_inv_umap(deepview, X, Y, fraction)

# both values are printed as percentages
print('Inverse-Mapper train accuracy:\t%.2f%%' % train_acc)
print('Inverse-Mapper test accuracy:\t%.2f%%' % test_acc)

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Inverse-Mapper train accuracy:	82.38%
Inverse-Mapper test accuracy:	78.89%


In [11]:
deepview.close()

## Demo with Tensorflow <small>And visualizing intermediate embeddings</small>

This demo shows the usage of DeepView for tensorflow models (it doesn't differ at all from the procedure with torch models). However, this demo also shows how to feed intermediate embeddings of the data to DeepView. To do so, we only need to encode the datapoints before feeding them to DeepView. We proceed as follows:

 1. Create a model that provides access to intermediate embeddings (i.e. output of some hidden layer) 
 2. Train the model, the example here is a simple feed forward neural network that reaches roughly 93.5% training accuracy
 3. Encode the datapoints with the first layer(s) of the neural network into an embedding
 4. Instantiate DeepView 
     1. The prediction wrapper now needs to be model_head, because the data samples will already be embedded by the first part of the model.
     2. Instead of the raw data samples, feed the embedded data as input to DeepView

In [12]:
import tensorflow as tf
import demo_utils_tensorflow as demo_tf


# verbosity passed to model.fit below
verbose = 1

# get MNIST dataset
digits_X, digits_y = demo.make_digit_dataset()

# create the tensorflow models:
# model_embd will encode images to an intermediate embedding
# model_head will predict classes from the intermediate embedding
# model is model_embd and model_head combined into one model for training
model_embd, model_head, model = demo_tf.create_tf_model_intermediate()

# train the combined model; labels are integer class ids (sparse categorical loss)
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
_ = model.fit(digits_X, digits_y, batch_size=8, epochs=5, verbose=verbose)

Epoch 1/5


2023-02-23 17:40:24.763940: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 17:40:24.764161: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2023-02-23 17:40:24.764213: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory
2023-02-23 17:40:24.764244: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2023-02-23 17:40:24.764273: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could no

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# Pick random samples and hand their intermediate embeddings to DeepView
n_samples = 300
sample_ids = np.random.choice(len(digits_X), n_samples)

# Encode all digits with the embedding part of the model (model_embd)
embedded_digits = model_embd.predict(digits_X, batch_size=64)

# select the drawn rows by index array instead of per-element lookups
X = np.asarray(embedded_digits)[sample_ids]
Y = np.asarray(digits_y)[sample_ids]

In [16]:
# note that here, the head (last layers) must be used in the prediction wrapper,
# as we want to pass the embedded data to deepview
pred_wrapper = DeepView.create_simple_wrapper(model_head)

# the digit dataset is used, so classes are [0..9]
classes = np.arange(10)

# --- Deep View Parameters ----
batch_size = 64
max_samples = 500
# shape of one embedded sample as fed to DeepView
# NOTE(review): assumes model_embd outputs 64-dimensional vectors - confirm
sample_shape = (64,)
n = 10
lam = 0.5
resolution = 100
cmap = 'tab10'
title = 'TF-Model - Embedded MNIST'

# create DeepView object
# NOTE(review): data_viz receives the embedded vectors here, not the original
# digit images - verify that demo.mnist_visualization is meaningful for them
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, sample_shape, 
                    n, lam, resolution, cmap, title=title, data_viz=demo.mnist_visualization)

# X, Y are the embedded samples and labels prepared in the previous cell
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, time.time() - t0))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 300 samples: 13.83 sec


### Just an embedding ...

To visualize the embedding purely based on the Euclidean distances between the embedded vectors, you can use $\lambda = 1$. In this case DeepView will ignore the Fisher distance from the probabilities and produce just a 2D representation of the embedded vectors. This corresponds to applying UMAP on the data-embedding.

In [17]:
# lambda = 1: use only the Euclidean distance between the embedded vectors
deepview.set_lambda(1.)
deepview.show()

Embedding samples ...
Computing decision regions ...


In [18]:
deepview.close()

## Demo with RandomForest

In [19]:
# get MNIST dataset (features and labels)
digits_X, digits_y = demo.make_digit_dataset()
# initialize random forest with 100 estimators on the full dataset
# (the helper prints a summary including the train score, see output below)
random_forest = demo.create_random_forest(digits_X, digits_y, n_estimators=100)

Created random forest
 * No. of Estimators:	 100
 * Dataset:		 MNIST
 * Train score:		 1.0


In [20]:
# wrap predict_proba so DeepView can query class probabilities from the forest
pred_wrapper = DeepView.create_simple_wrapper(random_forest.predict_proba)

# the digit dataset is used, so classes are [0..9]
classes = np.arange(10)

# --- Deep View Parameters ----
batch_size = 64
max_samples = 500
# each sample is passed as a flat vector of 64 values
sample_shape = (64,)
n = 10
lam = 0.5
resolution = 100
cmap = 'tab10'
title = 'RandomForest - MNIST'

# create DeepView object
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, sample_shape, 
                    n, lam, resolution, cmap, title=title, data_viz=demo.mnist_visualization)

# add data samples
# (np.random.choice draws with replacement by default, so duplicates are possible)
n_samples = 50
sample_ids = np.random.choice(len(digits_X), n_samples)
X = np.array([ digits_X[i] for i in sample_ids ])
Y = np.array([ digits_y[i] for i in sample_ids ])

# time adding the samples plus rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, time.time() - t0))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 50 samples: 4.30 sec


![random_forest](https://user-images.githubusercontent.com/30961397/78502477-a6ab5200-7761-11ea-8be3-e0b4c8e6a966.png)

In [21]:
deepview.close()

## Demo with DecisionTree

In [22]:
# get MNIST dataset (features and labels)
digits_X, digits_y = demo.make_digit_dataset()
# initialize decision tree with a maximum depth of 10 on the full dataset
# (the helper prints a summary including the train score, see output below)
decision_tree = demo.create_decision_tree(digits_X, digits_y, max_depth=10)

Created decision tree
 * Depth:		 10
 * Dataset:		 MNIST
 * Train score:		 0.9821925431274346


In [23]:
# --- Deep View Parameters ----
# NOTE: the KNN section further below reuses these variables as well
batch_size = 256
max_samples = 500
# the data can also be represented as a vector
sample_shape = (64,)
n = 10
lam = 0.65
resolution = 100
cmap = 'gist_ncar'

# the digit dataset is used, so classes are [0..9]
classes = np.arange(10)

In [24]:
# wrap predict_proba so DeepView can query class probabilities from the tree
pred_wrapper = DeepView.create_simple_wrapper(decision_tree.predict_proba)

# create DeepView object (parameters come from the previous cell; no title is passed)
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, sample_shape, 
                    n, lam, resolution, cmap, data_viz=demo.mnist_visualization)

# add data samples
# (np.random.choice draws with replacement by default, so duplicates are possible)
n_samples = 200
sample_ids = np.random.choice(len(digits_X), n_samples)
X = np.array([ digits_X[i] for i in sample_ids ])
Y = np.array([ digits_y[i] for i in sample_ids ])

# time adding the samples plus rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, time.time() - t0))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 200 samples: 5.80 sec


In [25]:
# lower lambda (was 0.65) to put more emphasis on the class-discriminative distance, then redraw
deepview.set_lambda(.4)
deepview.show()

Embedding samples ...
Computing decision regions ...


In [26]:
deepview.close()

## Demo: KNN-Classifier

In [27]:
# get MNIST dataset (features and labels)
digits_X, digits_y = demo.make_digit_dataset()
# initialize knn classifier with k=10 neighbors on the full dataset
# (the helper prints a summary including the train score, see output below)
kn_neighbors = demo.create_kn_neighbors(digits_X, digits_y, k=10)

Created knn classifier
 * No. of Neighbors:	 10
 * Dataset:		 MNIST
 * Train score:		 0.9855314412910406


In [28]:
# wrap predict_proba so DeepView can query class probabilities from the knn classifier
pred_wrapper = DeepView.create_simple_wrapper(kn_neighbors.predict_proba)

# --- Deep View Parameters ----
# Defined explicitly so that this section can be run on its own; previously
# these names leaked in from the DecisionTree section and the cell failed with
# a NameError on a fresh kernel. The values are the ones used there.
batch_size = 256
max_samples = 500
# the data can also be represented as a vector
sample_shape = (64,)
n = 10
lam = 0.65
resolution = 100
cmap = 'gist_ncar'
title = 'KNN - MNIST'

# the digit dataset is used, so classes are [0..9]
classes = np.arange(10)

# create DeepView object
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, sample_shape,
                    n, lam, resolution, cmap, title=title, data_viz=demo.mnist_visualization)

# add data samples
# (np.random.choice draws with replacement by default, so duplicates are possible)
n_samples = 200
sample_ids = np.random.choice(len(digits_X), n_samples)
X = np.array([ digits_X[i] for i in sample_ids ])
Y = np.array([ digits_y[i] for i in sample_ids ])

# time adding the samples plus rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, time.time() - t0))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 200 samples: 17.20 sec


![knn](https://user-images.githubusercontent.com/30961397/78502740-dc046f80-7762-11ea-82cf-efc8251539db.png)


In [29]:
deepview.close()

## Py test

In [33]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Load the PyTE table: 'type' is the class label, 'reference' is dropped
# together with it, and all remaining columns are used as features.
df = pd.read_csv('../pyrite_dbm_backup/data/PyTE.csv')

y = df['type']
X = df.copy().drop(['type', 'reference'], axis=1)
features = X.columns

# log10(x + 1) transform, then rescale every feature to [0, 1]
# NOTE(review): the scaler is fitted before the train/test split, so its
# statistics include the held-out rows. Harmless here because X_test is not
# evaluated, but fit it on X_train only if a test score is ever reported.
scaler0 = MinMaxScaler()
X = np.log10(X+1)
X = scaler0.fit_transform(X)

# encode the class names as integer ids; le.classes_ keeps the names in id order
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43, stratify=y)

# probability=True is required so that predict_proba is available for DeepView
clf = SVC(probability=True)
clf.fit(X_train, y_train)

pred_wrapper = DeepView.create_simple_wrapper(clf.predict_proba)

# class names taken from the label encoder instead of a hard-coded count
classes = le.classes_

# --- Deep View Parameters ----
batch_size = 64
max_samples = 1000
# shape of one sample, derived from the data instead of a hard-coded (11,)
sample_shape = X_train.shape[1:]
n = 10
lam = 0.9
resolution = 200
cmap = 'Set2'
# the classifier fitted above is an SVC, so the title names it accordingly
title = 'SVC - Py'

# create DeepView object
# NOTE(review): demo.mnist_visualization is reused from the digit demos; the
# samples here are tabular feature vectors - confirm it can display them, or
# pass data_viz=None instead.
deepview = DeepView(pred_wrapper, classes, max_samples, batch_size, sample_shape,
                    n, lam, resolution, cmap, title=title, data_viz=demo.mnist_visualization)

# add data samples: draw random training rows (with replacement, the
# np.random.choice default) and select them with one index operation
n_samples = 200
sample_ids = np.random.choice(len(X_train), n_samples)
X = X_train[sample_ids]
Y = y_train[sample_ids]

# time adding the samples plus rendering the plot
t0 = time.time()
deepview.add_samples(X, Y)
deepview.show()

print('Time to calculate visualization for %d samples: %.2f sec' % (n_samples, time.time() - t0))

Distance calculation 20.00 %
Distance calculation 40.00 %
Distance calculation 60.00 %
Distance calculation 80.00 %
Distance calculation 100.00 %
Embedding samples ...
Computing decision regions ...
Time to calculate visualization for 200 samples: 33.42 sec


In [2]:
import numpy as np
# Four corner points of a quadrilateral in the 2D plane.
# NOTE(review): the trailing numbers are kept from the original; presumably
# they name the class found at that corner - TODO confirm.
corners = np.array([
    [-5, -10],  # 1
    [-7, 6],  # 7
    [2, -8],  # 2
    [12, 4],  # 0
])

# 10 x 10 grid of bilinear interpolation weights; with indexing='ij' the first
# grid (y weight) varies slowest and the second (x weight) fastest, which is
# the same ordering as a nested loop over y (outer) and x (inner).
steps = np.linspace(0, 1, 10)
frac_y, frac_x = (grid.reshape(-1, 1) for grid in np.meshgrid(steps, steps, indexing='ij'))

# blend corners 0->1 and 2->3 along x, then blend both results along y
edge_a = corners[0]*(1-frac_x) + corners[1]*frac_x
edge_b = corners[2]*(1-frac_x) + corners[3]*frac_x
test_pts = edge_a*(1-frac_y) + edge_b*frac_y

test_pts.shape

(100, 2)