# Using tf.similarity on iris dataset

This tutorial shows how to use the tf.similarity package on the iris dataset.

## Setup

In [None]:
# run this cell if you want to suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# imports
from absl import app, flags
from tensorflow_similarity.api.engine.simhash import SimHash
from tensorflow_similarity.api.engine.augmentation import Augmentation
import numpy as np
import six
import tabulate
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import sklearn


In [None]:
# check tensorflow version, it should be 2.x
import tensorflow as tf
print(tf.__version__)

!pip install --upgrade keras-tuner
import kerastuner

### Read in iris dataset

In [None]:
# Relative datapath to the downloaded iris dataset
DEFAULT_IRIS_DATA_PATH = "iris.csv"

In [None]:
def read_iris_data(data_path):
    """Read the iris CSV file and split it into training and testing sets.

    Each non-blank line must contain comma-separated float features
    followed by an integer class label in the last column. Every tenth
    line (indices 0, 10, 20, ...) is put into the testing set, i.e.
    roughly 10% of the data; all other lines go to the training set.
    Blank lines (such as a trailing newline at the end of the file)
    are skipped and do not count towards the line index.

    Args:
        data_path: A string that points to the iris dataset.

    Returns:
        A tuple that contains two elements. The first element
        is a tuple that contains data used for training and
        the second element is a tuple that contains data used
        for testing. Both tuples have the same structure: the
        first element is a dictionary with the single key
        'example' holding the flower measurements (2d np array),
        the second element is an np array of class labels.
        For example:

        (
          ({'example': [[0,1,3,4],[2,1,3,5]]}, [0,2]),
          ({'example': [[0,2,3,5],[2,1,4,5]]}, [1,2])
        )

    Raises:
        ValueError: If a non-blank line cannot be parsed as floats
            followed by an integer label.
    """

    # Only the read needs the open file handle; parsing happens afterwards.
    with tf.io.gfile.GFile(data_path, "r") as f:
        # Drop blank lines, which would otherwise fail in int()/float().
        rows = [line for line in f.readlines() if line.strip()]

    x_train = []
    y_train = []
    x_test = []
    y_test = []
    for idx, row in enumerate(rows):
        tokens = row.split(",")
        y = int(tokens[-1])
        x = [float(token) for token in tokens[:-1]]

        # Hold out every tenth example for testing.
        if idx % 10 == 0:
            x_test.append(x)
            y_test.append(y)
        else:
            x_train.append(x)
            y_train.append(y)

    x_train = {"example": np.array(x_train)}
    x_test = {"example": np.array(x_test)}

    return ((x_train, np.array(y_train)), (x_test, np.array(y_test)))

In [None]:
def create_targets(x_test, y_test):
    """Creates targets from the test dataset.

    The examples are grouped by their label (the value in y_test) and
    the element-wise mean of each group becomes the target for that
    label. The set of labels is taken from y_test itself, so any
    number of classes is supported and a class without test examples
    simply produces no target.

    Args:
        x_test: A dictionary that contains the key 'example'
            with the value of a 2d np array. For example,
            {"example": [[1,3,4,2], [2,1,4,7]]}
        y_test: A 1d np array containing the classification.
            For example,
            [0,1]

    Returns:
        x_targets: A dictionary that contains the single key
            'example' with the value of a 2d np array holding one
            row (the class mean) per label found in y_test.
        y_targets: A 1d np array containing the labels, in ascending
            order, matching the rows of x_targets.
    """

    # Group the examples by label; labels are discovered from the data
    # instead of being hard-coded.
    by_label = {}
    for x, y in zip(x_test["example"], y_test):
        by_label.setdefault(y, []).append(x)

    # Sort so the output order is deterministic and independent of the
    # order in which labels first appear.
    labels = sorted(by_label)
    means = [np.mean(by_label[label], axis=0) for label in labels]

    x_targets = {"example": np.array(means)}
    y_targets = np.array(labels)

    return x_targets, y_targets

In [None]:
def get_iris_data():
    """Load the iris splits and derive the targets from the test split.

    Returns:
        A 3-tuple of (x, y) pairs: the training data, the testing data
        and the targets computed by create_targets on the testing data.
    """

    train_split, test_split = read_iris_data(DEFAULT_IRIS_DATA_PATH)

    x_train, y_train = train_split
    x_test, y_test = test_split
    x_targets, y_targets = create_targets(x_test, y_test)

    return (x_train, y_train), (x_test, y_test), (x_targets, y_targets)

### Define tower models and custom augmentation

In [None]:
def simple_model():
    """Build the small dense tower model used for the iris dataset.

    Returns:
        model: A Keras model whose input, named 'example', has 4
            features. It is followed by three tanh hidden layers
            with 10, 8 and 6 units, and a final Dense layer with
            3 units that is given no activation argument.
    """

    inputs = Input(shape=(4,), name='example')

    # Stack the hidden layers in order, feeding each one the previous output.
    hidden = inputs
    for units in (10, 8, 6):
        hidden = Dense(units, activation='tanh')(hidden)

    outputs = Dense(3)(hidden)
    return Model(inputs, outputs)

In [None]:
class Fuzz(Augmentation):
    """An Augmentation that adds a small random perturbation to the data."""

    def augment(self, x):
        """Return the 'example' data shifted by small uniform noise.

        Args:
            x: A dictionary whose 'example' entry is an np array.

        Returns:
            A new dictionary with the single key 'example' holding the
            perturbed array; every value moves by at most 0.005.
        """

        examples = x["example"]

        # random_sample yields values in [0, 1); scaling by FUZZ and
        # subtracting half of it centres the offsets around zero.
        FUZZ = .01
        offsets = np.random.random_sample(examples.shape) * FUZZ - FUZZ / 2.0

        return {"example": examples - offsets}

### Helper methods

In [None]:
def display_metrics(train_metrics, test_metrics):
    """Print the training and testing metrics as two tables.

    Args:
        train_metrics: Dictionary mapping metric name to metric value
            for the training dataset.
        test_metrics: Dictionary mapping metric name to metric value
            for the testing dataset.
    """

    # Flatten both dictionaries up front, before anything is printed.
    train_rows = [(name, value) for name, value in six.iteritems(train_metrics)]
    test_rows = [(name, value) for name, value in six.iteritems(test_metrics)]

    headers = ["Metric", "Value"]
    for title, rows in (("TRAINING", train_rows), ("TEST", test_rows)):
        print("")
        print(title)
        print(tabulate.tabulate(rows, headers))

## Example usage 1: basic usage

In [None]:
def basic_similarity_run(data, strategy, tower_model, epochs):
    """A basic example usage of tf.similarity using iris dataset.

    This basic similarity run first unpacks the training, testing,
    and target data from the arguments, then wraps the tower model in
    a SimHash model, fits it on the training data with an early
    stopping callback, and finally evaluates it on both the training
    and testing datasets against the targets.

    Args:
        data: Tuple of three (x, y) pairs: the training, testing, and
            target datasets, in that order.
        strategy: String, passed to SimHash as its `strategy` argument
            (for example "hard_quadruplet_loss").
        tower_model: tf.Model, the tower model to wrap in SimHash.
        epochs: Integer, number of epochs to fit the SimHash model.

    Returns:
        moirai_model: SimHash, the fitted model.
        train_metrics: Dictionary, containing metrics performed on the
            training dataset. The key is the name of the metric and the
            value is the np array of the metric values.
        test_metrics: Dictionary, containing metrics performed on the
            testing dataset. The key is the name of the metric and the
            value is the np array of the metric values.
    """

    # unpackage data
    (x_train, y_train), (x_test, y_test), (x_targets, y_targets) = data

    print("Initial tower model summary:")
    tower_model.summary()

    # `learning_rate` is the supported argument name; the `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    moirai_model = SimHash(
        tower_model,
        augmentation=Fuzz(),
        optimizer=Adam(learning_rate=0.001),
        strategy=strategy)

    # Stop training once the loss has not improved for 50 epochs.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor="loss", mode='min', min_delta=0.00000001, patience=50)

    callbacks = [early_stopping_callback]

    moirai_model.fit(
        x_train,
        y_train,
        epochs=epochs,
        verbose=1,
        callbacks=callbacks,
    )

    train_metrics = moirai_model.evaluate(x_train, y_train, x_targets, y_targets)
    test_metrics = moirai_model.evaluate(x_test, y_test, x_targets, y_targets)

    return moirai_model, train_metrics, test_metrics

In [None]:
# Load the (train, test, targets) datasets and build a fresh tower model.
data = get_iris_data()
tower_model = simple_model()
# Strategy name handed to SimHash by basic_similarity_run.
strategy = "hard_quadruplet_loss"
# Number of epochs passed to fit(); kept small so the example runs quickly.
epochs = 5

# Train and evaluate; the metrics are displayed in the next cell.
basic_moirai_model, train_metrics, test_metrics = basic_similarity_run(data, strategy, tower_model, epochs)

In [None]:
# Print the training and testing metrics returned by the run above as tables.
display_metrics(train_metrics, test_metrics)

## Example usage 2: With Visualization Callback

In [None]:
# additional imports
import datetime
from tensorflow_similarity.api.callbacks.metrics_callbacks import MetricsCallback
from tensorflow_similarity.api.callbacks.plugins import ConfusionMatrixCallbackPlugin

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Clears any logs from previous runs (comment out the line below to keep them)
!rm -rf ./logs/

In [None]:
def similarity_run_with_visualization(data, strategy, tower_model, epochs):
    """A basic example usage of tf.similarity using iris dataset with visualization callbacks.

    This similarity run first unpacks the training, testing, and
    target data from the arguments, then wraps the tower model in a
    SimHash model and fits it on the training data. In addition to
    early stopping, a MetricsCallback with a confusion matrix plugin
    is attached; the plugin is given a timestamped directory under
    "logs/" so the results can be inspected with TensorBoard. Finally
    the model is evaluated on the training and testing datasets.

    Args:
        data: Tuple of three (x, y) pairs: the training, testing, and
            target datasets, in that order.
        strategy: String, passed to SimHash as its `strategy` argument
            (for example "hard_quadruplet_loss").
        tower_model: tf.Model, the tower model to wrap in SimHash.
        epochs: Integer, number of epochs to fit the SimHash model.

    Returns:
        moirai_model: SimHash, the fitted model.
        train_metrics: Dictionary, containing metrics performed on the
            training dataset. The key is the name of the metric and the
            value is the np array of the metric values.
        test_metrics: Dictionary, containing metrics performed on the
            testing dataset. The key is the name of the metric and the
            value is the np array of the metric values.
    """

    # unpackage data
    (x_train, y_train), (x_test, y_test), (x_targets, y_targets) = data

    print("Initial tower model summary:")
    tower_model.summary()

    # `learning_rate` is the supported argument name; the `lr` alias is
    # deprecated and rejected by newer Keras optimizers.
    moirai_model = SimHash(
        tower_model,
        augmentation=Fuzz(),
        optimizer=Adam(learning_rate=0.001),
        strategy=strategy)

    # Stop training once the loss has not improved for 50 epochs.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor="loss", mode='min', min_delta=0.00000001, patience=50)

    # Use a timestamped directory so each run gets its own set of logs.
    log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    confusion_matrix_log_dir = log_dir + "/confusion_matrix"

    # The metrics callback receives the test data and the targets.
    confusion_matrix_plugin = ConfusionMatrixCallbackPlugin(confusion_matrix_log_dir)
    metrics_callbacks = MetricsCallback(
        [confusion_matrix_plugin],
        x_test,
        y_test,
        x_targets,
        y_targets)

    callbacks = [early_stopping_callback, metrics_callbacks]

    moirai_model.fit(
        x_train,
        y_train,
        epochs=epochs,
        verbose=1,
        callbacks=callbacks,
    )

    train_metrics = moirai_model.evaluate(x_train, y_train, x_targets, y_targets)
    test_metrics = moirai_model.evaluate(x_test, y_test, x_targets, y_targets)

    return moirai_model, train_metrics, test_metrics

In [None]:
# Reload the datasets and build a new, untrained tower model for this run.
data = get_iris_data()
tower_model = simple_model()
# Strategy name handed to SimHash by similarity_run_with_visualization.
strategy = "hard_quadruplet_loss"
# Number of epochs passed to fit(); kept small so the example runs quickly.
epochs = 5

# NOTE: this rebinds basic_moirai_model, train_metrics and test_metrics,
# replacing the values produced by the first example.
basic_moirai_model, train_metrics, test_metrics = similarity_run_with_visualization(data, strategy, tower_model, epochs)

In [None]:
%tensorboard --logdir logs

In [None]:
# Print the training and testing metrics from the visualization run as tables.
display_metrics(train_metrics, test_metrics)