In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Put the parent directory at the front of the module search path so that
# imports are resolved against it first.
sys.path.insert(0, "..")

In [None]:
# Install TruLens (the trulens_explain subdirectory of the GitHub repository)
# together with the TensorFlow-side packages this notebook relies on.
# Every command runs pip through `sys.executable`, i.e. the interpreter that
# backs this kernel, so packages land in the kernel's own environment.
# NOTE(review): the TruLens install tracks the repository's default branch and
# is not pinned to a tag or commit.
!{sys.executable} -m pip install "git+https://github.com/truera/trulens.git#subdirectory=trulens_explain"
!{sys.executable} -m pip install 'tensorflow-text~=2.11.0'
!{sys.executable} -m pip install 'tf-models-official~=2.11.0'
!{sys.executable} -m pip install 'protobuf==3.20.0'
!{sys.executable} -m pip install -U tensorflow-hub gdown opencv-python

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from IPython.display import HTML as html_print
import plotly.express as px

from trulens.nn.models import get_model_wrapper
from trulens.nn.attribution import InternalInfluence
from trulens.nn.slices import OutputCut, Slice, Cut
from trulens.nn.quantities import MaxClassQoI
from trulens.nn.distributions import LinearDoi
import gdown

# Limit TensorFlow's logger to ERROR-level messages, then print the installed
# TensorFlow version for reference.
tf.get_logger().setLevel("ERROR")
print(tf.__version__)

In [None]:
# Fetch the notebook resources archive by its file id with gdown and save it
# as resources.zip in the working directory (progress output is kept on).
gdown.download(
    id="1-bVFx-qU_kD7gGqV2E8ucRrV0LKFxHzB", output="resources.zip", quiet=False
)

In [None]:
# Unpack the downloaded archive into ./resources (overwriting files that
# already exist there), then delete the archive.
!mkdir -p resources
!unzip -o -d resources resources.zip
!rm resources.zip

# Loading the model
The notebook resources include a model checkpoint. The model uses the TensorFlow Hub [Text Preprocessing layer](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3) and [Small BERT layer](https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2), followed by several convolutional and fully connected layers.

The model has already been trained on a sentiment analysis task with the Covid-19 Tweets dataset.


In [None]:
# Directory name of the saved model inside ./resources.
model_name = "classifierbert-cnn"

# Load the checkpoint without compiling it (compile=False), then print the
# layer overview.
saved_model_dir = f"./resources/{model_name}"
model = tf.keras.models.load_model(saved_model_dir, compile=False)
model.summary()

## Model Vocabulary
We also load the vocabulary behind the model. This helps us translate our token IDs back into tokenized words.

In [None]:
# Load the vocabulary shipped with the model: entry i of `vocab` is the token
# whose ID is i, so the list order must match the file's line order exactly.
vocab_file = f"./resources/{model_name}/assets/vocab.txt"
# The encoding is set explicitly so the result does not depend on the
# platform's default encoding (assumes the file is UTF-8 -- TODO confirm).
with open(vocab_file, encoding="utf-8") as f:
    # Iterating over the file splits on newlines only. str.splitlines() would
    # also split on characters such as \x0b, \x0c, \x85 or \u2028, which would
    # shift every following token ID if one of them appeared in a token.
    vocab = [line.rstrip("\n") for line in f]

This model classifies the sentiment of tweets into 5 classes: positive, extremely positive, negative, extremely negative, or neutral. Let's try it out on some examples.

In [None]:
# Example tweets used throughout the notebook: they are classified by the
# model and then explained token by token. The strings are passed to the model
# exactly as written (including hashtags, URLs and the "&amp;" entity).
sentences = [
    "Fill up the fridge with enough food, ready medical supplies, water, avoid crowd, be updated with news, dont panic, work from home if feasible, boost immune system by drinking vitamins and always wash hands. Stay safe and healthy! #COVID2019 #metroManilaCovid",
    "Big thanks to all the retail, supermarket workers &amp; nurses out there. This is mental and the subsequent panic buying and rise in cases shows just how important they are #Covid_19",
    "I understand food being out of stock, but why toilet paper? what's up with that? #covid_19 #coronavirus",
    "This Friday the 13th is a nightmare for supermarket employees. People are panic buying a day after Duterte announced an NCR lockdown. Carts are filled w/ all sorts of noodles. I guess these Metro Manila residents will be on pancit canton/bihon diet for a month #Covid_19 https://t.co/33Bw2ZKnds",
    "Food, emergency supply stores struggle to meet demand #coronavirus #yzf https://t.co/wZ6yLBU2rl https://t.co/2Ef6Fy9u8y",
]

In [None]:
# Human-readable labels; the label at position i is reported when score i is
# the largest one in a prediction row.
classes = [
    "Extremely Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Extremely Positive",
]

# Run the model on the raw strings and convert the scores to a NumPy array.
predictions = model(tf.constant(sentences)).numpy()

# Report the highest-scoring class next to each input sentence.
for text, scores in zip(sentences, predictions):
    label = classes[np.argmax(scores)]
    print(f"Predicted {label}: '{text}'")

# Model Wrapper

As in the prior notebooks, we need to wrap the model with the appropriate TruLens functionality. As we are using a `tf.keras` model, the backend could be specified through the `backend` parameter; here we omit it and let `get_model_wrapper` choose the backend.

In [None]:
# Wrap the loaded Keras model in a TruLens model wrapper; the attribution
# method below is constructed on this wrapper rather than on the raw model.
k_model = get_model_wrapper(model)

# Attributions

The model takes in text as input, which gets tokenized in the `preprocessing` layer and translated into embeddings in the `BERT_encoder` layer. Since we cannot take the gradient with respect to the raw input text or tokenized text directly, we must use the embedding representation of our inputs.

Below, we can inspect the available layers in our model.

In [None]:
# Show what iterating the wrapper's (private) `_layers` attribute yields, as a
# plain list, so the embedding layer's name can be looked up.
list(k_model._layers)

## Parameters

Above, `BERT_encoder/bert_encoder/word_embeddings` is the layer that produces a continuous representation of each input token so we will use that layer as the one defining the **distribution of interest**. While most neural NLP models contain a token embedding, the layer name will differ.

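The second thing to note is the form of model outputs. Here the model returns the class scores directly as a tensor (the prediction cell above calls `.numpy()` on the output and takes the argmax of each row), so the attribution can target the model output directly via `OutputCut()`.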

Putting these things together, we instantiate `InternalInfluence` to attribute each embedding dimension to the maximum class (i.e. the predicted class).

In [None]:
# Layer whose activations carry the continuous token representation.
embedding_layer_name = "BERT_encoder/bert_encoder/word_embeddings"

# Slice running from the embedding layer (anchored at its output) to the
# model output.
embedding_to_output = Slice(Cut(embedding_layer_name, anchor="out"), OutputCut())

# Linear distribution of interest with resolution 10, defined on a cut at the
# same layer but anchored at its input.
embedding_doi = LinearDoi(
    resolution=10, cut=Cut(embedding_layer_name, anchor="in")
)

# Attribute the score of the maximum (i.e. predicted) class to the embedding.
infl = InternalInfluence(
    model=k_model,
    cuts=embedding_to_output,
    qoi=MaxClassQoI(),
    doi=embedding_doi,
)

We apply the preprocessing step to tokenize our input text. Using the model vocabulary, the token IDs (`sentence_encodings`) can be translated back into tokenized words (`tokens`).

In [None]:
# Define preprocessor
# Build a callable that maps the model's input to the output of its
# "preprocessing" layer, so raw strings can be tokenized the same way the
# model does it. Note that `preprocessing_layer` holds the layer's *output*
# (taken with get_output_at(-1)), not the layer object itself.
inp = model.input
preprocessing_layer = model.get_layer("preprocessing").get_output_at(-1)
pp_func = tf.keras.backend.function(inp, preprocessing_layer)

# The preprocessing output is indexed by key; "input_word_ids" is used as the
# per-sentence sequence of token IDs.
sentence_encodings = pp_func(tf.constant(sentences))["input_word_ids"]
# Translate every token ID into its vocabulary entry (vocab[i] is token i).
tokens = [[vocab[i] for i in sentence] for sentence in sentence_encodings]

Getting attributions uses the same call as model evaluation and returns a tensor. We can aggregate the attributions across the embedding dimension to get an approximate look at the influence of each token.

In [None]:
# Compute the attributions for all example sentences in one call.
attrs_internal = infl.attributions(np.array(sentences))
# Collapse axis 2 to obtain one score per token.
# Assumes the layout is (sentence, token, embedding dimension) -- TODO confirm.
total_attrs = attrs_internal.sum(axis=2)

# Visualizing Influences
Here we display visualizations that describe the influence of each token on the final prediction. 

In [None]:
def rgb_str(r, g, b):
    """Return a CSS ``rgb(r,g,b)`` colour string.

    Each channel is truncated to an integer and clamped to the valid CSS
    range 0-255, so callers that compute a channel as ``256 - intensity``
    (and may therefore pass 256) still get a well-formed colour.

    Args:
        r: Red channel value.
        g: Green channel value.
        b: Blue channel value.

    Returns:
        A string of the form ``"rgb(R,G,B)"`` with every channel in 0-255.
    """
    red, green, blue = (min(255, max(0, int(channel))) for channel in (r, g, b))
    return "rgb(%d,%d,%d)" % (red, green, blue)


def cstr(s, color="black", background="white"):
    """Wrap ``s`` in an inline HTML element with the given colours.

    The text is HTML-escaped, so tokens containing ``&``, ``<`` or ``>`` are
    shown literally instead of being interpreted as markup. The style
    attribute is quoted and a standard ``<span>`` element is used.

    Args:
        s: Text to display (converted with ``str``).
        color: CSS foreground colour.
        background: CSS background colour.

    Returns:
        An HTML snippet as a string.
    """
    # Imported locally so it cannot clash with a notebook-level variable
    # named `html`.
    from html import escape

    return '<span style="color:{};background-color:{}">{}</span>'.format(
        escape(str(color)), escape(str(background)), escape(str(s))
    )

In [None]:
# Build one HTML line per sentence: the predicted class followed by the
# sentence's tokens, each shaded by its attribution.
html = ""
for sentence_idx in range(len(sentences)):
    html += classes[np.argmax(predictions[sentence_idx])] + ": "

    # Define the coloring for each token. Green=positive, Red=negative.
    # Color intensity describes the magnitude of the influence in either direction.
    sentence_attrs = total_attrs[sentence_idx]
    max_imp = max(abs(sentence_attrs))
    rgbs = []
    for imp in sentence_attrs:
        # Scale to the valid 0-255 channel range. If every attribution in the
        # sentence is zero there is nothing to scale (and dividing would fail),
        # so all tokens stay uncoloured.
        normed_imp = int(imp / max_imp * 255) if max_imp > 0 else 0
        intensity = abs(normed_imp)
        if normed_imp > 0:  # green
            rgbs.append(rgb_str(255 - intensity, 255, 255 - intensity))
        else:  # red (white when the attribution is zero)
            rgbs.append(rgb_str(255, 255 - intensity, 255 - intensity))

    # Padding tokens carry no text, so they are left out of the display.
    for i, token in enumerate(tokens[sentence_idx]):
        if token != "[PAD]":
            html += cstr(token, "black", rgbs[i]) + " "
    html += "<br><br>"
html_print(html)

In [None]:
# One bar chart per sentence showing the attribution of every token.
for sentence_idx in range(len(sentences)):
    # Drop padding tokens so the chart only shows the sentence itself.
    kept = [
        (token, importance)
        for token, importance in zip(
            tokens[sentence_idx], total_attrs[sentence_idx]
        )
        if token != "[PAD]"
    ]
    df = pd.DataFrame(
        {
            "Position": range(len(kept)),
            "Tokens": [token for token, _ in kept],
            "Importance": [importance for _, importance in kept],
        }
    )
    # Plot against the token position rather than the token text: with the
    # text as a categorical x value, repeated tokens would share one x slot
    # and be stacked, losing the sentence order. The text is restored as the
    # tick labels below.
    fig = px.bar(df, x="Position", y="Importance", hover_name="Tokens")
    fig.update_layout(
        width=1200,
        height=300,
        xaxis=dict(
            title="Tokens",
            tickmode="array",
            tickvals=df["Position"].tolist(),
            ticktext=df["Tokens"].tolist(),
        ),
    )
    fig.show()