# Tutorial: Approximate Local Text Explanation (ALTE)
Explaining an IMDb movie-review text classification TensorFlow model locally, for a single data point, with Approximate Local Text Explanation (ALTE) via linear models.

In [1]:
import re
import string
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Embedding, Dropout, GlobalAveragePooling1D, Dense
from tensorflow.python.keras.losses import BinaryCrossentropy
from tensorflow.python.keras.metrics import BinaryAccuracy
from ate.base import ALTE, ATE_Options

In [2]:
# Settings shared by the input pipeline, the vectorizer and the classifier.
AUTOTUNE = tf.data.AUTOTUNE  # passed as the prefetch buffer size below
BATCH_SIZE = 32              # number of reviews per dataset batch
SEQUENCE_LENGTH = 250        # output_sequence_length of the TextVectorization layer
MAX_FEATURES = 10_000        # max_tokens of the TextVectorization layer
EMBEDDING_DIM = 16           # output dimension of the Embedding layer

In [3]:
# Load the IMDb review splits from TensorFlow Datasets and batch each of them.
raw_train_ds, raw_test_ds = (
    tfds.load('imdb_reviews', split=split_name).batch(BATCH_SIZE)
    for split_name in ('train', 'test')
)

Metal device set to: Apple M1 Pro


2023-03-21 23:02:28.504920: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-21 23:02:28.505005: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
def custom_standardization(input_data):
    """Normalise raw review text before it is tokenised.

    The input is lower-cased, every ``<br />`` tag is replaced by a single
    space, and every character contained in ``string.punctuation`` is removed.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Regex character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    return tf.strings.regex_replace(cleaned, punctuation_class, '')

In [5]:
# Layer mapping raw strings to fixed-length sequences of integer token ids.
# Text is cleaned with `custom_standardization`, the vocabulary is capped at
# MAX_FEATURES entries and every output has SEQUENCE_LENGTH positions.
vectorize_layer = tf.keras.layers.TextVectorization(
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
    max_tokens=MAX_FEATURES,
    standardize=custom_standardization,
)

In [6]:
# Fit the vectorizer's vocabulary on the training reviews only: the labels are
# dropped by keeping just the 'text' feature of every batch.
train_text = raw_train_ds.map(lambda batch: batch['text'])
vectorize_layer.adapt(train_text)

2023-03-21 23:02:28.612503: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-21 23:02:28.660453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [7]:
def vectorize_text(text, label):
    """Convert raw text into token ids, passing the label through untouched.

    Args:
        text: String tensor with the raw review text.
        label: Label belonging to ``text``; returned unchanged.

    Returns:
        A ``(token_ids, label)`` tuple where ``token_ids`` is the output of
        ``vectorize_layer``.
    """
    # Append a trailing axis before handing the text to the vectorizer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [8]:
# Replace the raw text of both splits with token ids, keeping the labels.
train_ds, test_ds = (
    raw_split.map(lambda batch: vectorize_text(batch['text'], batch['label']))
    for raw_split in (raw_train_ds, raw_test_ds)
)

In [9]:
# Cache the vectorized batches and prefetch them with an auto-tuned buffer.
train_ds = train_ds.cache()
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

test_ds = test_ds.cache()
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

In [10]:
# Classifier: token embeddings are averaged over the sequence axis and fed to a
# single-unit dense layer; dropout (rate 0.2) is applied before and after pooling.
model = Sequential()
model.add(Embedding(MAX_FEATURES + 1, EMBEDDING_DIM))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.2))
model.add(Dense(1))  # one output unit, no activation

In [11]:
# The model emits raw scores, so the loss is told to expect logits and the
# accuracy metric thresholds the output at 0.0.
model.compile(
    optimizer='adam',
    loss=BinaryCrossentropy(from_logits=True),
    metrics=BinaryAccuracy(threshold=0.0),
)

In [12]:
# Train for five passes over the cached training batches.
history = model.fit(x=train_ds, epochs=5)

Epoch 1/5


2023-03-21 23:02:31.069198: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# Evaluate on the held-out split; the result unpacks into the loss and the
# single compiled metric.
loss, accuracy = model.evaluate(x=test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

  8/782 [..............................] - ETA: 6s - loss: 0.3898 - binary_accuracy: 0.8320  

2023-03-21 23:03:25.175898: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Loss:  0.33687707781791687
Accuracy:  0.8638800382614136


In [14]:
def predict_texts(samples):
    """Score every token sequence in ``samples['text']`` with the trained model.

    Each row is joined back into one whitespace-separated string, and all rows
    are vectorized in a single batched ``vectorize_layer`` call rather than one
    call per row. Because the layer has a fixed ``output_sequence_length``, an
    empty token list becomes an all-zero id sequence without any special-casing.

    Args:
        samples: DataFrame whose ``'text'`` column holds one iterable of string
            tokens per row.

    Returns:
        The output of ``model.predict`` for the vectorized rows.
    """
    texts = [' '.join(tokens) for tokens in samples['text'].tolist()]
    # Explicit dtype so that an empty list still produces a string tensor.
    token_ids = vectorize_layer(tf.constant(texts, dtype=tf.string))
    return model.predict(token_ids)


ate = ALTE(
    lambda x: np.array(x.lower().split()),  # split a review into lower-cased tokens
    lambda x: x,  # INFO: Only needed for effect transformation.
    predict_texts,
)
# NOTE(review): the meaning of the positional ATE_Options arguments is not
# visible in this notebook. 5 and 10 appear to match the 5 progress-bar steps
# and the 10 epochs per step in the recorded output -- confirm against
# ate.base.ATE_Options.
options = ATE_Options(['text'], 5, 10000, 1, 10)

In [15]:
# Only the first review of the first test batch is explained, so convert just
# that one batch to a DataFrame instead of materialising the whole test split.
first_batch_df = tfds.as_dataframe(raw_test_ds.take(1))

# The dataset is batched: row 0 holds a whole batch of byte strings, and its
# first element is the review to explain.
sample_review = first_batch_df['text'][0][0].decode('UTF-8')
test_df = pd.DataFrame([sample_review], columns=['text'])
effects = ate.explain(test_df, options)

# NOTE(review): the recorded output prints a token array here, so explain()
# appears to tokenize test_df in place -- confirm against ate.base.ALTE.
print("Text: ", test_df['text'][0])
print("Effects: ", effects)

  0%|                                                                                                                                                        | 0/5 [00:00<?, ?it/s]2023-03-21 23:03:30.051433: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1/10


2023-03-21 23:03:30.350384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 20%|████████████████████████████▊                                                                                                                   | 1/5 [00:02<00:10,  2.64s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 40%|█████████████████████████████████████████████████████████▌                                                                                      | 2/5 [01:38<02:51, 57.29s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 60%|██████████████████████████████████████████████████████████████████████████████████████▍                                                         | 3/5 [03:16<02:32, 76.05s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4/5 [04:51<01:23, 83.54s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:25<00:00, 77.18s/it]

Text:  ['there' 'are' 'films' 'that' 'make' 'careers.' 'for' 'george' 'romero,'
 'it' 'was' 'night' 'of' 'the' 'living' 'dead;' 'for' 'kevin' 'smith,'
 'clerks;' 'for' 'robert' 'rodriguez,' 'el' 'mariachi.' 'add' 'to' 'that'
 'list' 'onur' "tukel's" 'absolutely' 'amazing' 'ding-a-ling-less.'
 'flawless' 'film-making,' 'and' 'as' 'assured' 'and' 'as' 'professional'
 'as' 'any' 'of' 'the' 'aforementioned' 'movies.' 'i' "haven't" 'laughed'
 'this' 'hard' 'since' 'i' 'saw' 'the' 'full' 'monty.' '(and,' 'even'
 'then,' 'i' "don't" 'think' 'i' 'laughed' 'quite' 'this' 'hard...' 'so'
 'to' 'speak.)' "tukel's" 'talent' 'is' 'considerable:' 'ding-a-ling-less'
 'is' 'so' 'chock' 'full' 'of' 'double' 'entendres' 'that' 'one' 'would'
 'have' 'to' 'sit' 'down' 'with' 'a' 'copy' 'of' 'this' 'script' 'and'
 'do' 'a' 'line-by-line' 'examination' 'of' 'it' 'to' 'fully' 'appreciate'
 'the,' 'uh,' 'breadth' 'and' 'width' 'of' 'it.' 'every' 'shot' 'is'
 'beautifully' 'composed' '(a' 'clear' 'sign' 'of' 'a




In [20]:
# Tabulate the per-word effects, one row per (word, effect) pair.
effect_df = pd.DataFrame(effects, columns=['word', 'effect'])
# Each effect is stored as a sequence; keep its first element.
effect_df['effect'] = effect_df['effect'].map(lambda values: values[0])
# Show only the words whose effect magnitude exceeds 1.5.
effect_df[effect_df['effect'].abs() > 1.5]

Unnamed: 0,word,effect
0,there,-1.834488
4,make,-2.875175
9,it,1.765029
15,dead;,-1.669371
18,"smith,",1.800793
32,amazing,7.428958
34,flawless,4.299712
36,and,1.811271
39,and,1.671421
43,any,-3.983705
