# Tutorial: Approximate Global Text Explanation (AGTE)
This tutorial globally explains a TensorFlow text-classification model trained on the IMDb movie reviews dataset, using Approximate Global Text Explanation (AGTE) via linear models.

In [None]:
import re
import string
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Embedding, Dropout, GlobalAveragePooling1D, Dense
from tensorflow.python.keras.losses import BinaryCrossentropy
from tensorflow.python.keras.metrics import BinaryAccuracy
from ate.base import AGTE, ATE_Options

In [None]:
# Hyperparameters shared by the preprocessing, model and explanation cells.

# Passed to the TextVectorization layer as `max_tokens`; the Embedding layer
# is sized MAX_FEATURES + 1.
MAX_FEATURES = 10000
# Second argument of the Embedding layer (size of each embedding vector).
EMBEDDING_DIM = 16
# `output_sequence_length` of the vectorization layer; also the length of the
# all-zero row substituted for empty token lists in the prediction wrapper.
SEQUENCE_LENGTH = 250
# Batch size applied to the raw train and test datasets.
BATCH_SIZE = 32
# Used as the prefetch `buffer_size`, letting tf.data tune it dynamically.
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def _load_batched_split(split_name):
    """Load one split of `imdb_reviews` through TFDS, batched by BATCH_SIZE."""
    return tfds.load('imdb_reviews', split=split_name).batch(BATCH_SIZE)


# Raw datasets of dict examples; the later cells read the 'text' and 'label'
# entries of each batch.
raw_train_ds = _load_batched_split('train')
raw_test_ds = _load_batched_split('test')

In [None]:
def custom_standardization(input_data):
    """Normalize raw review strings before tokenization.

    Applies three steps in order: lowercase the text, replace every
    ``<br />`` tag with a single space, and delete every character listed
    in ``string.punctuation``.

    Args:
        input_data: String tensor to normalize (passed straight to
            ``tf.strings.lower``).

    Returns:
        The normalized string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_class, '')

In [None]:
# Preprocessing layer that turns review strings into fixed-length sequences
# of integer token ids. Its vocabulary is learned later via `adapt`.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,  # lowercase, drop <br /> tags and punctuation
    max_tokens=MAX_FEATURES,  # upper bound on the vocabulary size
    output_mode='int',  # emit one integer index per token
    output_sequence_length=SEQUENCE_LENGTH)  # pad/truncate every output to this length

In [None]:
def _review_text(example):
    """Return only the 'text' entry of a dataset example, dropping the rest."""
    return example['text']


# Fit the vectorization vocabulary on the training text only.
train_text = raw_train_ds.map(_review_text)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
    """Vectorize review text while passing its label through untouched.

    Args:
        text: String tensor of reviews; a trailing axis is appended before
            it is handed to ``vectorize_layer``.
        label: Label belonging to ``text``; returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [None]:
def _vectorize_example(example):
    """Split a dict example into text and label and vectorize the text."""
    return vectorize_text(example['text'], example['label'])


# Model-ready datasets of (token ids, label) pairs.
train_ds = raw_train_ds.map(_vectorize_example)
test_ds = raw_test_ds.map(_vectorize_example)

In [None]:
def _cache_and_prefetch(dataset):
    """Cache the dataset, then prefetch with an auto-tuned buffer size."""
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)


# Cache the vectorized data so it is not recomputed on every pass, and
# prefetch upcoming batches while the current one is being consumed.
train_ds = _cache_and_prefetch(train_ds)
test_ds = _cache_and_prefetch(test_ds)

In [None]:
# Small classifier: embed the token ids, average the embeddings over the
# sequence axis, and map the pooled vector to one output unit. Dropout (rate
# 0.2) is applied after the embedding and after the pooling.
model = Sequential()
model.add(Embedding(MAX_FEATURES + 1, EMBEDDING_DIM))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.2))
# Single unit with no activation: the model outputs a raw score (logit).
model.add(Dense(1))

In [None]:
# The model emits logits, so the loss is told to apply the sigmoid itself.
loss_fn = BinaryCrossentropy(from_logits=True)
# With logits, a score above 0.0 corresponds to a probability above 0.5.
accuracy_metric = BinaryAccuracy(threshold=0.0)

model.compile(loss=loss_fn, optimizer='adam', metrics=accuracy_metric)

In [None]:
# Train for 5 epochs on the vectorized training set (no validation data is
# passed); the return value of `fit` is kept in `history`.
history = model.fit(train_ds, epochs=5)

In [None]:
# Score the trained model on the held-out test set. `evaluate` yields the
# loss followed by the metric configured in `compile`.
evaluation = model.evaluate(test_ds)
loss, accuracy = evaluation

for caption, score in (("Loss: ", loss), ("Accuracy: ", accuracy)):
    print(caption, score)

In [None]:
def _tokenize(text):
    """Lowercase a string and split it on whitespace into a NumPy array."""
    return np.array(text.lower().split())


def _identity(value):
    """Return the argument unchanged."""
    return value


def _predict(frame):
    """Run the model on the token lists stored in the 'text' column of `frame`.

    Each entry is joined back into a space-separated string and vectorized;
    an empty entry is replaced by an all-zero row of SEQUENCE_LENGTH ids.
    """
    rows = []
    for tokens in frame['text'].tolist():
        if len(tokens) > 0:
            rows.append(vectorize_layer(' '.join(tokens)).numpy().tolist())
        else:
            rows.append([0] * SEQUENCE_LENGTH)
    return model.predict(rows)


ate = AGTE(
    _tokenize,
    _identity,  # INFO: Only needed for effect transformation.
    _predict,
)
# NOTE(review): the meaning of the positional arguments after the column list
# is defined by ATE_Options in the `ate` package - confirm against its docs.
options = ATE_Options(['text'], 2, 100, 1, 2)

In [None]:
# Build a 10-review sample to explain. `raw_test_ds` is batched, so each
# DataFrame row holds one batch and only the first review of each batch is
# kept. Taking 10 batches *before* the conversion avoids materializing the
# whole test split just to discard all but the first 10 rows.
sample_batches = tfds.as_dataframe(raw_test_ds.take(10))
test_df = pd.DataFrame(
    [batch_text[0].decode('UTF-8') for batch_text in sample_batches['text']],
    columns=['text'],
)

# Compute the explanation for the sample and convert it into effects.
effects = ate.transform_effects(ate.explain(test_df, options))

In [None]:
# Show the transformed effects as a table. As the last expression of a
# notebook cell the DataFrame is rendered; in a plain script this line has no
# visible effect.
pd.DataFrame(effects)