In [None]:
import warnings
warnings.filterwarnings('ignore')

import tqdm
import numpy as np
# for recall and precision metrics
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split
import time
import numpy as np
import pickle
import os.path
from keras.models import model_from_json

In [None]:

# --- input geometry ---
SEQUENCE_LENGTH = 100  # every sample is padded/truncated to this many word ids
EMBEDDING_SIZE = 100   # dimensionality of the GloVe vectors in use

# --- train / evaluation setup ---
TEST_SIZE = 0.25       # fraction of samples held out for testing
BATCH_SIZE = 64
EPOCHS = 20            # number of training epochs

# label <-> integer lookups; the reverse map is derived from the forward one
label2int = {"ham": 0, "spam": 1}
int2label = {index: name for name, index in label2int.items()}

In [None]:

import pandas as pd
# Load the tab-separated SMS collection (the file has no header row) and name its two columns.
# NOTE(review): default quote handling is in effect here, whereas the restaurant
# load further down passes quoting=3 — confirm no rows get merged by stray quotes.
combined_df = pd.read_csv(
    '/content/SMSSpamCollection.csv',
    sep='\t',
    header=None,
)
combined_df.columns = ['label', 'text']

In [None]:

# Pull the raw messages and their labels out of the frame as plain Python lists
# (no text cleaning is applied at this point).
X = combined_df['text'].tolist()
y = combined_df['label'].tolist()

In [None]:
# Preview the first few messages only; displaying the whole list floods the notebook.
X[:10]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had you

In [None]:
# Preview the first few labels only; displaying the whole list floods the notebook.
y[:10]

['ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'h

In [None]:

# Text tokenization: assign every word an integer id, then encode each message
# as the sequence of its word ids.
# The raw messages are read from combined_df rather than from X, because this
# cell overwrites X; that keeps the cell safe to re-run.
texts = combined_df['text'].values.tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# convert to sequences of integers
X = tokenizer.texts_to_sequences(texts)

In [None]:
# Labels become a numpy array. X is deliberately NOT wrapped in np.array first:
# the sequences have different lengths, and building an array from such a ragged
# list raises ValueError on NumPy >= 1.24. pad_sequences accepts the list as is.
y = np.array(y)
# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

In [None]:
# Encode labels as integers via label2int, then one-hot encode them.
# Labels are read from combined_df rather than from y, because this cell
# overwrites y; re-running it would otherwise raise KeyError on the encoded values.
label_ids = [label2int[label] for label in combined_df['label']]
y = to_categorical(label_ids)

In [None]:
# Hold out TEST_SIZE of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=7,
)


In [None]:
def get_embedding_vectors(tokenizer, dim=100, glove_dir="/content/drive/MyDrive/data_DL"):
    """
    Build an embedding matrix for the tokenizer's vocabulary from GloVe vectors.

    Parameters
    ----------
    tokenizer : object exposing ``word_index``, a mapping word -> integer index
    dim : size of each GloVe vector; also selects the file ``glove.6B.{dim}d.txt``
    glove_dir : directory that contains the GloVe text file

    Returns
    -------
    numpy array of shape ``(len(word_index) + 1, dim)`` in which row ``i`` holds
    the GloVe vector of the word whose index is ``i``. Rows that are never
    assigned (words absent from the GloVe file) stay all zeros.
    """
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, dim))

    glove_path = os.path.join(glove_dir, f"glove.6B.{dim}d.txt")
    with open(glove_path, encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # Only vectors for vocabulary words are parsed and kept, so the
            # full GloVe table is never held in memory.
            i = word_index.get(values[0])
            if i is not None:
                embedding_matrix[i] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [None]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation

    Parameters
    ----------
    tokenizer : fitted tokenizer; its ``word_index`` sets the vocabulary size and
        selects which GloVe vectors are loaded
    lstm_units : number of units in the LSTM layer

    Returns
    -------
    The compiled Sequential model (rmsprop optimizer, categorical cross-entropy
    loss, no extra metrics). Its summary is printed as a side effect.
    """
    # Load GloVe vectors of the same width as the Embedding layer below, so the
    # two cannot drift apart if EMBEDDING_SIZE is changed.
    embedding_matrix = get_embedding_vectors(tokenizer, dim=EMBEDDING_SIZE)
    vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    # frozen embedding layer initialised with the pre-trained vectors
    model.add(Embedding(vocab_size,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile with the rmsprop optimizer; only the loss is tracked here
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
    model.summary()
    return model

In [None]:
from tensorflow.python.keras.metrics import Metric


In [None]:
# Build the SMS classifier with a 128-unit LSTM on top of the GloVe embeddings.
model = get_model(tokenizer, lstm_units=128)


Reading GloVe: 400000it [00:12, 31514.80it/s]


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          901000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 1,018,506
Trainable params: 117,506
Non-trainable params: 901,000
_________________________________________________________________


In [None]:
# Set to False to reload the previously saved model instead of training.
to_train = True

if to_train:

    # checkpoint callback: with save_best_only=True a checkpoint is written only
    # when the monitored validation loss improves
    model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}", save_best_only=True,
                                        verbose=1)
    # TensorBoard logs go to a directory stamped with the current time
    tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")
    # print our data shapes
    print("X_train.shape:", X_train.shape)
    print("X_test.shape:", X_test.shape)
    print("y_train.shape:", y_train.shape)
    print("y_test.shape:", y_test.shape)
    # train the model, validating on the held-out split after every epoch
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=BATCH_SIZE, epochs=EPOCHS,
              callbacks=[tensorboard, model_checkpoint],
              verbose=1)

    # serialize model architecture to JSON
    model_json = model.to_json()
    with open("sms-lstm-forprotodash.json", "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights("sms-lstm-forprotodash.h5")
    print("Saved model to disk")

else:
    # Imported here because only this branch needs these metric classes; the
    # previously referenced `keras_metrics` name was never imported.
    from keras.metrics import Precision, Recall

    # load json and create model (context manager closes the file even on error)
    with open("sms-lstm-forprotodash.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)

    # load weights into new model
    model.load_weights("sms-lstm-forprotodash.h5")
    print("Loaded model from disk")

    # print model
    model.summary()

    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", Precision(), Recall()])

X_train.shape: (4179, 100)
X_test.shape: (1393, 100)
y_train.shape: (4179, 2)
y_test.shape: (1393, 2)
Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_loss improved from inf to 0.10446, saving model to results/spam_classifier_0.10
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: results/spam_classifier_0.10/assets
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.10446
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.10446
Epoch 4/20
Epoch 00004: val_loss improved from 0.10446 to 0.09092, saving model to results/spam_classifier_0.09
INFO:tensorflow:Assets written to: results/spam_classifier_0.09/assets
Epoch 5/20
Epoch 00005: val_loss improved from 0.09092 to 0.08284, saving model to results/spam_classifier

In [None]:
# Score the model on the held-out split: yields the loss, plus any metrics the
# model was compiled with.
result = model.evaluate(x=X_test, y=y_test)




In [None]:
# Display the value returned by model.evaluate on the test split.
result

0.16347205638885498

Step 2. Get model predictions for the dataset


In [None]:
def get_predictions(doclist):
    """
    Run the current module-level `model` on a batch of raw texts.

    The texts are encoded with the module-level `tokenizer` and padded to
    SEQUENCE_LENGTH before prediction, so `tokenizer` and `model` must belong
    to the same dataset at call time.

    Parameters
    ----------
    doclist : list of strings; a single bare string is treated as one document

    Returns
    -------
    Whatever `model.predict` returns for the padded batch: one row of class
    scores per document (softmax outputs for models built by `get_model`).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(doclist, str):
        doclist = [doclist]

    sequences = tokenizer.texts_to_sequences(doclist)

    # pad to the fixed length the model was built with
    padded = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)

    return model.predict(padded)

In [None]:

# Classify one obviously promotional message and print the winning label.
text = "Congratulations! you have won 100,000$ this week, click here to claim fast"
pred = get_predictions([text])
print(int2label[pred[0].argmax()])

spam


In [None]:

# Classify one ordinary conversational message and print the winning label.
text = "Hi man, I was wondering if we can meet tomorrow."
pred = get_predictions([text])
print(int2label[pred[0].argmax()])

ham


In [None]:
# Predict a class for every message in the dataset.
doclist = combined_df['text'].tolist()
one_hot_prediction = get_predictions(doclist)
label_prediction = one_hot_prediction.argmax(axis=1)

# boolean masks over the dataset by predicted class (0: ham, 1: spam)
idx_ham = label_prediction == 0
idx_spam = label_prediction == 1

In [None]:
# Install AIX360 (source of the ProtodashExplainer imported below) straight from GitHub.
# NOTE(review): the captured output shows this install collecting tensorflow==1.14
# and keras==2.3.1, i.e. it changes the deep-learning stack mid-notebook — confirm
# whether a runtime restart is needed and consider pinning a specific revision.
!pip install git+https://github.com/Trusted-AI/AIX360.git

Collecting git+https://github.com/Trusted-AI/AIX360.git
  Cloning https://github.com/Trusted-AI/AIX360.git to /tmp/pip-req-build-93q3x33h
  Running command git clone -q https://github.com/Trusted-AI/AIX360.git /tmp/pip-req-build-93q3x33h
Collecting tensorflow==1.14
[?25l  Downloading https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (109.2MB)
[K     |████████████████████████████████| 109.2MB 98kB/s 
Collecting keras==2.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
[K     |████████████████████████████████| 378kB 51.3MB/s 
Collecting xport
  Downloading https://files.pythonhosted.org/packages/6a/a0/ade37253fe2c7a457a9a8703e93e4b1517dd53315e3941416ee4f7463f08/xport-2.0.2-py2.py3-none-any.whl
Collecting xgboost==1.0.2
[?25l  Downloading https://files.pytho

# Step 3. Use the ProtoDash explainer to compute spam and ham prototypes


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from aix360.algorithms.protodash import ProtodashExplainer

In [None]:
# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(doclist)

vec = vectorizer.transform(doclist)
docvec = vec.toarray()
print(docvec.shape)

(5572, 8713)


In [None]:
# Partition the TF-IDF vectors and the message texts by predicted class.
docvec_spam = docvec[idx_spam]
docvec_ham = docvec[idx_ham]

df_spam = combined_df.loc[idx_spam, 'text']
df_ham = combined_df.loc[idx_ham, 'text']

In [None]:
# number of messages predicted as spam, then as ham
print(df_spam.shape, df_ham.shape, sep="\n")

(838,)
(4734,)


In [None]:
# Single explainer instance; it is reused by every explain() call later in the notebook.
explainer = ProtodashExplainer()


In [None]:
# number of prototypes to select per class
m = 10

# Run ProtoDash within each predicted class.
# W_* : importance weights of the selected prototypes
# S_* : indices of the selected prototypes
W_spam, S_spam, _ = explainer.explain(docvec_spam, docvec_spam, m=m)
W_ham, S_ham, _ = explainer.explain(docvec_ham, docvec_ham, m=m)

In [None]:
# Look up the prototype texts by the positions the explainer returned.
df_spam_prototypes = df_spam.iloc[S_spam].copy()
df_ham_prototypes = df_ham.iloc[S_ham].copy()

# Rescale each weight vector to sum to 1, rounded to two decimals.
W_spam = np.round(W_spam / W_spam.sum(), 2)
W_ham = np.round(W_ham / W_ham.sum(), 2)

In [None]:
# header, then one "<weight> <text>" line per spam prototype
print("SPAM prototypes with weights:", "----------------------------", sep="\n")
for rank in range(m):
    print(W_spam[rank], df_spam_prototypes.iloc[rank])

SPAM prototypes with weights:
----------------------------
0.13 We tried to call you re your reply to our sms for a video mobile 750 mins UNLIMITED TEXT + free camcorder Reply of call 08000930705 Now
0.12 You have WON a guaranteed £1000 cash or a £2000 prize.To claim yr prize call our customer service representative on
0.12 Get ur 1st RINGTONE FREE NOW! Reply to this msg with TONE. Gr8 TOP 20 tones to your phone every week just £1.50 per wk 2 opt out send STOP 08452810071 16
0.1 December only! Had your mobile 11mths+? You are entitled to update to the latest colour camera mobile for Free! Call The Mobile Update Co FREE on 08002986906
0.09 Dear Voucher Holder, To claim this weeks offer, at you PC please go to http://www.e-tlp.co.uk/expressoffer Ts&Cs apply. To stop texts, txt STOP to 80062
0.09 YES! The only place in town to meet exciting adult singles is now in the UK. Txt CHAT to 86688 now! 150p/Msg.
0.09 URGENT! We are trying to contact U. Todays draw shows that you have won a £800 p

In [None]:
# header, then one "<weight> <text>" line per ham prototype
print("HAM prototypes with weights:", "----------------------------", sep="\n")
for rank in range(m):
    print(W_ham[rank], df_ham_prototypes.iloc[rank])

HAM prototypes with weights:
----------------------------
0.14 The last thing i ever wanted to do was hurt you. And i didn't think it would have. You'd laugh, be embarassed, delete the tag and keep going. But as far as i knew, it wasn't even up. The fact that you even felt like i would do it to hurt you shows you really don't know me at all. It was messy wednesday, but it wasn't bad. The problem i have with it is you HAVE the time to clean it, but you choose not to. You skype, you take pictures, you sleep, you want to go out. I don't mind a few things here and there, but when you don't make the bed, when you throw laundry on top of it, when i can't have a friend in the house because i'm embarassed that there's underwear and bras strewn on the bed, pillows on the floor, that's something else. You used to be good about at least making the bed.
0.11 What do u want when i come back?.a beautiful necklace as a token of my heart for you.thats what i will give but ONLY to MY WIFE OF MY LIKING.

In [None]:

# Pick the k-th predicted-spam message and shape its TF-IDF vector as a single-row matrix.
k = 0
sample_text = df_spam.iloc[k]
sample_vec = docvec_spam[k].reshape(1, -1)

In [None]:
# the chosen message, then the shape of its vector
print(sample_text, sample_vec.shape, sep="\n")

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
(1, 8713)


In [None]:
# Everything predicted as spam except the sample at position k.
docvec_spam_other = np.delete(docvec_spam, k, axis=0)
df_spam_other = df_spam.iloc[np.arange(len(df_spam)) != k].copy()

In [None]:
# Select m prototypes for the sample spam text from the remaining spam messages.
W1_spam, S1_spam, _ = explainer.explain(sample_vec, docvec_spam_other, m=m)

In [None]:
# rescale the weights to sum to 1, rounded to two decimals
W1_spam = np.round(W1_spam / W1_spam.sum(), 2)

In [None]:
# Indices (positions in docvec_spam_other) of the prototypes chosen for the sample.
S1_spam


array([191, 300, 433, 238, 780,  63, 112, 231, 232, 344])

In [None]:
# Show the sample spam message followed by its prototypes.
print("original text")
print("-------------")
print(sample_text)
print("")

print("Similar SPAM prototypes:")
print("------------------------")
# Iterate over what the explainer actually returned rather than a re-hard-coded
# count, so this cannot index past the end if m is changed elsewhere.
for weight, idx in zip(W1_spam, S1_spam):
    print(weight, df_spam_other.iloc[idx])

original text
-------------
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Similar SPAM prototypes:
------------------------
1.0 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
0.0 Free entry in 2 a weekly comp for a chance to win an ipod. Txt POD to 80182 to get entry (std txt rate) T&C's apply 08452810073 for details 18+
0.0 You've won tkts to the EURO2004 CUP FINAL or £800 CASH, to collect CALL 09058099801 b4190604, POBOX 7876150ppm
0.0 SMS. ac JSco: Energy is high, but u may not know where 2channel it. 2day ur leadership skills r strong. Psychic? Reply ANS w/question. End? Reply END JSCO
0.0 Just send a text. We'll skype later.
0.0 100 dating service cal;l 09064012103 box334sk38ch
0.0 08714712388 between 10am-7pm Cost 10p
0.0 CLAIRE here am havin borin time & am now a

Observation
Note that several spam messages repeat in the dataset, as these may have been sent by the same entity to multiple users. As a consequence, the explainer retrieves these duplicates. Try a different k above to see prototypes corresponding to other sample messages.

Given a ham message, look for similar messages that are classified as ham by the classifier


In [None]:
# Pick the k-th predicted-ham message and shape its TF-IDF vector as a single-row matrix.
k = 3
sample_text = df_ham.iloc[k]
sample_vec = docvec_ham[k].reshape(1, -1)

In [None]:

# the chosen message, then the shape of its vector
print(sample_text, sample_vec.shape, sep="\n")

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
(1, 8713)


In [None]:

# Everything predicted as ham except the sample at position k.
docvec_ham_other = np.delete(docvec_ham, k, axis=0)
df_ham_other = df_ham.iloc[np.arange(len(df_ham)) != k].copy()

In [None]:
# Select m prototypes for the sample ham text from the remaining ham messages.
W1_ham, S1_ham, _ = explainer.explain(sample_vec, docvec_ham_other, m=m)

In [None]:

# rescale the weights to sum to 1, rounded to two decimals
W1_ham = np.round(W1_ham / W1_ham.sum(), 2)

In [None]:
# Indices (positions in docvec_ham_other) of the prototypes chosen for the sample.
S1_ham


array([4045, 3870, 1252,  110, 1121, 2862, 4650,   81, 4372, 1028])

In [None]:

# Show the sample ham message followed by its prototypes.
print("original text")
print("-------------")
print(sample_text)
print("")

print("Similar HAM prototypes:")
print("------------------------")
# Iterate over what the explainer actually returned rather than a re-hard-coded
# count, so this cannot index past the end if m is changed elsewhere.
for weight, idx in zip(W1_ham, S1_ham):
    print(weight, df_ham_other.iloc[idx])

original text
-------------
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Similar HAM prototypes:
------------------------
0.11 if you text on your way to cup stop that should work. And that should be BUS
0.12 Gettin rdy to ship comp
0.12 Ok i juz receive..
0.09 First answer my question.
0.1 Sweet, we may or may not go to 4U to meet carlos so gauge patty's interest in that
0.1 Can... I'm free...
0.1 Dhoni have luck to win some big title.so we will win:)
0.1 I see a cup of coffee animation
0.09 Like  &lt;#&gt; , same question
0.08 have got * few things to do. may be in * pub later.


In [None]:
# Load the tab-separated restaurant reviews; quoting=3 is csv.QUOTE_NONE, so
# quote characters inside reviews are kept as ordinary text.
df = pd.read_csv(
    'restaurant_reviews.tsv',
    sep='\t',
    quoting=3,
)

In [None]:
# first column is the review text, second its label
df.columns = ['text', 'label']

In [None]:
# Peek at the first rows to check the renamed columns.
df.head()

Unnamed: 0,text,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:

# Pull the raw reviews and their labels out of the frame as plain Python lists
# (no text cleaning is applied at this point).
X = df['text'].tolist()
y = df['label'].tolist()

In [None]:
# Text tokenization: assign every word an integer id, then encode each review
# as the sequence of its word ids.
# The raw reviews are read from df rather than from X, because this cell
# overwrites X; that keeps the cell safe to re-run.
review_texts = df['text'].values.tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
# convert to sequences of integers
X = tokenizer.texts_to_sequences(review_texts)

In [None]:

# Labels become a numpy array. X is deliberately NOT wrapped in np.array first:
# the sequences have different lengths, and building an array from such a ragged
# list raises ValueError on NumPy >= 1.24. pad_sequences accepts the list as is.
y = np.array(y)
# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

In [None]:
# One-hot encode the integer review labels. They are read from df rather than
# from y, because this cell overwrites y; re-running it would otherwise one-hot
# encode an already one-hot array without raising any error.
y = to_categorical(df['label'].values)

In [None]:
# Hold out TEST_SIZE of the reviews; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=7,
)


In [None]:
# Build a fresh classifier for the reviews; this rebinds the global `model`.
model = get_model(tokenizer, lstm_units=128)


Reading GloVe: 400000it [00:13, 30154.86it/s]


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 100)          207200    
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 258       
Total params: 324,706
Trainable params: 117,506
Non-trainable params: 207,200
_________________________________________________________________


In [None]:
# Set to False to reload the previously saved model instead of training.
to_train = True

# All artifacts of the review model use their own "restaurant" names so that
# they do not overwrite the files and checkpoints saved for the SMS model.
if to_train:

    # checkpoint callback: with save_best_only=True a checkpoint is written only
    # when the monitored validation loss improves
    model_checkpoint = ModelCheckpoint("results/restaurant_classifier_{val_loss:.2f}", save_best_only=True,
                                        verbose=1)
    # TensorBoard logs go to a directory stamped with the current time
    tensorboard = TensorBoard(f"logs/restaurant_classifier_{time.time()}")
    # print our data shapes
    print("X_train.shape:", X_train.shape)
    print("X_test.shape:", X_test.shape)
    print("y_train.shape:", y_train.shape)
    print("y_test.shape:", y_test.shape)
    # train the model, validating on the held-out split after every epoch
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=BATCH_SIZE, epochs=EPOCHS,
              callbacks=[tensorboard, model_checkpoint],
              verbose=1)

    # serialize model architecture to JSON
    model_json = model.to_json()
    with open("restaurant-lstm-forprotodash.json", "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights("restaurant-lstm-forprotodash.h5")
    print("Saved model to disk")

else:
    # Imported here because only this branch needs these metric classes; the
    # previously referenced `keras_metrics` name was never imported.
    from keras.metrics import Precision, Recall

    # load json and create model (context manager closes the file even on error)
    with open("restaurant-lstm-forprotodash.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)

    # load weights into new model
    model.load_weights("restaurant-lstm-forprotodash.h5")
    print("Loaded model from disk")

    # print model
    model.summary()

    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", Precision(), Recall()])

X_train.shape: (750, 100)
X_test.shape: (250, 100)
y_train.shape: (750, 2)
y_test.shape: (250, 2)
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.73506, saving model to results/spam_classifier_0.74
INFO:tensorflow:Assets written to: results/spam_classifier_0.74/assets
Epoch 2/20
Epoch 00002: val_loss improved from 0.73506 to 0.58684, saving model to results/spam_classifier_0.59
INFO:tensorflow:Assets written to: results/spam_classifier_0.59/assets
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.58684
Epoch 4/20
Epoch 00004: val_loss did not improve from 0.58684
Epoch 5/20
Epoch 00005: val_loss improved from 0.58684 to 0.55037, saving model to results/spam_classifier_0.55
INFO:tensorflow:Assets written to: results/spam_classifier_0.55/assets
Epoch 6/20
Epoch 00006: val_loss did not improve from 0.55037
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.55037
Epoch 8/20
Epoch 00008: val_loss improved from 0.55037 to 0.53871, saving model to results/spam_classifier_0.

In [None]:
# Score the review model on the held-out split: yields the loss, plus any
# metrics the model was compiled with.
result = model.evaluate(x=X_test, y=y_test)




In [None]:
# Class index -> sentiment name for the review model (rebinds the SMS mapping).
int2label = dict(enumerate(("negative", "positive")))

In [None]:
# Classify one short review and print the winning sentiment.
text = "i like the taste of pizza"
pred = get_predictions([text])
print(int2label[pred[0].argmax()])

positive


In [None]:
# Predict a class for every review in the dataset.
doclist = df['text'].tolist()
one_hot_prediction = get_predictions(doclist)
label_prediction = one_hot_prediction.argmax(axis=1)

# The SMS variable names are reused for the reviews:
# idx_ham marks predicted class 0 (negative), idx_spam class 1 (positive).
idx_ham = label_prediction == 0
idx_spam = label_prediction == 1

In [None]:
# TF-IDF representation of every review: learn the vocabulary and transform
# the same documents in one step.
vectorizer = TfidfVectorizer()
vec = vectorizer.fit_transform(doclist)

# dense (n_documents, n_terms) array for the explainer
docvec = vec.toarray()
print(docvec.shape)

(1000, 2035)


In [None]:

# Partition the TF-IDF vectors and the review texts by predicted class
# (the *_spam names hold class 1 / positive, the *_ham names class 0 / negative).
docvec_spam = docvec[idx_spam]
docvec_ham = docvec[idx_ham]

df_spam = df.loc[idx_spam, 'text']
df_ham = df.loc[idx_ham, 'text']

In [None]:
# number of reviews predicted as class 1, then as class 0
print(df_spam.shape, df_ham.shape, sep="\n")

(499,)
(501,)


Compute prototypes for positive and negative reviews


In [None]:
# number of prototypes to select per class
m = 10

# Run ProtoDash within each predicted class.
# W_* : importance weights of the selected prototypes
# S_* : indices of the selected prototypes
W_spam, S_spam, _ = explainer.explain(docvec_spam, docvec_spam, m=m)
W_ham, S_ham, _ = explainer.explain(docvec_ham, docvec_ham, m=m)

In [None]:
# Look up the prototype texts by the positions the explainer returned.
df_spam_prototypes = df_spam.iloc[S_spam].copy()
df_ham_prototypes = df_ham.iloc[S_ham].copy()

# Rescale each weight vector to sum to 1, rounded to two decimals.
W_spam = np.round(W_spam / W_spam.sum(), 2)
W_ham = np.round(W_ham / W_ham.sum(), 2)

In [None]:
# header, then one "<weight> <text>" line per positive prototype
print("positive prototypes with weights:", "----------------------------", sep="\n")
for rank in range(m):
    print(W_spam[rank], df_spam_prototypes.iloc[rank])

positive prototypes with weights:
----------------------------
0.17 The food was excellent and service was very good.
0.17 This place is great!!!!!!!!!!!!!!
0.1 The tables outside are also dirty a lot of the time and the workers are not always friendly and helpful with the menu.
0.1 My boyfriend and I came here for the first time on a recent trip to Vegas and could not have been more pleased with the quality of food and service.
0.09 The food is delicious and just spicy enough, so be sure to ask for spicier if you prefer it that way.
0.09 Of all the dishes, the salmon was the best, but all were great.
0.08 We had 7 at our table and the service was pretty fast.
0.07 The restaurant is very clean and has a family restaurant feel to it.
0.07 They have a really nice atmosphere.
0.07 Now the burgers aren't as good, the pizza which used to be amazing is doughy and flavorless.


In [None]:
# header, then one "<weight> <text>" line per negative prototype
print("negative prototypes with weights:", "----------------------------", sep="\n")
for rank in range(m):
    print(W_ham[rank], df_ham_prototypes.iloc[rank])

negative prototypes with weights:
----------------------------
0.15 It was not good.
0.11 If you want to wait for mediocre food and downright terrible service, then this is the place for you.
0.1 We had so much to say about the place before we walked in that he expected it to be amazing, but was quickly disappointed.
0.11 My husband and I ate lunch here and were very disappointed with the food and service.
0.09 One of the few places in Phoenix that I would definately go back to again .
0.1 I will never go back to this place and will never ever recommended this place to anyone!
0.1 The selection of food was not the best.
0.09 I don't know what the big deal is about this place, but I won't be back "ya'all".
0.08 The burger had absolutely no flavor - the meat itself was totally bland, the burger was overcooked and there was no charcoal flavor.
0.08 The food was barely lukewarm, so it must have been sitting waiting for the server to bring it out to us.


Given a positive review, look for similar reviews that are classified as positive by the classifier

> Indented block




In [None]:
# Pick the k-th predicted-positive review and shape its TF-IDF vector as a single-row matrix.
k = 0
sample_text = df_spam.iloc[k]
sample_vec = docvec_spam[k].reshape(1, -1)

In [None]:
# the chosen review, then the shape of its vector
print(sample_text, sample_vec.shape, sep="\n")

Wow... Loved this place.
(1, 2035)


In [None]:
# Every predicted-positive review except the sample at position k.
docvec_spam_other = np.delete(docvec_spam, k, axis=0)
df_spam_other = df_spam.iloc[np.arange(len(df_spam)) != k].copy()

In [None]:

# Select m prototypes for the sample positive review from the remaining positive reviews.
W1_spam, S1_spam, _ = explainer.explain(sample_vec, docvec_spam_other, m=m)

In [None]:
# rescale the weights to sum to 1, rounded to two decimals
W1_spam = np.round(W1_spam / W1_spam.sum(), 2)

In [None]:
# Indices (positions in docvec_spam_other) of the prototypes chosen for the sample.
S1_spam


array([244, 233,  87, 256, 151, 178, 303, 299,  16, 490])

In [None]:
# Show the sample positive review followed by its prototypes.
print("original text")
print("-------------")
print(sample_text)
print("")

print("Similar positive prototypes:")
print("------------------------")
# Iterate over what the explainer actually returned rather than a re-hard-coded
# count, so this cannot index past the end if m is changed elsewhere.
for weight, idx in zip(W1_spam, S1_spam):
    print(weight, df_spam_other.iloc[idx])

original text
-------------
Wow... Loved this place.

Similar positive prototypes:
------------------------
0.15 We loved the place.
0.23 Wow very spicy but delicious.
0.08 this place is good.
0.22 I LOVED it!
0.1 The goat taco didn't skimp on the meat and wow what FLAVOR!
0.07 I love this place.
0.05 I *heart* this place.
0.04 I really do recommend this place, you can go wrong with this donut place!
0.03 Overall, I like this place a lot.
0.03 This place lacked style!!


Observation
Note that several spam messages repeat in the dataset, as these may have been sent by the same entity to multiple users. As a consequence, the explainer retrieves these duplicates. Try a different k above to see prototypes corresponding to other sample messages.

In [None]:
# Pick the k-th predicted-negative review and shape its TF-IDF vector as a single-row matrix.
k = 3
sample_text = df_ham.iloc[k]
sample_vec = docvec_ham[k].reshape(1, -1)

In [None]:
# the chosen review, then the shape of its vector
print(sample_text, sample_vec.shape, sep="\n")

The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.
(1, 2035)


In [None]:
# Every predicted-negative review except the sample at position k.
docvec_ham_other = np.delete(docvec_ham, k, axis=0)
df_ham_other = df_ham.iloc[np.arange(len(df_ham)) != k].copy()

In [None]:
# Select m prototypes for the sample negative review from the remaining negative reviews.
W1_ham, S1_ham, _ = explainer.explain(sample_vec, docvec_ham_other, m=m)

In [None]:
# rescale the weights to sum to 1, rounded to two decimals
W1_ham = np.round(W1_ham / W1_ham.sum(), 2)

In [None]:
# Indices (positions in docvec_ham_other) of the prototypes chosen for the sample.
S1_ham


array([422, 482, 495, 240,  16, 294, 211, 187, 483, 134])

In [None]:
# Show the sample negative review followed by its prototypes.
print("original text")
print("-------------")
print(sample_text)
print("")

print("Similar negative prototypes:")
print("------------------------")
# Iterate over what the explainer actually returned rather than a re-hard-coded
# count, so this cannot index past the end if m is changed elsewhere.
for weight, idx in zip(W1_ham, S1_ham):
    print(weight, df_ham_other.iloc[idx])

original text
-------------
The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.

Similar negative prototypes:
------------------------
0.14 i felt insulted and disrespected, how could you talk and judge another human being like that?
0.12 I kept looking at the time and it had soon become 35 minutes, yet still no food.
0.14 I can't tell you how disappointed I was.
0.11 This place is two thumbs up....way up.
0.1 Poor service, the waiter made me feel like I was stupid every time he came to the table.
0.08 Maybe if they weren't cold they would have been somewhat edible.
0.08 The last 3 times I had lunch here has been bad.
0.08 TOTAL WASTE OF TIME.
0.08 I have been to very few places to eat that under no circumstances would I ever return to, and this tops the list.
0.07 Just don't know why they were so slow.
