In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
with open('Assets/SMSSpamCollection', 'r') as f:
    data = f.read()
    
data[:300]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 084528100"

In [4]:
# Remove punctuation and lowercase
from string import punctuation
all_text = ''.join([c for c in data if c not in punctuation])
all_text = all_text.lower()

# split label and text of each line.
messages = all_text.split('\n')
messages = [x.split('\t') for x in messages if len(x)>=1]
[labels, texts] = np.array([list(x) for x in zip(*messages)])

In [5]:
print("Example: ")
print("Label: {},\tText: {}".format(labels[0],texts[0]))
print("Label: {},\tText: {}".format(labels[1],texts[1]))
print("Label: {},\tText: {}".format(labels[2],texts[2]))

Example: 
Label: ham,	Text: go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
Label: ham,	Text: ok lar joking wif u oni
Label: spam,	Text: free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s


In [6]:
labels = np.array([1 if each == 'spam' else 0 for each in labels])
labels

array([0, 0, 1, ..., 0, 0, 0])

In [7]:
# All words
all_text = ' '.join(texts)
words = all_text.split()
words[:20]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [8]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse = True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}

In [9]:
vocab_to_int

{'to': 0,
 'i': 1,
 'you': 2,
 'a': 3,
 'the': 4,
 'u': 5,
 'and': 6,
 'is': 7,
 'in': 8,
 'me': 9,
 'my': 10,
 'for': 11,
 'your': 12,
 'it': 13,
 'of': 14,
 'call': 15,
 'have': 16,
 'on': 17,
 'that': 18,
 'are': 19,
 '2': 20,
 'now': 21,
 'im': 22,
 'so': 23,
 'not': 24,
 'but': 25,
 'or': 26,
 'at': 27,
 'can': 28,
 'do': 29,
 'ur': 30,
 'get': 31,
 'be': 32,
 'will': 33,
 'if': 34,
 'with': 35,
 'just': 36,
 'we': 37,
 'no': 38,
 'this': 39,
 'its': 40,
 'up': 41,
 '4': 42,
 'dont': 43,
 'when': 44,
 'go': 45,
 'ok': 46,
 'from': 47,
 'ltgt': 48,
 'free': 49,
 'all': 50,
 'out': 51,
 'how': 52,
 'what': 53,
 'know': 54,
 'like': 55,
 'got': 56,
 'ill': 57,
 'good': 58,
 'then': 59,
 'was': 60,
 'come': 61,
 'am': 62,
 'only': 63,
 'time': 64,
 'day': 65,
 'love': 66,
 'there': 67,
 'want': 68,
 'send': 69,
 'text': 70,
 'he': 71,
 'as': 72,
 'by': 73,
 'going': 74,
 'one': 75,
 'need': 76,
 'about': 77,
 'txt': 78,
 'home': 79,
 'lor': 80,
 'see': 81,
 'sorry': 82,
 'still': 83,


In [10]:
# Convert the reviews to integers, same shape as reviews list, but with integers
text_ints = []
for each in texts:
    text_ints.append([vocab_to_int[word] for word in each.split()])

In [11]:
text_ints[3]

[5, 231, 140, 23, 356, 2911, 5, 160, 143, 59, 140]

In [12]:
from collections import Counter
text_lens = Counter([len(x) for x in text_ints])
print("Zero-length text: {}".format(text_lens[0]))
print("Maximum text length: {}".format(max(text_lens)))

Zero-length text: 2
Maximum text length: 171


In [13]:
non_zero_idx = [ii for ii, texts in enumerate(text_ints) if len(texts) != 0]
len(non_zero_idx)

5572

In [14]:
len(texts)

5574

In [15]:
# fillter out that review with 0 length
text_ints = [text_ints[ii] for  ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [16]:
seq_len = 170
features = np.zeros((len(text_ints), seq_len), dtype=int)
for i, row in enumerate(text_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [17]:
features[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [18]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(4457, 170) 
Validation set: 	(557, 170) 
Test set: 		(558, 170)


In [75]:
lstm_size = 256
lstm_layers = 2
batch_size = 250
learning_rate = 0.001
drop_out = 0.5
epochs = 10

In [76]:
n_words = len(vocab_to_int)

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.compat.v1.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.compat.v1.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.compat.v1.placeholder(tf.float32, name='keep_prob')

In [77]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random.normal((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [78]:

with graph.as_default():
    def lstm_cell():
        cell = tf.compat.v1.nn.rnn_cell.LSTMCell(lstm_size, 
                                       initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2),
                                       state_is_tuple=True)
        drop = tf.compat.v1.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return drop
    
    stack_cells = tf.compat.v1.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    initial_state = state = stack_cells.zero_state(batch_size, tf.float32)



In [79]:
with graph.as_default():
    outputs, final_state = tf.compat.v1.nn.dynamic_rnn(stack_cells, embed, initial_state=initial_state)

In [80]:
with graph.as_default():
    predictions = tf.compat.v1.layers.dense(outputs[:, -1], 1, activation=tf.sigmoid)
    cost = tf.compat.v1.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(cost)

In [81]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [82]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [83]:
with graph.as_default():
    saver = tf.compat.v1.train.Saver()

with tf.compat.v1.Session(graph=graph) as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: drop_out,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(stack_cells.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.125
Epoch: 0/10 Iteration: 10 Train loss: 0.101
Epoch: 0/10 Iteration: 15 Train loss: 0.087
Epoch: 1/10 Iteration: 20 Train loss: 0.109
Epoch: 1/10 Iteration: 25 Train loss: 0.075
Val acc: 0.882
Epoch: 1/10 Iteration: 30 Train loss: 0.059
Epoch: 2/10 Iteration: 35 Train loss: 0.070
Epoch: 2/10 Iteration: 40 Train loss: 0.054
Epoch: 2/10 Iteration: 45 Train loss: 0.041
Epoch: 2/10 Iteration: 50 Train loss: 0.036
Val acc: 0.954
Epoch: 3/10 Iteration: 55 Train loss: 0.035
Epoch: 3/10 Iteration: 60 Train loss: 0.015
Epoch: 3/10 Iteration: 65 Train loss: 0.035
Epoch: 4/10 Iteration: 70 Train loss: 0.009
Epoch: 4/10 Iteration: 75 Train loss: 0.014
Val acc: 0.968
Epoch: 4/10 Iteration: 80 Train loss: 0.025
Epoch: 4/10 Iteration: 85 Train loss: 0.015
Epoch: 5/10 Iteration: 90 Train loss: 0.013
Epoch: 5/10 Iteration: 95 Train loss: 0.020
Epoch: 5/10 Iteration: 100 Train loss: 0.012
Val acc: 0.968
Epoch: 6/10 Iteration: 105 Train loss: 0.014
Epoch: 6/10 Ite

In [84]:
test_acc = []
with tf.compat.v1.Session(graph=graph) as sess:
    saver.restore(sess, tf.compat.v1.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(stack_cells.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.986
