In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
database = pd.read_csv('SPAM text message 20170820 - Data.csv')
database.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


# Validating the Database

In [3]:
database = database.as_matrix()
print(database[0])

['ham'
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']


In [4]:
print(database[0][0])

ham


In [5]:
print(database[0][1])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [6]:
print(database.shape)

(5572, 2)


In [7]:
print(type(database).__name__)

ndarray


In [8]:
messages = database[:, 1]
labels = database[:, 0]

In [9]:
print(labels[0])

ham


In [10]:
print(messages[0])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [11]:
print(type(messages).__name__)

ndarray


# Time to vetorize the text

In [12]:
vocab = [''] #Because the 0 index will be use as padding
for ii in range(messages.shape[0]):
    for word in messages[ii].split():
        vocab.append(word) #Create a list of words

In [13]:
print(len(vocab))

86836


In [14]:
vocab_set = set(vocab) #Creates a set of unique words with indexes
print(len(vocab_set))

15687


# Create word2vec, vec2word

In [15]:
word2vec = {} #Will be used to transform words in indexes
vec2word = {} #Will be used to transform indexes in words

for i, word in enumerate(vocab_set):
    word2vec[word] = i
    vec2word[i] = word

# Preparing the database

In [16]:
messages = np.array([[word2vec[word] for word in message.split()] for message in messages])
print(messages[0])

[9603, 11709, 15406, 12286, 10448, 3541, 11690, 10338, 2993, 6028, 14043, 14207, 15470, 2584, 4395, 9572, 7786, 13634, 2773, 11147]


In [17]:
labels = np.array([1 if label == 'ham' else 0 for label in labels])
print(labels[0])

1


In [18]:
#Function to vetorize messages in fixed length arrays for the training

def vectorizing_messages(messages):
    
    vector_message = np.zeros((messages.shape[0], 300))
    
    for i, row in enumerate(messages):
        vector_message[i, -len(row):] = np.array(row)[:300]
    
    return vector_message

messages = vectorizing_messages(messages)
print(messages[0])

[    0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0

In [19]:
#Function to split the database

def set_training_data(messages, labels, train_size):
    training_size = int(len(messages)*train_size)
    train_x, val_x = messages[:training_size], messages[training_size:]
    train_y, val_y = labels[:training_size], labels[training_size:]
    
    test_size = int(len(val_x)*0.5)
    
    val_x, test_x = val_x[:test_size], val_x[test_size:]
    val_y, test_y = val_y[:test_size], val_y[test_size:]
    
    return train_x, val_x, train_y, val_y, test_x, test_y
    

## Preparing the RNN

In [20]:
#Getting the placeholders for the data

def get_inputs():
    
    X_ = tf.placeholder(tf.int32, shape=(None, None), name="inputs")
    y_ = tf.placeholder(tf.int32, shape=(None), name="labels")
    keep_prob = tf.placeholder(tf.float32)
    return X_, y_, keep_prob

In [21]:
#Embedding the vocab_set, this will help to speed training because will embed words with his ouwn neighbours instead of passing
#One big one-hot-encoded vector with all the words in the vocab_set

def get_embed(X, embed_size, vocab_set):
    
    init_embeds = tf.Variable(tf.random_uniform((len(vocab_set), embed_size),-1, 1))
    embeddings = tf.nn.embedding_lookup(init_embeds, X)
    
    return embeddings

In [22]:
#Return a Cell
def RNN_Cell(lstm_size):
    return tf.contrib.rnn.BasicLSTMCell(lstm_size)

In [23]:
#Build our RNN
def build_RNN(lstm_size, embed, keep_prob, batch_size, num_layers):
    
    Cell = [RNN_Cell(lstm_size) for layer in range(num_layers)]
    drop = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = keep_prob) for cell in Cell]
    multi_rnn = tf.contrib.rnn.MultiRNNCell(drop)
    initial_state = multi_rnn.zero_state(batch_size = batch_size, dtype=tf.float32)
    initial_state = tf.identity(initial_state, "initial_state")
    output, states = tf.nn.dynamic_rnn(multi_rnn, embed, dtype=tf.float32)

    return initial_state, output, states

In [24]:
#Our output layer will transform the output of the RNN in a Logit, the RNN output comes in shape [batch_size, seq_length, lstm_size]
#So we chose the last item in the seq_length column with all the batches and project it with a dense layer
#This projection will pass to a sigmoid function to scale the output in a logit 0 to 1
def output_layer(output, lstm_size,n_classes):
    
    logit = tf.layers.dense(output[:,-1], n_classes)
    logits = tf.nn.sigmoid(logit)
    
    return logits

# Build the training

In [25]:
#HyperParameters
lstm_size = 50
learning_rate = 0.1
batch_size = 100
n_epochs = 5
num_layers = 1
embed_size = 200
n_classes = 2

In [26]:
#Seting the data
train_x, val_x, train_y, val_y, test_x, test_y = set_training_data(messages, labels, 0.8)

In [27]:
#Building training

tf.reset_default_graph()

X, y, keep_prob = get_inputs() #placeholders

embed = get_embed(X, embed_size, vocab_set) #embedding the placeholder

initial_state, output, states = build_RNN(lstm_size, embed, keep_prob, batch_size, num_layers) #getting the RNN output

logits = output_layer(output, lstm_size, n_classes) #getting the Logit

cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y, logits = logits)) #Cost Function
#Our cost function is a sparse_softmax, this helps in a way that we dont have to one-hot-encode tha labels

optmizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
#Our optmizer will reduce the cost

correct = tf.nn.in_top_k(logits, y, 1) #Get the correct predictions, we use tf.nn.in_top_k with sparse_softmax
accuracy = tf.reduce_mean(tf.cast(correct, "float")) #Calculate the mean of the right predictions

init = tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:
    
    init.run()
    initial_state.eval()
    
    for epoch in range(n_epochs):
        
        n_batches = int(len(train_x)/batch_size)
        X_batches = np.array_split(train_x, n_batches)
        y_batches = np.array_split(train_y, n_batches)
        
        for ii in range(n_batches):
            
            X_batch, y_batch = X_batches[ii], y_batches[ii]
            
            feed = {X: X_batch, y: y_batch, keep_prob:0.8}
            
            loss = cost.eval(feed)
            opt = sess.run(optmizer, feed_dict = feed)
            
    
        print("Epoch:{}          :           Cost:{}      :     Accuracy:{}".format(epoch, loss, accuracy.eval({X:val_x, y:val_y, keep_prob:1})))
        
    save = saver.save(sess, './logs/rnn.ckpt')
    

Epoch:0          :           Cost:0.3357914984226227      :     Accuracy:0.9676840305328369
Epoch:1          :           Cost:0.3400900065898895      :     Accuracy:0.9712746739387512
Epoch:2          :           Cost:0.31378594040870667      :     Accuracy:0.9676840305328369
Epoch:3          :           Cost:0.31363633275032043      :     Accuracy:0.9533213376998901
Epoch:4          :           Cost:0.3153209686279297      :     Accuracy:0.9551166892051697


# Testing in the test set

In [29]:
from sklearn.metrics import f1_score

with tf.Session() as sess:
    saver.restore(sess, './logs/rnn.ckpt')
    initial_state.eval()
    pred = logits.eval({X:test_x, keep_prob:1})
    results = np.argmax(pred, 1)
    f1 = f1_score(results, test_y) 
    print('Accuracy on test set:{}'.format(f1))
    
    
    
    

INFO:tensorflow:Restoring parameters from ./logs/rnn.ckpt
Accuracy on test set:0.983739837398374


# Final considerations

This was some fun with RNN's, the code is very simplistic but yet we got a good result. I choose'd to not tokenize pontuaction on the messages such as "!" or "?" so I know i could improve on that, but the overall results was satisfacting for me.
Thanks!