In [2]:
import numpy as np # matrix math
import nltk # NLP
import re # NLP
import gensim.models.word2vec as word2vec # word2vec model
import multiprocessing # cpu_count
import os # for storing and retirieving the files
import codecs # for encoding suring opening

Using TensorFlow backend.


In [1]:
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.layers.core import Dense
from tensorflow.contrib import seq2seq as s2s

In [3]:
file_path = '~/data/chat.txt'
f = codecs.open(file_path, 'r', 'utf-8')
corpus = u""
corpus += f.read()

In [4]:
# tokenising the text
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentencesList = tokenizer.tokenize(corpus)

def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

#sentence where each word is tokenized
sentences = []
for raw_sentence in sentencesList:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [5]:
for s in sentences[:5]:
    print(s)

['yeah', 'i', 'm', 'preparing', 'myself', 'to', 'drop', 'a', 'lot', 'on', 'this', 'man', 'but', 'definitely', 'need', 'something', 'reliable', 'yeah', 'dude', 'i', 'would', 'definitely', 'consider', 'a', 'daniel', 'defence', 'super', 'reliable', 'and', 'they', 'are', 'just', 'bad', 'ass', 'i', 'm', 'about', 'to', 'meet', 'my', 'mans', 'ex', 'friend', 'with', 'benefit', 'tune', 'in', 'next', 'week', 'to', 'see', 'if', 'i', 'have', 'to', 'put', 'hands', 'on', 'i', 'm', 'dead', 'not', 'looking', 'forward', 'to', 'this', 'shouldn', 't', 'the', 'supporter', 's', 'natural', 'answer', 'to', 's', 'hashtag', 'be']
['or', 'just', 'insert', 'itl', 'to', 'make']
['you', 'want', 'to', 'turn', 'twitter', 'followers', 'into', 'blog', 'readers']
['how', 'do', 'you', 'do', 'this']
['besides', 'if', 'trump', 'say', 'his', 'condolences', 'it', 'won', 't', 'sound', 'genuine', 'ex', 'dwayne', 'wade', 'cousin', 'it', 'will', 'sound', 'all', 'political', 'and', 'petty', 'yea', 'you', 'right']


In [6]:
# Making dictionary of unique tokens
tokens = []
for s in sentences:
    tokens.extend(s)
tokens = sorted(list(set(tokens)))
word2id = dict((c,i) for i,c in enumerate(tokens))
print('[*]Total unique tokens:',len(word2id))

[*]Total unique tokens: 119510


In [7]:
# params for word2vec model
e_dim = 100 # dimension of the vector that we want
workers = multiprocessing.cpu_count() # for multprocessing
min_word_count = 1 # minimum number of times a word must come to be registered
context_size = 8 # length of context sentence that would be considered
downsample = 1e-5 # for words that come too frequently

In [8]:
# Defining the model
w2vector = word2vec.Word2Vec(sg=1, seed=1, size = e_dim, workers = workers,
                             min_count = min_word_count, window = context_size,
                             sample = downsample)

#### No need to perform this part again, simply load the model

In [39]:
w2vector.build_vocab(sentences)
print('[*]size of vocabulary:', w2vector.wv.vocab.__len__())

[*]size of vocabulary: 119510


In [41]:
print(w2vector.corpus_count)
print(len(sentences))
print(w2vector.iter)

716953
716953
5


In [42]:
# Training the model
w2vector.train(sentences, total_examples = w2vector.corpus_count, epochs = w2vector.iter)

13361009

In [28]:
w2vector.wv.most_similar('gym')

[('workout', 0.9794195294380188),
 ('cleaning', 0.9784203767776489),
 ('nap', 0.9771929979324341),
 ('dorm', 0.975080132484436),
 ('sleepy', 0.9730733633041382),
 ('stairs', 0.9730415940284729),
 ('semester', 0.9725555181503296),
 ('disneyland', 0.9722797870635986),
 ('hangout', 0.9721447229385376),
 ('packing', 0.9720941781997681)]

In [20]:
w2vector.wv.most_similar('drive')

[('bart', 0.9653815031051636),
 ('cab', 0.9623619318008423),
 ('parked', 0.9619486331939697),
 ('rides', 0.9606517553329468),
 ('ride', 0.9578251838684082),
 ('garage', 0.9576617479324341),
 ('lyft', 0.9543236494064331),
 ('walk', 0.9519018530845642),
 ('ac', 0.9517987966537476),
 ('crowded', 0.9516102075576782)]

In [17]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = w2vector.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{0} is related to {1}, as {2} is related to {3}".format(start1, end1, start2, end2))
    return start2

In [19]:
nearest_similarity_cosmul('drink', 'drinking', 'driving')

drink is related to drinking, as cars is related to driving


'cars'

In [47]:
# saving the model, so that we can use them later
import os
if not os._exists('trained'):
    os.makedirs('trained')

w2vector.save(os.path.join('trained', 'w2vector.w2v'))

## Making a simple Seq2Seq model for chatbot using twitter data

In [8]:
# Making the model as we already have saved the embeddings
# we now need to convert the data to numbers for feeding into the model
tweets = [sentences[i] for i in range(0,len(sentences),2)]
responses = [sentences[i] for i in range(1, len(sentences), 2)]
print('[*]Total correspondance(training examples):', len(tweets))

[*]Total correspondance(training examples): 358477


In [9]:
# loading the model
w2vector = word2vec.Word2Vec.load(os.path.join('trained', 'w2vector.w2v'))

In [10]:
encoder_len = [len(s) for s in tweets]
decoder_len = [len(s) for s in responses]

In [11]:
print('[*]Tweets Maximum Length:', max(encoder_len))
print('[*]Tweets Minimum Lenght:', min(encoder_len))
print('[*]Responses Maximum Length:', max(decoder_len))
print('[*]Responses Minimum Length:', min(decoder_len))

[*]Tweets Maximum Length: 186
[*]Tweets Minimum Lenght: 0
[*]Responses Maximum Length: 190
[*]Responses Minimum Length: 0


In [12]:
def batch_to_embeddings(sentences, w2v_model, max_len, e_dim):
    embedded_sentences = np.zeros([len(sentences), max_len, e_dim])
    for i, sentence in enumerate(sentences):
        if i%30000 == 0:
            print('[!]Processed {0} sentences'.format(i+1))
        for j, token in enumerate(sentence):
            sentences[i][j] = w2v_model.wv[token]
    return embedded_sentences

In [13]:
tweets_embedded = batch_to_embeddings(tweets, w2vector, max(encoder_len), e_dim)

[!]Processed 1 sentences
[!]Processed 30001 sentences
[!]Processed 60001 sentences
[!]Processed 90001 sentences
[!]Processed 120001 sentences
[!]Processed 150001 sentences
[!]Processed 180001 sentences
[!]Processed 210001 sentences
[!]Processed 240001 sentences
[!]Processed 270001 sentences
[!]Processed 300001 sentences
[!]Processed 330001 sentences


In [14]:
responses_embedded = np.zeros([len(responses), max(decoder_len)], dtype = np.int32)
for i in range(len(responses)):
    if i%30000 == 0:
        print('[!]Processed {0} sentences'.format(i+1))
    for j in range(len(responses[i])):
        responses_embedded[i][j] = word2id[responses[i][j]]

[!]Processed 1 sentences
[!]Processed 30001 sentences
[!]Processed 60001 sentences
[!]Processed 90001 sentences
[!]Processed 120001 sentences
[!]Processed 150001 sentences
[!]Processed 180001 sentences
[!]Processed 210001 sentences
[!]Processed 240001 sentences
[!]Processed 270001 sentences
[!]Processed 300001 sentences
[!]Processed 330001 sentences


In [15]:
print('[*]Shape of Tweets:', tweets_embedded.shape)
print('[*]Shape of Responses:', responses_embedded.shape)

[*]Shape of Tweets: (358477, 186, 100)
[*]Shape of Responses: (358476, 190)


## Writing the model in tensorflow

In [32]:
# Declaring the parameters
n_hidden = 128
n_epochs = 5

In [26]:
# Declaring the placeholders
encoder_inputs = tf.placeholder(shape = [None, None, e_dim], dtype = tf.float32, name = 'encoder_inputs')
decoder_inputs = tf.placeholder(shape = [None, None, e_dim], dtype = tf.float32, name = 'decoder_inputs')
decoder_outputs = tf.placeholder(shape = [None, None], dtype = tf.int32, name = 'decoder_outputs')

In [18]:
# Encoder
encoder_cell = tf.contrib.rnn.LSTMCell(n_hidden)
encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_inputs,
    dtype=tf.float32, time_major=True,
)

In [19]:
# Decoder
decoder_hidden_units = 128
decoder_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)
decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    decoder_cell, decoder_inputs,
    initial_state=encoder_final_state,
    dtype=tf.float32, time_major=True, scope="plain_decoder",
)

In [21]:
decoder_logits = tf.contrib.layers.linear(decoder_outputs, len(word2id))
decoder_prediction = tf.argmax(decoder_logits, 2)

In [27]:
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_outputs, depth=len(word2id), dtype=tf.float32),
    logits=decoder_logits,
)
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)

In [30]:
# Making decoder inputs
din_ = np.zeros(shape = [len(responses), max(decoder_len), e_dim], dtype = np.float32)
din_[0] = np.ones(shape = [max(decoder_len), e_dim], dtype = np.float32)
print(din_.shape)

(358476, 190, 100)


In [34]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# This may take sometime
for i in range(n_epochs):
    feed_dict = {encoder_inputs: tweets_embedded, decoder_inputs: din_, decoder_outputs: responses_embedded}
    loss, _ = sess.run([loss, train_op], feed_dict = feed_dict)
    print('[$]Epoch {0}, Loss {1}'.format(i+1, loss))

In [1]:
# Saving the model for future use
saver = tf.train.Saver()
saver.save(sess, 'seq2seq_simple')

NameError: name 'tf' is not defined