In [1]:
import re
import numpy as np
from random import shuffle

In [2]:
# Load the word list and turn every word into a sequence of character ids.
# Lines containing '/', '&' or a digit are skipped, one trailing '!' is dropped,
# and every word is lower-cased and terminated with a single space.
words = []
with open('words.txt') as file: #dataset of many words; the handle is closed on exit
    for line in file:
        if re.search(r'[/&\d]', line):
            continue
        word = line.rstrip('\n') #also correct for a final line that has no newline
        if word.endswith('!'):
            word = word[:-1]
        words.append(word.lower() + ' ') #add a space at the end of each word
if not words:
    raise ValueError("words.txt contains no usable words")

unique_chars = set(''.join(words))
#dictionary mapping from character to number; the characters are sorted because
#set iteration order changes between interpreter runs, which would otherwise
#give every kernel restart a different encoding
ref = {char: index for index, char in enumerate(sorted(unique_chars))}
backref = {index: char for char, index in ref.items()} #reverse of ^
#vector representation of words, one unique number for each character
word_embeddings = [[ref[char] for char in word] for word in words]
#[print(''.join([backref[index] for index in word])) for word in word_embeddings[1000:1010]]
max_word_len = max(len(word) for word in words)
#sort words by their length for training: bucket k holds the embeddings of
#length k + 1, i.e. k characters plus the trailing space
words_by_len = [[] for bucket_index in range(max_word_len)]
for word in word_embeddings:
    words_by_len[len(word)-1].append(word)
for length, words_of_length in enumerate(words_by_len):
    print(length, len(words_of_length))

0 0
1 26
2 610
3 4632
4 11144
5 22939
6 39511
7 52087
8 61009
9 61755
10 54321
11 46410
12 37524
13 27924
14 19258
15 12148
16 7114
17 3982
18 2003
19 1053
20 506
21 238
22 102
23 49
24 18
25 7
26 2
27 3
28 2
29 2
30 0
31 1
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 1


In [3]:
###MODEL
import tensorflow as tf
# Hyperparameters of the character-level LSTM.
num_classes = len(unique_chars)
num_layers = 5
state_size = 20
batch_size = 5

# x holds the input character ids and y the ids to predict at each position;
# both are [batch_size, sequence_length] with a variable sequence length.
x = tf.placeholder(tf.int32, [batch_size, None])
y = tf.placeholder(tf.int32, [batch_size, None])
# Initial LSTM state: for every layer a pair of [batch_size, state_size] tensors,
# unpacked below into one LSTMStateTuple per layer.
state = tf.placeholder(tf.float32, [num_layers, 2, batch_size, state_size])
state_tuple = tuple([tf.contrib.rnn.LSTMStateTuple(layer_state[0], layer_state[1]) for layer_state in tf.unstack(state, axis = 0)])
# Keep probability for the dropout on each layer's input. It defaults to the
# training value 0.5; feed 1.0 when evaluating `logits` so that predictions are
# not perturbed by dropout.
keep_prob = tf.placeholder_with_default(0.5, shape = [])

cell = lambda : tf.contrib.rnn.BasicLSTMCell(state_size, state_is_tuple = True)
cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(cell(), input_keep_prob = keep_prob) for layer in range(num_layers)], state_is_tuple = True)
# Each character id is cast to float and fed as a single scalar feature per time step.
output_state, current_state = tf.nn.dynamic_rnn(cell, tf.cast(tf.expand_dims(x, -1), tf.float32), initial_state = state_tuple)

# Output projection from the LSTM state to one score per character class.
W = tf.Variable(np.random.random([state_size, num_classes]), dtype = tf.float32)
b = tf.Variable(np.zeros([1, num_classes]), dtype = tf.float32)

# All time steps of all batch rows are flattened into rows of a single matrix,
# so logits is [batch_size * sequence_length, num_classes].
logits = tf.matmul(tf.reshape(output_state, [-1, state_size]), W) + b

# Mean cross-entropy between the predicted scores and the flattened targets.
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = tf.reshape(y, [-1]))
loss = tf.reduce_mean(losses)

In [8]:
#helper functions:
def convert_to_input(string, batch = None, mapping = None):
    """Encode `string` as character ids and repeat it to fill a batch.

    Args:
        string: text to encode; every character must be a key of `mapping`.
        batch: number of identical rows to produce. Defaults to the global
            `batch_size`, which is the batch dimension of the `x` placeholder.
        mapping: dictionary from character to id. Defaults to the global `ref`.

    Returns:
        Integer array of shape (batch, len(string)).

    Raises:
        KeyError: if `string` contains a character missing from `mapping`.
    """
    if batch is None:
        batch = batch_size
    if mapping is None:
        mapping = ref
    #explicit integer dtype so that an empty string does not yield a float array
    indices = np.array([mapping[char] for char in string], dtype = int)
    return np.tile(indices, (batch, 1))

In [10]:
def convert_to_output(logits):
    """Decode a matrix of class scores into text, one character per row.

    The index of the highest score in each row of `logits` is looked up in the
    global `backref` dictionary, and the resulting characters are concatenated
    in row order.
    """
    best_classes = np.argmax(logits, axis = 1)
    decoded = []
    for class_index in best_classes:
        decoded.append(backref[class_index])
    return ''.join(decoded)

In [4]:
# Adam with a step size of 0.03 minimises the mean cross-entropy `loss`.
optimizer = tf.train.AdamOptimizer(learning_rate = 0.03)
train_op = optimizer.minimize(loss)
# Built after minimize() so that the variables Adam creates are initialised too.
init_op = tf.global_variables_initializer()

In [5]:
#training:

# Train on every full batch of equally long words, longest words first.
sess = tf.Session()
sess.run(init_op)
#every batch starts from an all-zero LSTM state
_state = np.zeros([num_layers, 2, batch_size, state_size], dtype = np.float32)
count = 0 #number of batches trained so far
#losses since the last progress report; deliberately not named `losses`, which
#is the per-position loss tensor defined together with the model
recent_losses = []
for word_len in range(max_word_len - 1, 0, -1):
    print("Training for words of length:", word_len)
    num_words = len(words_by_len[word_len])
    #visit full batches only: trailing words that cannot fill a batch are
    #skipped, because the placeholders have a fixed batch dimension
    for index in range(0, num_words - batch_size + 1, batch_size):
        _x = words_by_len[word_len][index : index + batch_size]
        _y = np.roll(_x, -1, axis = 1) #targets are the inputs shifted one character to the left
        _y[:, -1] = ref[' '] #replace the wrapped-around first character with a space
        _train, _loss = sess.run([train_op, loss], {x: _x, y: _y, state: _state})
        count += 1
        recent_losses.append(_loss)
        if(count % 100 == 0):
            print("Trained:", count, "\tLoss: ", np.mean(recent_losses))
            recent_losses = []

Training for words of length: 45
Training for words of length: 44
Training for words of length: 43
Training for words of length: 42
Training for words of length: 41
Training for words of length: 40
Training for words of length: 39
Training for words of length: 38
Training for words of length: 37
Training for words of length: 36
Training for words of length: 35
Training for words of length: 34
Training for words of length: 33
Training for words of length: 32
Training for words of length: 31
Training for words of length: 30
Training for words of length: 29
Training for words of length: 28
Training for words of length: 27
Training for words of length: 26
Training for words of length: 25
Training for words of length: 24
Training for words of length: 23
Training for words of length: 22
Training for words of length: 21
Training for words of length: 20
Trained: 100 	Loss:  2.9044
Training for words of length: 19
Trained: 200 	Loss:  2.67944
Trained: 300 	Loss:  2.56225
Training for words of l

In [None]:
### test using:
#convert_to_output( sess.run( logits, 
#{x: convert_to_input(TEST_STR),
#state: np.zeros([num_layers, 2, batch_size, state_size], dtype = np.float32)
#}))