# Imports

In [1]:
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from sklearn.manifold import TSNE
# import pickle
import string
from six.moves.urllib.request import urlretrieve

# Data

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    f = zipfile.ZipFile(filename)
    for name in f.namelist():
        return tf.compat.as_str(f.read(name))
    f.close()

In [4]:
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


# Alphabet

In [5]:
#create alphabet
alphabet = ' '+string.ascii_lowercase
global alphabet
alphabet_size = len(alphabet)

# Helper functions

In [6]:
#character to int
def char2id(x):
    if x in alphabet:
        return alphabet.find(x)
    else:
        return 0

print char2id(' '),char2id('a'),char2id('z'),char2id('0'),char2id('9'),char2id('10')

0 1 26 0 0 0


In [7]:
#id to char
def id2char(x):
    return alphabet[x]

In [8]:
print id2char(0),id2char(1),id2char(26)#,id2char(27),id2char(36)

  a z


In [9]:
#letter to one_hot encoded vector
def char2vec(x):
    r = np.zeros([alphabet_size],dtype=np.int8)
    r[char2id(x)] = 1.0
    return r

In [10]:
print char2vec(' '),char2vec('b'),char2vec('z')

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [11]:
#turn the one-hot vector into a matrix
def vec2mat(x):
    return np.reshape(x,(1,len(x)))

In [12]:
vec2mat(char2vec('a'))

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]], dtype=int8)

In [13]:
#create batches (single examples)
batch_size = 1

# Batch for training examples

In [14]:
class Batch(object):
    def __init__(self,text,batch_size):
        self.text = text
        self.text_size = len(text)
        self.segment_size = self.text_size//batch_size
        self.cursors = [self.segment_size*b for b in range(batch_size)]
    
    def next(self):
        self.cursors = [(c + 1) % self.text_size for c in self.cursors]
        x,y = [self.text[c] for c in self.cursors], [self.text[(c+1)%self.text_size] for c in self.cursors]
        return x,y

In [15]:
batches = Batch(text,batch_size)

In [16]:
batches.next()

(['a'], ['n'])

# Sample a valid probability distribution

In [17]:
def sample_prob(prob):
    r=random.random()
    s=0
    for i,p in enumerate(prob):
        s+=p
        if s>=r:
            return i
    return 'Awry sampling probs'

In [18]:
sample_prob([0.5,.5,0])

0

# Window of batches

In [19]:
#currently missing a lot by calling b.next() all the time
def getWindowBatch(b,num_unrollings):
    window_batch = []
    for i in range(num_unrollings):
        window_batch.append(b.next())
    window_batch=zip(*window_batch)
    x,y = list(window_batch[0]),list(window_batch[1])
    return x,y

In [20]:
# batch_x,batch_y=getWindowBatch(b,num_unrollings)

In [21]:
# batch_x

In [22]:
# map(char2id,batch_x)

In [23]:
# map(char2vec,batch_x)

In [24]:
# map(char2vec,batch_x)[0].reshape([1,alphabet_size])

In [25]:
# for vec in map(char2vec,batch_x):
#     vec

In [26]:
# #need to run the graph first before this will run
# feed_dict={}
# x = map(char2vec,batch_x)
# for i in range(num_unrollings):
#     x[i]
#     feed_dict[train[i]] = x[i].reshape([1,alphabet_size])

In [27]:
# map(char2vec,batch_x)

# Create a random probrobability distribution of the elements in the alphabet

In [28]:
#random distribution
def random_dist():
    r = np.random.rand(alphabet_size)
    return r/np.sum(r)
random_dist()

array([ 0.07586472,  0.07680977,  0.06466536,  0.0048204 ,  0.02751051,
        0.03076177,  0.0135053 ,  0.02446717,  0.04314315,  0.0047593 ,
        0.01785831,  0.07374549,  0.00862088,  0.05419679,  0.03405243,
        0.06729639,  0.03927635,  0.01483897,  0.05838815,  0.00418858,
        0.02711004,  0.0567878 ,  0.02559417,  0.01566361,  0.05959378,
        0.03381121,  0.04266963])

# Negative log likelihood

In [29]:
# import matplotlib.pyplot as plt
# %matplotlib inline

In [30]:
# x = [i*.01 for i in range(0,100)]
# result=[]
# for a in x:
#     if a == 0:
#         result.append(10)
#     else:
#         result.append(-np.log(a))
# print result

In [31]:
# plt.plot(x,result)
# plt.ylabel('-log p(x)')
# plt.xlabel('p(x)')
# plt.show()

In [32]:
# np.array([[2,3,10],[2,2,4]])

In [33]:
# g = tf.Graph()
# with g.as_default():
#     outputs = tf.constant([[200.,3.,10.],[200.,2.,4.]])
#     labels=tf.constant([[0.,0.,1.],[0.,0.,1.]])
#     softmax = tf.nn.softmax(outputs)
#     fuck = tf.reduce_sum(tf.minimum(-tf.log(tf.reduce_sum(tf.multiply(tf.nn.softmax(outputs),labels),1)),10))

In [34]:
# sess = tf.Session(graph=g)
# print softmax.eval(session=sess)
# print fuck.eval(session=sess)

In [35]:
# -(np.log(.998)+np.log(.789))

In [36]:
# np.log(.789)

# Using builtin log likelihood

# Arch 1

In [37]:
batch_size=64

In [38]:
#build the graph
num_nodes = 50
num_unrollings = 20

g = tf.Graph()
with g.as_default():
    #input fed into the cell, could be a batch of training data or a single one-hot encoded vector
    train = list()
    for i in range(num_unrollings):
        train.append(tf.placeholder(tf.float32,shape=(batch_size,alphabet_size)))
    
    #the previous hidden layer gets fed into the cell
    output_feed= tf.placeholder(tf.float32,shape=(batch_size,num_nodes),name='one')
    
    #one-hot encoded labels for training
    labels = list()
    for i in range(num_unrollings):
        labels.append(tf.placeholder(tf.float32,shape=(batch_size,alphabet_size)))
        
    
    #validation place holder
    val_input = tf.placeholder(tf.float32,shape=(1,alphabet_size))
    val_output = tf.placeholder(tf.float32,shape=(1,num_nodes))
    
    
    #Variables
    #input matrix
    U = tf.Variable(tf.truncated_normal([alphabet_size,num_nodes],-0.1,0.1))
    
    #recurrent matrix multiplies previous hidden layer
    W = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],-0.1,0.1))
    
    #bias vector
    b = tf.Variable(tf.zeros([1,num_nodes]))
    
    #output matrix
    V = tf.Variable(tf.truncated_normal([num_nodes,alphabet_size],-0.1,0.1))
    c = tf.Variable(tf.zeros([1,alphabet_size]))
    
    
    
    #model
    def RNN(i,h_input):
        a = tf.matmul(i,U)+tf.matmul(h_input,W)+b
        h_output = tf.nn.tanh(a)
        o_out = tf.matmul(h_output,V)+c
        return h_output,o_out
    
    
    #when training truncate the gradients aftern num_unrollings
    for i in range(num_unrollings):
        if i == 0:
            outputs = list()
            hidden_after,output_after = RNN(train[i],output_feed)
        else:
            hidden_after,output_after = RNN(train[i],hidden)
        hidden = hidden_after
        outputs.append(output_after)
    
    #train
    
    #log likelihood loss
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.concat(0,outputs),tf.concat(0,labels)))

    
    #optimizer
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        learning_rate=5.0,global_step=global_step, decay_steps=5000, decay_rate=0.5, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients,var=zip(*optimizer.compute_gradients(loss))
    gradients_clipped, _ = tf.clip_by_global_norm(gradients, 1.25)
    opt=optimizer.apply_gradients(zip(gradients_clipped,var),global_step=global_step)
    
    # Validation
    val_hidden_after,val_output_after = RNN(val_input,val_output) #change train to input_in
    #val_logits = tf.matmul(val_output_after,W_softmax)+b_softmax
    val_probs = tf.nn.softmax(val_output_after)
    
    #add init op to the graph
    init = tf.initialize_all_variables()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [39]:
num_steps=50001
b = Batch(text,batch_size)

sess=tf.Session(graph=g)
sess.run(init)
average_loss = 0

for step in range(num_steps):
    #get the new inputs and labels
    batch_x,batch_y=getWindowBatch(b,num_unrollings)
    
    if step%b.text_size == 0:
        print "NEW EPOCH"

    #initialize the output
    if step == 0: #initialize the output state vectors
        output_pass = np.zeros([batch_size,num_nodes],dtype=np.float32)
    feed_dict={output_feed: output_pass}
    
    #trains x
    mega_batch_x = [] #each elemnt will be a batch.  there will be n elements where n is the number of unrollings
    for n in range(num_unrollings):
        batch = np.ndarray((batch_size,alphabet_size)) #contain all the one-hot encoding of the characters
        for ba in range(batch_size):
            batch[ba]=char2vec(batch_x[n][ba])
        mega_batch_x.append(batch)
    
    for i in range(num_unrollings):
        feed_dict[train[i]] = mega_batch_x[i]
    
    #trains y
    mega_batch_y = [] #each elemnt will be a batch.  there will be n elements where n is the number of unrollings
    for n in range(num_unrollings):
        batch = np.ndarray((batch_size,alphabet_size)) #contain all the one-hot encoding of the characters
        for ba in range(batch_size):
            batch[ba]=char2vec(batch_y[n][ba])
        mega_batch_y.append(batch)
    for i in range(num_unrollings):
        feed_dict[labels[i]] = mega_batch_y[i]
    
    output_pass,l,_=sess.run([hidden_after,loss,opt],feed_dict=feed_dict)
    average_loss += l
    if step % 1000 == 0:
        print 'Average loss: ',str(average_loss/1000)
        average_loss = 0
        
        
        print 'Learning rate: ', str(learning_rate.eval(session=sess))
        #sample and then generate text
        s=''
        
        #initialize the validations out, state, and character
        val_output_O = np.zeros(num_nodes).reshape(1,num_nodes)
        
        char_id = sample_prob(random_dist()) #create a random distribution then sample
        val_input_O = vec2mat(char2vec(id2char(char_id)))

        s+=id2char(char_id)
        for _ in range(100):
            feed_dict = {val_input: val_input_O, 
                         val_output: val_output_O}
            val_output_O,dist = sess.run([val_hidden_after,val_probs],feed_dict=feed_dict)
            char_id=sample_prob(dist[0])
            val_input_O = vec2mat(char2vec(id2char(char_id)))
            s+=id2char(char_id)
        print s

NEW EPOCH
Average loss:  0.00348291158676
Learning rate:  5.0
ne g g q q q g g e g g g g z g q z q g q g g g g g e q g q q q q q g g g q g g q g q q q z e e g q g 
Average loss:  15.1852117014
Learning rate:  5.0
joyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiyiy
Average loss:  13.7378981442
Learning rate:  5.0
z  i                                                    i                                            
Average loss:  15.5829564743
Learning rate:  5.0
km       o i    oe                             i                 e                          o        
Average loss:  15.5096308351
Learning rate:  5.0
ivooooooooirroooogooooooooovoooroooooooooooooooojoooooojoooorroorooooooogoooorrooooroooloomooovrooooo
Average loss:  15.5792679033
Learning rate:  2.5
jvmadmmaaadaaahahamamaaaaaaamadmaadamdaaaadaaahdahamaaaaahaamadaaafdahamdmdadaddpadaaaaadamadmmaadmdd
Average loss:  7.14237965536
Learning rate:  2.5
 iveoosseosooeeofeerreeoedesrrss

best: 2.17

# Arch 2 
## Train with Teacher forcing

In [40]:
batch_size = 128

In [50]:
#build the graph
num_nodes = 100
num_unrollings = 10

g = tf.Graph()
with g.as_default():
    #input fed into the cell, could be a batch of training data or a single one-hot encoded vector
    train = list()
    for i in range(num_unrollings):
        train.append(tf.placeholder(tf.float32,shape=(batch_size,alphabet_size)))
    
    #the previous output the gets fed into the cell
    output_feed= tf.placeholder(tf.float32,shape=(batch_size,alphabet_size),name='one')
    
    #one-hot encoded labels for training
    labels = list()
    for i in range(num_unrollings):
        labels.append(tf.placeholder(tf.float32,shape=(batch_size,alphabet_size)))
        
    
    #validation place holder
    val_input = tf.placeholder(tf.float32,shape=(1,alphabet_size))
    val_output = tf.placeholder(tf.float32,shape=(1,alphabet_size))
    
    
    #Variables
    #input matrix
    U = tf.Variable(tf.truncated_normal([alphabet_size,num_nodes],-0.1,0.1))
    
    #recurrent matrix multiplies previous output
    W = tf.Variable(tf.truncated_normal([alphabet_size,num_nodes],-0.1,0.1))
    
    #bias vector
    b = tf.Variable(tf.zeros([1,num_nodes]))
    
    #output matrix
    V = tf.Variable(tf.truncated_normal([num_nodes,alphabet_size],-0.1,0.1))
    c = tf.Variable(tf.zeros([1,alphabet_size]))
    
    
    
    #model
    def RNN(i,o_input):
        a = tf.matmul(i,U)+tf.matmul(o_input,W)+b
        #h_output = tf.nn.tanh(a)
        h_output = tf.nn.relu(a)
        o_out = tf.matmul(h_output,V)+c
        return o_out
    
    
    #when training truncate the gradients after num_unrollings
    for i in range(num_unrollings):
        if i == 0:
            outputs = list()
            output_after = RNN(train[i],output_feed)
        else:
            output_after = RNN(train[i],labels[i-1])
#         output = output_after
        outputs.append(output_after)
    
    #train
    
    #log likelihood loss
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.concat(0,outputs),tf.concat(0,labels)))

    
    #optimizer
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        learning_rate=1.0,global_step=global_step, decay_steps=5000, decay_rate=0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients,var=zip(*optimizer.compute_gradients(loss))
    gradients_clipped, _ = tf.clip_by_global_norm(gradients, 1.25)
    opt=optimizer.apply_gradients(zip(gradients_clipped,var),global_step=global_step)
    
    # Validation
    val_output_after = tf.nn.softmax(RNN(val_input,val_output))
    #val_logits = tf.matmul(val_output_after,W_softmax)+b_softmax
    val_probs = tf.nn.softmax(val_output_after)
    
    #add init op to the graph
    init = tf.initialize_all_variables()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [51]:
num_steps=50001
b = Batch(text,batch_size)

sess=tf.Session(graph=g)
sess.run(init)
average_loss = 0

for step in range(num_steps):
    if step%b.text_size == 0:
        print "NEW EPOCH"

    #initialize the output
    if step == 0: #initialize the output state vectors
        output_pass = np.zeros([batch_size,alphabet_size],dtype=np.float32)
    feed_dict={output_feed: output_pass}
    
    
    #get the new inputs and labels
    batch_x,batch_y=getWindowBatch(b,num_unrollings)
    
    #mega batches
    mega_batch_x = [] #each elemnt will be a batch.  there will be tau elements where tau is the number of unrollings
    mega_batch_y = []
    for n in range(num_unrollings):
        batchx = np.ndarray((batch_size,alphabet_size)) #contain all the one-hot encoding of the characters
        batchy = np.ndarray((batch_size,alphabet_size))
        for ba in range(batch_size):
            batchx[ba]=char2vec(batch_x[n][ba])
            batchy[ba]=char2vec(batch_y[n][ba])
        mega_batch_x.append(batch)
        mega_batch_y.append(batch)
    
    for i in range(num_unrollings):
        feed_dict[train[i]] = mega_batch_x[i]
        feed_dict[labels[i]] = mega_batch_y[i]
    
    output_pass,l,_=sess.run([output_after,loss,opt],feed_dict=feed_dict)
    average_loss += l
    if step % 1000 == 0:
        print 'Average loss: ',str(average_loss/1000)
        average_loss = 0
        
        
        print 'Learning rate: ', str(learning_rate.eval(session=sess))
        #sample and then generate text
        s=''
        
        #initialize the validations out and character
        #val_output_O = np.zeros(alphabet_size).reshape(1,alphabet_size)
        val_output_O = vec2mat(char2vec(id2char(sample_prob(random_dist()))))
        
        char_id = sample_prob(random_dist()) #create a random distribution then sample
        val_input_O = vec2mat(char2vec(id2char(char_id)))

        s+=id2char(char_id)
        for _ in range(100):
            feed_dict = {val_input: val_input_O, 
                         val_output: val_output_O}
            val_output_O,dist = sess.run([val_output_after,val_probs],feed_dict=feed_dict)
            char_id=sample_prob(dist[0])
            val_input_O = vec2mat(char2vec(id2char(char_id)))
            s+=id2char(char_id)
        print s

NEW EPOCH
Average loss:  0.00330066585541
Learning rate:  1.0
mbludwyrdvsasrzvagtwhgojkxvvb geghurlgprwunaup usmumcqakahlbaxc  cnjwnvbcwbky vbffxezqwdplhpfi ghpiff
Average loss:  0.106197460013
Learning rate:  1.0
vejmmmtlxuiqifjtdwmvmdoonipojiuwstbxpqpykvtsdekcqspfpikxoljyrlrtbkhhyrznagqqpp  wqwttwjkbsjkvdbbqzrjl
Average loss:  0.000740274738346
Learning rate:  1.0
tzqtrorqrtpplvlqurahgtvjazhitvtmtjonptbglcahta syyafznc pqqbitiodkvmotfzaqngwpseczfee snglrkvuwyxvepc
Average loss:  0.000384142373339
Learning rate:  1.0
sbdbosr tdeeeojpwmhkbfzvmkkmuty kmcxkrcytm fjaappccfqvookfdhzvmmcdjbwujukshsbelneij yuondxfkmjyhlpsih
Average loss:  0.000256899726563
Learning rate:  1.0
bfrcnbmxy pquidavlqzifluvewcodcjogvxnrazpuusxbdllttwqkzdujr  ajtoasd  tsfnmgjkpoglsdle cepuul mfmrska
Average loss:  0.000191746679877
Learning rate:  0.1
sillmfmvhznnm eebbuhlkbfkuounhydfhkb pshadmegdkocmuj oxgeyosobvebepwjjmabfumnf shfgdwaabsxymupagchiqc
Average loss:  0.000167272343082
Learning rate:  0.1
yqttybbelms

KeyboardInterrupt: 