In [1]:
import utils
import numpy as np
import edf
from time import time
import pickle
import os

# Load the three PTB splits in train -> valid -> test order. Each call returns
# the split's data together with a count; the validation count (vacnt) is what
# Eval later uses to normalise perplexity.
# NOTE(review): meaning of the second argument (False) is defined in utils — confirm.
(train_data, trcnt), (valid_data, vacnt), (test_data, tecnt) = (
    utils.load_data_onechar(split_path, False)
    for split_path in ('data/ptb.train.txt',
                       'data/ptb.valid.txt',
                       'data/ptb.test.txt')
)

In [3]:
# Baseline experiment: plain LSTM character model.
# Statement order in this block matters: the xavier draws follow
# np.random.seed(0), so reordering them would change the initial weights.

# Width of the character embedding and of the LSTM hidden/cell state.
hidden_dim = 200
# Vocabulary size as reported by utils.
n_vocab = utils.n_vocab
# Number of sequences per minibatch.
batch = 50
# Ordered list of Params that is pickled to / restored from the checkpoint.
parameters = []
# Checkpoint file for this experiment.
model = 'model_LSTM.pkl'
# Learning rate handed to edf.SGD.
eta = 0.5
# Factor applied to eta whenever validation loss fails to improve.
decay = 0.9

# Input node; each minibatch is placed here with inp.set(...) before the graph is built.
inp = edf.Value()
np.random.seed(0)

# presumably clears edf's global parameter registry before new Params are created — TODO confirm in edf
edf.params = []
# Character embedding table, one row per vocabulary entry.
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate (weights act on the concatenation [x_t, h], hence 2*hidden_dim rows)
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (passed through Tanh in LSTMCell)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))

# Output projection from the hidden state to vocabulary scores.
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# The order of this list defines the checkpoint layout; keep it stable.
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V])


# Resume from an existing checkpoint: the pickle holds one value per entry of
# `parameters`, in the same order.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    

def LSTMCell(xt, h, c):
    """Add one LSTM time step to the edf graph.

    Args:
        xt: node holding the current input vectors.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.

    Returns:
        (h_next, c_next): nodes for the new hidden and cell state.
    """

    def gate(squash, W, b):
        # squash([xt, h] . W + b); every gate builds its own ConCat node.
        return squash(edf.Add(edf.VDot(edf.ConCat(xt, h), W), b))

    # Nodes are created in the order forget, input, output, candidate.
    forget_gate = gate(edf.Sigmoid, Wf, bf)
    input_gate = gate(edf.Sigmoid, Wi, bi)
    output_gate = gate(edf.Sigmoid, Wo, bo)
    candidate = gate(edf.Tanh, Wc, bc)

    # New cell state: gated old state plus gated candidate.
    retained = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(retained, written)

    # Hidden state: output gate applied to the squashed cell state.
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Build the unrolled next-character graph for the batch currently in ``inp``.

    ``inp.value`` is read as a 2-D array indexed [sequence, time]; column t is
    the input of step t and column t+1 its target.

    Returns:
        (loss, score): ``loss`` is the MeanwithMask node over the per-step log
        losses (positions whose target id is 0 are masked out); ``score`` is
        the list of per-step SoftMax nodes.
    """
    # Start a fresh graph for this batch.
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []
    for t in range(T - 1):
        xt = edf.Reshape(edf.Embed(edf.Value(tokens[:, t]), C2V), [-1, hidden_dim])
        h, c = LSTMCell(xt, h, c)
        p = edf.SoftMax(edf.VDot(h, V))

        # Log loss of the probability assigned to the next character.
        target = edf.Value(tokens[:, t + 1])
        step_loss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow the loss node one step at a time via ConCat.
        loss = step_loss if t == 0 else edf.ConCat(loss, step_loss)
        score.append(p)

    # 1 where the target id is non-zero, 0 elsewhere.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Return the summed negative log-probability of the targets held in ``inp``.

    Args:
        score: list of per-step SoftMax nodes as returned by ``BuildModel``;
            ``score[t].value`` is indexed [sequence, class].

    Returns:
        ``-sum(log p)`` over every position whose target id is non-zero and
        whose predicted probability is non-zero. ``Eval`` accumulates this
        over batches, divides by a count and exponentiates it.
    """
    # Stack the per-step arrays so that prob is indexed [sequence, step, class].
    prob = np.stack([p.value for p in score], axis=1)
    n_seq, n_step = prob.shape[0], prob.shape[1]

    shifted = inp.value[:, 1:]           # target id of every step
    masks = (shifted != 0)               # target id 0 does not count
    targets = shifted.astype(np.int32)

    # Probability assigned to each target, gathered with integer indexing
    # (no flat-offset arithmetic, so no dependence on '/' being integer division).
    rows = np.arange(n_seq)[:, None]
    cols = np.arange(n_step)[None, :]
    picked = prob[rows, cols, targets]

    # Zero probabilities are dropped together with masked positions so that
    # log() is never taken of 0.
    return -np.sum(np.log(picked[np.nonzero(picked * masks)]))

def Predict(max_step, prefix):
    """Greedily generate character ids that continue ``prefix``.

    Args:
        max_step: number of time steps to unroll; also the maximum length of
            the returned list.
        prefix: non-empty sequence of character ids that are fed to the
            network unchanged for the first ``len(prefix)`` steps.

    Returns:
        List with the value of each step's input node: the prefix ids followed
        by the ArgMax outputs, cut right after the first '}' id if one occurs.

    Raises:
        ValueError: if ``prefix`` is empty while ``max_step`` > 0, because
            there would be no input for the first step.
    """
    if max_step > 0 and len(prefix) == 0:
        raise ValueError("Predict needs a non-empty prefix to seed generation")

    # Start a fresh graph.
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    prediction = []

    for t in range(max_step):

        # While inside the prefix the given id is the input; afterwards the
        # previous step's ArgMax node (still bound to `pred`) is fed back.
        if t < len(prefix):
            pred = edf.Value(prefix[t])
        prediction.append(pred)

        wordvec = edf.Embed(pred, C2V)
        xt = edf.Reshape(wordvec, [-1, hidden_dim])
        h, c = LSTMCell(xt, h, c)
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    # A single forward pass evaluates the whole unrolled graph.
    edf.Forward()

    idx = [node.value for node in prediction]
    stop_idx = utils.to_index('}')

    if stop_idx in idx:
        return idx[0:idx.index(stop_idx)+1]
    else:
        return idx

def Eval(data, cnt):
    """Evaluate the current parameters on ``data``.

    Args:
        data: sequence of examples; it is cut into consecutive slices of
            ``batch`` items.
        cnt: normaliser for the perplexity (callers pass the count returned
            next to the data by ``utils.load_data_onechar``).

    Returns:
        (perp, avg_loss): ``exp(total CalPerp / cnt)`` and the mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    nll_total = 0.
    loss_total = 0.

    for start in starts:
        # Pad the slice, load it into the input node and run a forward pass.
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_total += loss.value
        nll_total += CalPerp(score)

    return np.exp(nll_total / cnt), loss_total / len(starts)


############################################### training loop #####################################################

# Fixed partition of the training set into minibatches; only the visiting
# order changes between epochs.
batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# Baseline before any update: validation perplexity/loss and a greedy sample.
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    for k in range(len(minbatches)):

        # One SGD step: pad the batch, rebuild the graph, forward, backward,
        # clip gradients, update.
        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # Save the new best parameters. The `with` block closes (and thereby
        # flushes) the file before a later epoch may read it back.
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # No improvement: decay the learning rate and roll back to the last
        # best checkpoint. If none has been written yet (first epoch did not
        # improve and no file existed), keep the current parameters.
        eta *= decay
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 49.13856 Avg loss = 3.92808
Initial generated sentence 
the agreements bringwx*//659e&@$$vl777#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u7dm77dmx37dmal77#*b/655.7qqqqqqqqyyy.wxp3u
Epoch 0: Perplexity: 7.97855 Avg loss = 2.10561 [29.985 mins]
Epoch 0: generated sentence 
the agreements bring the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Epoch 1: Perplexity: 6.34002 A

In [32]:
# Re-import edf so that edits to edf.py take effect without restarting the kernel.
try:
    reload                       # Python 2: builtin
except NameError:                # Python 3: the builtin moved to importlib
    from importlib import reload
reload(edf)

<module 'edf' from 'edf.py'>

In [4]:
# Experiment: LSTM with batch normalisation on the gate pre-activations
# ("momentum" variant of edf.BatchNorm).
# Statement order in this block matters: the xavier draws follow
# np.random.seed(0), so reordering them would change the initial weights.

# Width of the character embedding and of the LSTM hidden/cell state.
hidden_dim = 200
# Vocabulary size as reported by utils.
n_vocab = utils.n_vocab
# Number of sequences per minibatch.
batch = 50
# Ordered list of Params that is pickled to / restored from the checkpoint.
parameters = []
# Checkpoint file for this experiment.
model = 'model_LSTM_Norm1.pkl'
# Learning rate handed to edf.SGD.
eta = 0.5
# Factor applied to eta whenever validation loss fails to improve.
decay = 0.9

# Input node; each minibatch is placed here with inp.set(...) before the graph is built.
inp = edf.Value()
np.random.seed(0)

# presumably clears edf's global parameter registry before new Params are created — TODO confirm in edf
edf.params = []
# Character embedding table, one row per vocabulary entry.
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate (weights act on the concatenation [x_t, h], hence 2*hidden_dim rows)
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (passed through Tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# Batch-norm scale and shift, shared by all four gates. These use a constant
# initialisation (gamma = 0.1, beta = 0), not xavier.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))


# Output projection from the hidden state to vocabulary scores.
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# The order of this list defines the checkpoint layout; keep it stable.
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


# Resume from an existing checkpoint: the pickle holds one value per entry of
# `parameters`, in the same order.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    
# LSTM step with batch norm applied to each gate's pre-activation.
# (Normalising c_next before the final Tanh was tried and is not used.)
def LSTMCellNorm(xt, h, c, test):
    """Add one batch-normalised LSTM time step to the edf graph.

    Args:
        xt: node holding the current input vectors.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        test: node forwarded unchanged to every ``edf.BatchNorm`` call
            (callers pass ``edf.Value(True)`` or ``edf.Value(False)``).

    Returns:
        (h_next, c_next): nodes for the new hidden and cell state.
    """

    def gate(squash, W, b):
        # squash(BatchNorm([xt, h] . W) + b); gamma and beta are shared by all gates.
        pre = edf.VDot(edf.ConCat(xt, h), W)
        return squash(edf.Add(edf.BatchNorm(pre, gamma, beta, test), b))

    # Nodes are created in the order forget, input, output, candidate.
    forget_gate = gate(edf.Sigmoid, Wf, bf)
    input_gate = gate(edf.Sigmoid, Wi, bi)
    output_gate = gate(edf.Sigmoid, Wo, bo)
    candidate = gate(edf.Tanh, Wc, bc)

    # New cell state: gated old state plus gated candidate.
    retained = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(retained, written)

    # Hidden state: output gate applied to the (un-normalised) squashed cell state.
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Build the unrolled next-character graph for the batch currently in ``inp``.

    ``inp.value`` is read as a 2-D array indexed [sequence, time]; column t is
    the input of step t and column t+1 its target.

    Returns:
        (loss, score): ``loss`` is the MeanwithMask node over the per-step log
        losses (positions whose target id is 0 are masked out); ``score`` is
        the list of per-step SoftMax nodes.
    """
    # Start a fresh graph for this batch.
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []
    for t in range(T - 1):
        xt = edf.Reshape(edf.Embed(edf.Value(tokens[:, t]), C2V), [-1, hidden_dim])
        # The batch-norm flag is always False here, also when called from Eval,
        # whereas Predict passes True. NOTE(review): confirm that evaluating
        # with the training-mode flag is intended.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))

        # Log loss of the probability assigned to the next character.
        target = edf.Value(tokens[:, t + 1])
        step_loss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow the loss node one step at a time via ConCat.
        loss = step_loss if t == 0 else edf.ConCat(loss, step_loss)
        score.append(p)

    # 1 where the target id is non-zero, 0 elsewhere.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Return the summed negative log-probability of the targets held in ``inp``.

    Args:
        score: list of per-step SoftMax nodes as returned by ``BuildModel``;
            ``score[t].value`` is indexed [sequence, class].

    Returns:
        ``-sum(log p)`` over every position whose target id is non-zero and
        whose predicted probability is non-zero. ``Eval`` accumulates this
        over batches, divides by a count and exponentiates it.
    """
    # Stack the per-step arrays so that prob is indexed [sequence, step, class].
    prob = np.stack([p.value for p in score], axis=1)
    n_seq, n_step = prob.shape[0], prob.shape[1]

    shifted = inp.value[:, 1:]           # target id of every step
    masks = (shifted != 0)               # target id 0 does not count
    targets = shifted.astype(np.int32)

    # Probability assigned to each target, gathered with integer indexing
    # (no flat-offset arithmetic, so no dependence on '/' being integer division).
    rows = np.arange(n_seq)[:, None]
    cols = np.arange(n_step)[None, :]
    picked = prob[rows, cols, targets]

    # Zero probabilities are dropped together with masked positions so that
    # log() is never taken of 0.
    return -np.sum(np.log(picked[np.nonzero(picked * masks)]))

def Predict(max_step, prefix):
    """Greedily generate character ids that continue ``prefix``.

    Args:
        max_step: number of time steps to unroll; also the maximum length of
            the returned list.
        prefix: non-empty sequence of character ids that are fed to the
            network unchanged for the first ``len(prefix)`` steps.

    Returns:
        List with the value of each step's input node: the prefix ids followed
        by the ArgMax outputs, cut right after the first '}' id if one occurs.

    Raises:
        ValueError: if ``prefix`` is empty while ``max_step`` > 0, because
            there would be no input for the first step.
    """
    if max_step > 0 and len(prefix) == 0:
        raise ValueError("Predict needs a non-empty prefix to seed generation")

    # Start a fresh graph.
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    prediction = []

    for t in range(max_step):

        # While inside the prefix the given id is the input; afterwards the
        # previous step's ArgMax node (still bound to `pred`) is fed back.
        if t < len(prefix):
            pred = edf.Value(prefix[t])
        prediction.append(pred)

        wordvec = edf.Embed(pred, C2V)
        xt = edf.Reshape(wordvec, [-1, hidden_dim])
        # Generation passes True as the batch-norm flag.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    # A single forward pass evaluates the whole unrolled graph.
    edf.Forward()

    idx = [node.value for node in prediction]
    stop_idx = utils.to_index('}')

    if stop_idx in idx:
        return idx[0:idx.index(stop_idx)+1]
    else:
        return idx

def Eval(data, cnt):
    """Evaluate the current parameters on ``data``.

    Args:
        data: sequence of examples; it is cut into consecutive slices of
            ``batch`` items.
        cnt: normaliser for the perplexity (callers pass the count returned
            next to the data by ``utils.load_data_onechar``).

    Returns:
        (perp, avg_loss): ``exp(total CalPerp / cnt)`` and the mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    nll_total = 0.
    loss_total = 0.

    for start in starts:
        # Pad the slice, load it into the input node and run a forward pass.
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_total += loss.value
        nll_total += CalPerp(score)

    return np.exp(nll_total / cnt), loss_total / len(starts)


############################################### training loop #####################################################

# Fixed partition of the training set into minibatches; only the visiting
# order changes between epochs.
batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# Baseline before any update: validation perplexity/loss and a greedy sample.
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    for k in range(len(minbatches)):

        # One SGD step: pad the batch, rebuild the graph, forward, backward,
        # clip gradients, update.
        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # Save the new best parameters. The `with` block closes (and thereby
        # flushes) the file before a later epoch may read it back.
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # No improvement: decay the learning rate and roll back to the last
        # best checkpoint. If none has been written yet (first epoch did not
        # improve and no file existed), keep the current parameters.
        eta *= decay
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 182.90955 Avg loss = 5.27921
Initial generated sentence 
the agreements bringaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Epoch 0: Perplexity: 9.53933 Avg loss = 2.28844 [32.666 mins]
Epoch 0: generated sentence 
the agreements bring> .d .d .d .d .d .d .d .d .data nd> .datat}
Epoch 1: Perplexity: 8.73949 Avg loss = 2.20002 [27.502 mins]
Epoch 1: generated sentence 
the agreements bringvt. .dst.datk> .dvtk> .ddyvtvtvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk> .dvtvtk> .dvtk> .dvtvtk> .dvtvtk

In [11]:
# Experiment: LSTM with batch normalisation on the gate pre-activations
# ("batch norm from paper" variant).
# NOTE(review): apart from the checkpoint name, the code of this cell matches
# the "momentum" cell; the difference presumably lives in edf.BatchNorm itself
# (edf.py edited and reloaded between runs) — confirm.
# Statement order in this block matters: the xavier draws follow
# np.random.seed(0), so reordering them would change the initial weights.

# Width of the character embedding and of the LSTM hidden/cell state.
hidden_dim = 200
# Vocabulary size as reported by utils.
n_vocab = utils.n_vocab
# Number of sequences per minibatch.
batch = 50
# Ordered list of Params that is pickled to / restored from the checkpoint.
parameters = []
# Checkpoint file for this experiment.
model = 'model_LSTM_Norm2.pkl'
# Learning rate handed to edf.SGD.
eta = 0.5
# Factor applied to eta whenever validation loss fails to improve.
decay = 0.9

# Input node; each minibatch is placed here with inp.set(...) before the graph is built.
inp = edf.Value()
np.random.seed(0)

# presumably clears edf's global parameter registry before new Params are created — TODO confirm in edf
edf.params = []
# Character embedding table, one row per vocabulary entry.
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate (weights act on the concatenation [x_t, h], hence 2*hidden_dim rows)
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (passed through Tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# Batch-norm scale and shift, shared by all four gates. These use a constant
# initialisation (gamma = 0.1, beta = 0), not xavier.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))


# Output projection from the hidden state to vocabulary scores.
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# The order of this list defines the checkpoint layout; keep it stable.
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


# Resume from an existing checkpoint: the pickle holds one value per entry of
# `parameters`, in the same order.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    
# LSTM step with batch norm applied to each gate's pre-activation.
# (Normalising c_next before the final Tanh was tried and is not used.)
def LSTMCellNorm(xt, h, c, test):
    """Add one batch-normalised LSTM time step to the edf graph.

    Args:
        xt: node holding the current input vectors.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        test: node forwarded unchanged to every ``edf.BatchNorm`` call
            (callers pass ``edf.Value(True)`` or ``edf.Value(False)``).

    Returns:
        (h_next, c_next): nodes for the new hidden and cell state.
    """

    def gate(squash, W, b):
        # squash(BatchNorm([xt, h] . W) + b); gamma and beta are shared by all gates.
        pre = edf.VDot(edf.ConCat(xt, h), W)
        return squash(edf.Add(edf.BatchNorm(pre, gamma, beta, test), b))

    # Nodes are created in the order forget, input, output, candidate.
    forget_gate = gate(edf.Sigmoid, Wf, bf)
    input_gate = gate(edf.Sigmoid, Wi, bi)
    output_gate = gate(edf.Sigmoid, Wo, bo)
    candidate = gate(edf.Tanh, Wc, bc)

    # New cell state: gated old state plus gated candidate.
    retained = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(retained, written)

    # Hidden state: output gate applied to the (un-normalised) squashed cell state.
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Build the unrolled next-character graph for the batch currently in ``inp``.

    ``inp.value`` is read as a 2-D array indexed [sequence, time]; column t is
    the input of step t and column t+1 its target.

    Returns:
        (loss, score): ``loss`` is the MeanwithMask node over the per-step log
        losses (positions whose target id is 0 are masked out); ``score`` is
        the list of per-step SoftMax nodes.
    """
    # Start a fresh graph for this batch.
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []
    for t in range(T - 1):
        xt = edf.Reshape(edf.Embed(edf.Value(tokens[:, t]), C2V), [-1, hidden_dim])
        # The batch-norm flag is always False here, also when called from Eval,
        # whereas Predict passes True. NOTE(review): confirm that evaluating
        # with the training-mode flag is intended.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))

        # Log loss of the probability assigned to the next character.
        target = edf.Value(tokens[:, t + 1])
        step_loss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow the loss node one step at a time via ConCat.
        loss = step_loss if t == 0 else edf.ConCat(loss, step_loss)
        score.append(p)

    # 1 where the target id is non-zero, 0 elsewhere.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Return the summed negative log-probability of the targets held in ``inp``.

    Args:
        score: list of per-step SoftMax nodes as returned by ``BuildModel``;
            ``score[t].value`` is indexed [sequence, class].

    Returns:
        ``-sum(log p)`` over every position whose target id is non-zero and
        whose predicted probability is non-zero. ``Eval`` accumulates this
        over batches, divides by a count and exponentiates it.
    """
    # Stack the per-step arrays so that prob is indexed [sequence, step, class].
    prob = np.stack([p.value for p in score], axis=1)
    n_seq, n_step = prob.shape[0], prob.shape[1]

    shifted = inp.value[:, 1:]           # target id of every step
    masks = (shifted != 0)               # target id 0 does not count
    targets = shifted.astype(np.int32)

    # Probability assigned to each target, gathered with integer indexing
    # (no flat-offset arithmetic, so no dependence on '/' being integer division).
    rows = np.arange(n_seq)[:, None]
    cols = np.arange(n_step)[None, :]
    picked = prob[rows, cols, targets]

    # Zero probabilities are dropped together with masked positions so that
    # log() is never taken of 0.
    return -np.sum(np.log(picked[np.nonzero(picked * masks)]))

def Predict(max_step, prefix):
    """Greedily generate character ids that continue ``prefix``.

    Args:
        max_step: number of time steps to unroll; also the maximum length of
            the returned list.
        prefix: non-empty sequence of character ids that are fed to the
            network unchanged for the first ``len(prefix)`` steps.

    Returns:
        List with the value of each step's input node: the prefix ids followed
        by the ArgMax outputs, cut right after the first '}' id if one occurs.

    Raises:
        ValueError: if ``prefix`` is empty while ``max_step`` > 0, because
            there would be no input for the first step.
    """
    if max_step > 0 and len(prefix) == 0:
        raise ValueError("Predict needs a non-empty prefix to seed generation")

    # Start a fresh graph.
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    prediction = []

    for t in range(max_step):

        # While inside the prefix the given id is the input; afterwards the
        # previous step's ArgMax node (still bound to `pred`) is fed back.
        if t < len(prefix):
            pred = edf.Value(prefix[t])
        prediction.append(pred)

        wordvec = edf.Embed(pred, C2V)
        xt = edf.Reshape(wordvec, [-1, hidden_dim])
        # Generation passes True as the batch-norm flag.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    # A single forward pass evaluates the whole unrolled graph.
    edf.Forward()

    idx = [node.value for node in prediction]
    stop_idx = utils.to_index('}')

    if stop_idx in idx:
        return idx[0:idx.index(stop_idx)+1]
    else:
        return idx

def Eval(data, cnt):
    """Evaluate the current parameters on ``data``.

    Args:
        data: sequence of examples; it is cut into consecutive slices of
            ``batch`` items.
        cnt: normaliser for the perplexity (callers pass the count returned
            next to the data by ``utils.load_data_onechar``).

    Returns:
        (perp, avg_loss): ``exp(total CalPerp / cnt)`` and the mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    nll_total = 0.
    loss_total = 0.

    for start in starts:
        # Pad the slice, load it into the input node and run a forward pass.
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_total += loss.value
        nll_total += CalPerp(score)

    return np.exp(nll_total / cnt), loss_total / len(starts)


############################################### training loop #####################################################

# Fixed partition of the training set into minibatches; only the visiting
# order changes between epochs.
batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# Baseline before any update: validation perplexity/loss and a greedy sample.
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    for k in range(len(minbatches)):

        # One SGD step: pad the batch, rebuild the graph, forward, backward,
        # clip gradients, update.
        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # Save the new best parameters. The `with` block closes (and thereby
        # flushes) the file before a later epoch may read it back.
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # No improvement: decay the learning rate and roll back to the last
        # best checkpoint. If none has been written yet (first epoch did not
        # improve and no file existed), keep the current parameters.
        eta *= decay
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 9.30435 Avg loss = 2.26632
Initial generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 0: Perplexity: 9.05345 Avg loss = 2.23770 [30.059 mins]
Epoch 0: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 1: Perplexity: 9.22198 Av


KeyboardInterrupt



In [12]:
# Re-import edf so that edits to edf.py take effect without restarting the kernel.
try:
    reload                       # Python 2: builtin
except NameError:                # Python 3: the builtin moved to importlib
    from importlib import reload
reload(edf)

<module 'edf' from 'edf.py'>

In [18]:
# Experiment: LSTM with batch normalisation on the gate pre-activations
# ("batch norm from online code" variant; the cell below calls edf.BatchNormPaper).
# Statement order in this block matters: the xavier draws follow
# np.random.seed(0), so reordering them would change the initial weights.
# Width of the character embedding and of the LSTM hidden/cell state.
hidden_dim = 200
# Vocabulary size as reported by utils.
n_vocab = utils.n_vocab
# Number of sequences per minibatch.
batch = 50
# Ordered list of Params that is pickled to / restored from the checkpoint.
parameters = []
# Checkpoint file for this experiment.
model = 'model_LSTM_Norm4.pkl'
# Learning rate handed to edf.SGD.
eta = 0.5
# Factor applied to eta whenever validation loss fails to improve.
decay = 0.9

# Input node; each minibatch is placed here with inp.set(...) before the graph is built.
inp = edf.Value()
np.random.seed(0)

# presumably clears edf's global parameter registry before new Params are created — TODO confirm in edf
edf.params = []
# Character embedding table, one row per vocabulary entry.
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate (weights act on the concatenation [x_t, h], hence 2*hidden_dim rows)
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (passed through Tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# Batch-norm scale and shift, shared by all four gates. These use a constant
# initialisation (gamma = 0.1, beta = 0), not xavier.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))




# Output projection from the hidden state to vocabulary scores.
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# The order of this list defines the checkpoint layout; keep it stable.
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


# Resume from an existing checkpoint: the pickle holds one value per entry of
# `parameters`, in the same order.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    
# LSTM step with edf.BatchNormPaper applied to each gate's pre-activation.
# (Normalising c_next before the final Tanh was tried and is not used.)
def LSTMCellNorm(xt, h, c, test):
    """Add one batch-normalised LSTM time step to the edf graph.

    Args:
        xt: node holding the current input vectors.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        test: node forwarded unchanged to every ``edf.BatchNormPaper`` call
            (callers pass ``edf.Value(True)`` or ``edf.Value(False)``).

    Returns:
        (h_next, c_next): nodes for the new hidden and cell state.
    """

    def gate(squash, W, b):
        # squash(BatchNormPaper([xt, h] . W) + b); gamma and beta are shared by all gates.
        pre = edf.VDot(edf.ConCat(xt, h), W)
        return squash(edf.Add(edf.BatchNormPaper(pre, gamma, beta, test), b))

    # Nodes are created in the order forget, input, output, candidate.
    forget_gate = gate(edf.Sigmoid, Wf, bf)
    input_gate = gate(edf.Sigmoid, Wi, bi)
    output_gate = gate(edf.Sigmoid, Wo, bo)
    candidate = gate(edf.Tanh, Wc, bc)

    # New cell state: gated old state plus gated candidate.
    retained = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(retained, written)

    # Hidden state: output gate applied to the (un-normalised) squashed cell state.
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Build the unrolled next-character graph for the batch currently in ``inp``.

    ``inp.value`` is read as a 2-D array indexed [sequence, time]; column t is
    the input of step t and column t+1 its target.

    Returns:
        (loss, score): ``loss`` is the MeanwithMask node over the per-step log
        losses (positions whose target id is 0 are masked out); ``score`` is
        the list of per-step SoftMax nodes.
    """
    # Start a fresh graph for this batch.
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []
    for t in range(T - 1):
        xt = edf.Reshape(edf.Embed(edf.Value(tokens[:, t]), C2V), [-1, hidden_dim])
        # The batch-norm flag is always False here, also when called from Eval,
        # whereas Predict passes True. NOTE(review): confirm that evaluating
        # with the training-mode flag is intended.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))

        # Log loss of the probability assigned to the next character.
        target = edf.Value(tokens[:, t + 1])
        step_loss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow the loss node one step at a time via ConCat.
        loss = step_loss if t == 0 else edf.ConCat(loss, step_loss)
        score.append(p)

    # 1 where the target id is non-zero, 0 elsewhere.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Return the summed negative log-probability of the targets held in ``inp``.

    Args:
        score: list of per-step SoftMax nodes as returned by ``BuildModel``;
            ``score[t].value`` is indexed [sequence, class].

    Returns:
        ``-sum(log p)`` over every position whose target id is non-zero and
        whose predicted probability is non-zero. ``Eval`` accumulates this
        over batches, divides by a count and exponentiates it.
    """
    # Stack the per-step arrays so that prob is indexed [sequence, step, class].
    prob = np.stack([p.value for p in score], axis=1)
    n_seq, n_step = prob.shape[0], prob.shape[1]

    shifted = inp.value[:, 1:]           # target id of every step
    masks = (shifted != 0)               # target id 0 does not count
    targets = shifted.astype(np.int32)

    # Probability assigned to each target, gathered with integer indexing
    # (no flat-offset arithmetic, so no dependence on '/' being integer division).
    rows = np.arange(n_seq)[:, None]
    cols = np.arange(n_step)[None, :]
    picked = prob[rows, cols, targets]

    # Zero probabilities are dropped together with masked positions so that
    # log() is never taken of 0.
    return -np.sum(np.log(picked[np.nonzero(picked * masks)]))

def Predict(max_step, prefix):
    """Greedily generate character ids that continue ``prefix``.

    Args:
        max_step: number of time steps to unroll; also the maximum length of
            the returned list.
        prefix: non-empty sequence of character ids that are fed to the
            network unchanged for the first ``len(prefix)`` steps.

    Returns:
        List with the value of each step's input node: the prefix ids followed
        by the ArgMax outputs, cut right after the first '}' id if one occurs.

    Raises:
        ValueError: if ``prefix`` is empty while ``max_step`` > 0, because
            there would be no input for the first step.
    """
    if max_step > 0 and len(prefix) == 0:
        raise ValueError("Predict needs a non-empty prefix to seed generation")

    # Start a fresh graph.
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    prediction = []

    for t in range(max_step):

        # While inside the prefix the given id is the input; afterwards the
        # previous step's ArgMax node (still bound to `pred`) is fed back.
        if t < len(prefix):
            pred = edf.Value(prefix[t])
        prediction.append(pred)

        wordvec = edf.Embed(pred, C2V)
        xt = edf.Reshape(wordvec, [-1, hidden_dim])
        # Generation passes True as the batch-norm flag.
        h, c = LSTMCellNorm(xt, h, c, edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    # A single forward pass evaluates the whole unrolled graph.
    edf.Forward()

    idx = [node.value for node in prediction]
    stop_idx = utils.to_index('}')

    if stop_idx in idx:
        return idx[0:idx.index(stop_idx)+1]
    else:
        return idx

def Eval(data, cnt):
    """Evaluate the current parameters on ``data``.

    Args:
        data: sequence of examples; it is cut into consecutive slices of
            ``batch`` items.
        cnt: normaliser for the perplexity (callers pass the count returned
            next to the data by ``utils.load_data_onechar``).

    Returns:
        (perp, avg_loss): ``exp(total CalPerp / cnt)`` and the mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    nll_total = 0.
    loss_total = 0.

    for start in starts:
        # Pad the slice, load it into the input node and run a forward pass.
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_total += loss.value
        nll_total += CalPerp(score)

    return np.exp(nll_total / cnt), loss_total / len(starts)


############################################### training loop #####################################################

# Fixed partition of the training set into minibatches; only the visiting
# order changes between epochs.
batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# Baseline before any update: validation perplexity/loss and a greedy sample.
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    for k in range(len(minbatches)):

        # One SGD step: pad the batch, rebuild the graph, forward, backward,
        # clip gradients, update.
        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # Save the new best parameters. The `with` block closes (and thereby
        # flushes) the file before a later epoch may read it back.
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # No improvement: decay the learning rate and roll back to the last
        # best checkpoint. If none has been written yet (first epoch did not
        # improve and no file existed), keep the current parameters.
        eta *= decay
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 20.10327 Avg loss = 3.05206
Initial generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 0: Perplexity: 9.49633 Avg loss = 2.28494 [31.834 mins]
Epoch 0: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 1: Perplexity: 9.13977 A

In [17]:
# Re-import edf so that edits to edf.py take effect without restarting the kernel.
try:
    reload                       # Python 2: builtin
except NameError:                # Python 3: the builtin moved to importlib
    from importlib import reload
reload(edf)

<module 'edf' from 'edf.py'>

In [52]:
# Experiment: LSTM with batch normalisation on the gate pre-activations,
# using per-time-step statistics (the cell below calls edf.BatchNormTime).
# Statement order in this block matters: the xavier draws follow
# np.random.seed(0), so reordering them would change the initial weights.
# Width of the character embedding and of the LSTM hidden/cell state.
hidden_dim = 200
# Vocabulary size as reported by utils.
n_vocab = utils.n_vocab
# Number of sequences per minibatch.
batch = 50
# Ordered list of Params that is pickled to / restored from the checkpoint.
parameters = []
# Checkpoint file for this experiment.
model = 'model_LSTM_Norm3.pkl'
# Learning rate handed to edf.SGD.
eta = 0.5
# Factor applied to eta whenever validation loss fails to improve.
decay = 0.9

# Input node; each minibatch is placed here with inp.set(...) before the graph is built.
inp = edf.Value()
np.random.seed(0)

# presumably clears edf's global parameter registry before new Params are created — TODO confirm in edf
edf.params = []
# Character embedding table, one row per vocabulary entry.
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate (weights act on the concatenation [x_t, h], hence 2*hidden_dim rows)
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (passed through Tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# Batch-norm scale and shift, shared by all four gates. These use a constant
# initialisation (gamma = 0.1, beta = 0), not xavier.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))
# Scratch buffer indexed by time step: BuildModel hands timeHold[t], a
# (2, hidden_dim) slice, to every BatchNormTime call of step t, so at most 1000
# steps are supported. It is not part of `parameters` and therefore is not
# saved in the checkpoint.
# presumably the two rows hold a per-step mean and variance — TODO confirm in edf.BatchNormTime
timeHold = np.zeros((1000,2,hidden_dim))




# Output projection from the hidden state to vocabulary scores.
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# The order of this list defines the checkpoint layout; keep it stable.
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


# Resume from an existing checkpoint: the pickle holds one value per entry of
# `parameters`, in the same order.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    
# LSTM step with batch normalisation on the gate pre-activations (the cell state is not normalised).
def LSTMCellNorm(xt, h, c, timeHold, test):
    """Build one LSTM time step in which every gate pre-activation goes through edf.BatchNormTime.

    Args:
        xt: node holding the current input embedding.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        timeHold: slice handed unchanged to edf.BatchNormTime (presumably this step's
            running statistics -- TODO confirm against edf).
        test: node holding the train/test flag forwarded to edf.BatchNormTime.

    Returns:
        (h_next, c_next) nodes for the new hidden and cell state.

    All four gates share the module-level gamma/beta and the same timeHold slice.
    """
    def _gate(weight, bias, activation):
        # Nodes are created in the order ConCat, VDot, BatchNormTime, Add, activation.
        joined = edf.ConCat(xt, h)
        projected = edf.VDot(joined, weight)
        normalised = edf.BatchNormTime(projected, gamma, beta, timeHold, test)
        return activation(edf.Add(normalised, bias))

    forget_gate = _gate(Wf, bf, edf.Sigmoid)
    input_gate = _gate(Wi, bi, edf.Sigmoid)
    output_gate = _gate(Wo, bo, edf.Sigmoid)
    candidate = _gate(Wc, bc, edf.Tanh)

    kept = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(kept, written)
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Unroll the normalised LSTM over the padded batch currently stored in ``inp.value``.

    Resets ``edf.components`` and builds a fresh graph: at step t the token column t is
    embedded, passed through LSTMCellNorm, projected with V and soft-maxed; the log-loss of
    the column t+1 tokens is collected per step.

    Returns:
        (loss, score): ``loss`` is the edf.MeanwithMask node over the per-step log-losses,
        masked where the target token index is 0; ``score`` is the list of per-step
        SoftMax nodes.

    NOTE(review): the flag passed to LSTMCellNorm is always edf.Value(False), including
    when this is called from Eval -- confirm that is the intended normalisation mode.
    """
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []

    for t in range(T - 1):
        current = edf.Value(tokens[:, t])
        xt = edf.Reshape(edf.Embed(current, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))
        target = edf.Value(tokens[:, t + 1])
        logloss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow a (B, t+1) loss matrix one column per step.
        loss = logloss if t == 0 else edf.ConCat(loss, logloss)
        score.append(p)

    # 1 where the target token is non-zero, 0 on padding.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Sum the negative log-probabilities assigned to the true next tokens of the current batch.

    Args:
        score: list with one node per time step; each ``.value`` is a (batch, vocab)
            array of probabilities.

    Returns:
        Total negative log-likelihood over the targets ``inp.value[:, 1:]``, skipping
        positions whose target index is 0 and positions whose probability is exactly 0.
    """
    # Stack to (T, B, V) and reorder to (B, T, V).
    probs = np.stack([node.value for node in score], axis=0).transpose(1, 0, 2)
    n_seq, n_step = probs.shape[0], probs.shape[1]

    targets = inp.value[:, 1:]
    keep = np.zeros((n_seq, n_step), dtype=np.int32)
    keep[targets != 0] = 1

    # Pick, for every (sequence, step), the probability of the target token by
    # indexing into the flattened array: position * vocab_size + token.
    flat_probs = probs.reshape(-1)
    flat_targets = np.int32(targets.reshape(-1))
    n_targets = len(flat_targets)
    stride = len(flat_probs) // n_targets
    chosen = flat_probs[np.int32(np.arange(n_targets) * stride + flat_targets)]
    chosen = chosen.reshape(n_seq, n_step)

    return -np.sum(np.log(chosen[np.nonzero(chosen * keep)]))

def Predict(max_step, prefix):
    """Greedily generate up to ``max_step`` tokens, starting from ``prefix``.

    The whole graph is built first and evaluated with a single edf.Forward(): for the first
    len(prefix) steps the input is the prefix token, afterwards it is the ArgMax node of the
    previous step. LSTMCellNorm is called with edf.Value(True) as its flag and timeHold[t]
    as its statistics slice, so ``max_step`` must not exceed the length of timeHold.

    Args:
        max_step: number of time steps to unroll.
        prefix: sequence of token indices to force-feed (must be non-empty).

    Returns:
        List of the token values fed in at each step, cut after the first '}' token if
        one occurs.
    """
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    fed = []
    n_prefix = len(prefix)

    for t in range(max_step):
        if t < n_prefix:
            pred = edf.Value(prefix[t])
        # Otherwise `pred` is still the ArgMax node produced at the end of the last step.
        fed.append(pred)

        xt = edf.Reshape(edf.Embed(pred, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    edf.Forward()

    tokens = [node.value for node in fed]
    stop = utils.to_index('}')

    if stop in tokens:
        return tokens[:tokens.index(stop) + 1]
    return tokens

def Eval(data, cnt):
    """Run the model forward over ``data`` in mini-batches and report perplexity and loss.

    Args:
        data: sequence of examples, sliced into chunks of ``batch`` items.
        cnt: divisor for the summed negative log-likelihood (presumably the number of
            predicted tokens in ``data`` -- TODO confirm against utils).

    Returns:
        (perp, avg_loss): exp(total CalPerp / cnt) and the unweighted mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    total_nll = 0.
    loss_sum = 0.

    for start in starts:
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_sum += loss.value
        total_nll += CalPerp(score)

    return np.exp(total_nll / cnt), loss_sum / len(starts)


############################################### training loop #####################################################

# Consecutive mini-batches of `batch` sequences; their visiting order is reshuffled every epoch.
batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# initial Perplexity and loss; this is also the baseline a checkpoint has to beat
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime=time()

    # one SGD step (with gradient clipping) per mini-batch
    for k in range(len(minbatches)):

        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # save the model; the context manager flushes and closes the file so that the
        # checkpoint is complete on disk before a later epoch reads it back
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # decay the learning rate and roll back to the last best model
        eta *= decay
        # A checkpoint only exists once some evaluation has beaten the initial loss (or a
        # file was left by an earlier run); without one, keep the current weights.
        # NOTE(review): timeHold is not part of the checkpoint, so it is not rolled back.
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 20.10327 Avg loss = 3.05206
Initial generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 0: Perplexity: 9.49818 Avg loss = 2.28507 [40.724 mins]
Epoch 0: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 1: Perplexity: 9.03764 A

KeyboardInterrupt: 

In [55]:
# Re-import the edf module so that edits to edf.py are picked up without restarting the kernel.
# NOTE(review): bare `reload` is a builtin only on Python 2; Python 3 needs `importlib.reload`.
reload(edf)

<module 'edf' from 'edf.py'>

In [3]:
def fold_data(data):
    """Roughly halve the number of sequences by joining each one with its mirror partner.

    Sequence ``i`` is concatenated with sequence ``len(data) - 1 - i``. When ``data`` has an
    odd number of sequences the middle one has no partner and is kept on its own, so no
    sequence is dropped or duplicated regardless of the Python version's ``/`` semantics.

    Args:
        data: indexable collection of token sequences.

    Returns:
        List of 1-D numpy arrays, ``ceil(len(data) / 2)`` of them.
    """
    n = len(data)
    half = n // 2

    out = [np.array(list(data[i]) + list(data[n - 1 - i])) for i in range(half)]
    if n % 2:
        # Unpaired middle sequence.
        out.append(np.array(list(data[half])))
    return out

In [54]:
# Batch-normalised LSTM ("batch norm from online code") trained on folded data: the
# normalisation node receives a per-time-step slice of `timeHold` (presumably running mean
# and variance -- TODO confirm in edf).
hidden_dim = 200                 # width of the embedding, hidden state and cell state
n_vocab = utils.n_vocab
batch = 50                       # sequences per mini-batch
parameters = []                  # ordered list of Params; its order is the checkpoint layout
model = 'model_LSTM_Norm5.pkl'   # checkpoint path (read below, written by the training loop)
eta = 0.5                        # learning rate handed to edf.SGD
decay = 0.9                      # factor applied to eta when validation loss fails to improve

# Fold each split with fold_data (pairs sequence i with sequence len-1-i).
data_train = fold_data(train_data)
data_valid = fold_data(valid_data)
data_test = fold_data(test_data)

# Input placeholder; filled with a padded batch through inp.set(...) before each graph build.
inp = edf.Value()
# Seed before creating any Param. NOTE(review): presumably edf.xavier draws from NumPy's
# global RNG, which would make the creation order below part of the reproducible state -- confirm.
np.random.seed(0)

edf.params = []
# token embedding table
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (fed through tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# normalisation scale/shift: constant init (gamma = 0.1, beta = 0), not Xavier.
# The same gamma/beta pair is passed to every gate in LSTMCellNorm.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))
# One (2, hidden_dim) slot per time step, indexed as timeHold[t]; sequences longer than
# 1000 steps would index out of range. It is a plain array, not a Param, and is not
# included in `parameters`, so it is neither saved to nor restored from the checkpoint.
timeHold = np.zeros((1000,2,hidden_dim))




# hidden state -> vocabulary projection
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


# load the trained model if it exists; values are assigned positionally, so the file must
# have been written from a `parameters` list in this same order.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted checkpoint files.
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    
# LSTM step with batch normalisation on the gate pre-activations (the cell state is not normalised).
def LSTMCellNorm(xt, h, c, timeHold, test):
    """Build one LSTM time step in which every gate pre-activation goes through edf.BatchNormTime.

    Args:
        xt: node holding the current input embedding.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        timeHold: slice handed unchanged to edf.BatchNormTime (presumably this step's
            running statistics -- TODO confirm against edf).
        test: node holding the train/test flag forwarded to edf.BatchNormTime.

    Returns:
        (h_next, c_next) nodes for the new hidden and cell state.

    All four gates share the module-level gamma/beta and the same timeHold slice.
    """
    def _gate(weight, bias, activation):
        # Nodes are created in the order ConCat, VDot, BatchNormTime, Add, activation.
        joined = edf.ConCat(xt, h)
        projected = edf.VDot(joined, weight)
        normalised = edf.BatchNormTime(projected, gamma, beta, timeHold, test)
        return activation(edf.Add(normalised, bias))

    forget_gate = _gate(Wf, bf, edf.Sigmoid)
    input_gate = _gate(Wi, bi, edf.Sigmoid)
    output_gate = _gate(Wo, bo, edf.Sigmoid)
    candidate = _gate(Wc, bc, edf.Tanh)

    kept = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(kept, written)
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Unroll the normalised LSTM over the padded batch currently stored in ``inp.value``.

    Resets ``edf.components`` and builds a fresh graph: at step t the token column t is
    embedded, passed through LSTMCellNorm, projected with V and soft-maxed; the log-loss of
    the column t+1 tokens is collected per step.

    Returns:
        (loss, score): ``loss`` is the edf.MeanwithMask node over the per-step log-losses,
        masked where the target token index is 0; ``score`` is the list of per-step
        SoftMax nodes.

    NOTE(review): the flag passed to LSTMCellNorm is always edf.Value(False), including
    when this is called from Eval -- confirm that is the intended normalisation mode.
    """
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []

    for t in range(T - 1):
        current = edf.Value(tokens[:, t])
        xt = edf.Reshape(edf.Embed(current, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))
        target = edf.Value(tokens[:, t + 1])
        logloss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow a (B, t+1) loss matrix one column per step.
        loss = logloss if t == 0 else edf.ConCat(loss, logloss)
        score.append(p)

    # 1 where the target token is non-zero, 0 on padding.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Sum the negative log-probabilities assigned to the true next tokens of the current batch.

    Args:
        score: list with one node per time step; each ``.value`` is a (batch, vocab)
            array of probabilities.

    Returns:
        Total negative log-likelihood over the targets ``inp.value[:, 1:]``, skipping
        positions whose target index is 0 and positions whose probability is exactly 0.
    """
    # Stack to (T, B, V) and reorder to (B, T, V).
    probs = np.stack([node.value for node in score], axis=0).transpose(1, 0, 2)
    n_seq, n_step = probs.shape[0], probs.shape[1]

    targets = inp.value[:, 1:]
    keep = np.zeros((n_seq, n_step), dtype=np.int32)
    keep[targets != 0] = 1

    # Pick, for every (sequence, step), the probability of the target token by
    # indexing into the flattened array: position * vocab_size + token.
    flat_probs = probs.reshape(-1)
    flat_targets = np.int32(targets.reshape(-1))
    n_targets = len(flat_targets)
    stride = len(flat_probs) // n_targets
    chosen = flat_probs[np.int32(np.arange(n_targets) * stride + flat_targets)]
    chosen = chosen.reshape(n_seq, n_step)

    return -np.sum(np.log(chosen[np.nonzero(chosen * keep)]))

def Predict(max_step, prefix):
    """Greedily generate up to ``max_step`` tokens, starting from ``prefix``.

    The whole graph is built first and evaluated with a single edf.Forward(): for the first
    len(prefix) steps the input is the prefix token, afterwards it is the ArgMax node of the
    previous step. LSTMCellNorm is called with edf.Value(True) as its flag and timeHold[t]
    as its statistics slice, so ``max_step`` must not exceed the length of timeHold.

    Args:
        max_step: number of time steps to unroll.
        prefix: sequence of token indices to force-feed (must be non-empty).

    Returns:
        List of the token values fed in at each step, cut after the first '}' token if
        one occurs.
    """
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    fed = []
    n_prefix = len(prefix)

    for t in range(max_step):
        if t < n_prefix:
            pred = edf.Value(prefix[t])
        # Otherwise `pred` is still the ArgMax node produced at the end of the last step.
        fed.append(pred)

        xt = edf.Reshape(edf.Embed(pred, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    edf.Forward()

    tokens = [node.value for node in fed]
    stop = utils.to_index('}')

    if stop in tokens:
        return tokens[:tokens.index(stop) + 1]
    return tokens

def Eval(data, cnt):
    """Run the model forward over ``data`` in mini-batches and report perplexity and loss.

    Args:
        data: sequence of examples, sliced into chunks of ``batch`` items.
        cnt: divisor for the summed negative log-likelihood (presumably the number of
            predicted tokens in ``data`` -- TODO confirm against utils).

    Returns:
        (perp, avg_loss): exp(total CalPerp / cnt) and the unweighted mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    total_nll = 0.
    loss_sum = 0.

    for start in starts:
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_sum += loss.value
        total_nll += CalPerp(score)

    return np.exp(total_nll / cnt), loss_sum / len(starts)


############################################### training loop #####################################################

# Consecutive mini-batches of `batch` folded sequences; their visiting order is reshuffled every epoch.
batches = range(0, len(data_train), batch)
minbatches = [data_train[idx:idx+batch] for idx in batches]

epoch = 30

# initial Perplexity and loss; this is also the baseline a checkpoint has to beat
# NOTE(review): vacnt was computed on the unfolded validation set -- confirm it is still the
# right normaliser for the folded data.
perp, loss = Eval(data_valid, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
prefix = 'the agreements bring'
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime=time()

    # one SGD step (with gradient clipping) per mini-batch
    for k in range(len(minbatches)):

        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime)/60.

    perp, loss = Eval(data_valid, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation))

    if loss < best_loss:

        best_loss = loss
        # save the model; the context manager flushes and closes the file so that the
        # checkpoint is complete on disk before a later epoch reads it back
        p_value = [p.value for p in parameters]
        with open(model, 'wb') as f:
            pickle.dump(p_value, f)

    else:

        # decay the learning rate and roll back to the last best model
        eta *= decay
        # A checkpoint only exists once some evaluation has beaten the initial loss (or a
        # file was left by an earlier run); without one, keep the current weights.
        # NOTE(review): timeHold is not part of the checkpoint, so it is not rolled back.
        if os.path.exists(model):
            with open(model, 'rb') as f:
                p_value = pickle.load(f)
            for idx, p in enumerate(p_value):
                parameters[idx].value = p

Initial: Perplexity: 7.08902 Avg loss = 1.96433
Initial generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 0: Perplexity: 8.65026 Avg loss = 2.16442 [38.650 mins]
Epoch 0: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 1: Perplexity: 7.74601 Av

In [4]:
# Batch-normalised LSTM using edf.BatchNormMom ("batch norm from momentum") on folded data;
# the node still receives a per-time-step slice of `timeHold` (presumably running mean and
# variance -- TODO confirm in edf).
hidden_dim = 200                 # width of the embedding, hidden state and cell state
n_vocab = utils.n_vocab
batch = 50                       # sequences per mini-batch
parameters = []                  # ordered list of the Params created below
model = 'model_LSTM_Norm6.pkl'   # NOTE: not read or written anywhere in this cell
eta = 0.5                        # learning rate handed to edf.SGD
decay = 0.9                      # factor applied to eta when validation loss fails to improve

# Fold each split with fold_data (pairs sequence i with sequence len-1-i).
data_train = fold_data(train_data)
data_valid = fold_data(valid_data)
data_test = fold_data(test_data)

# Input placeholder; filled with a padded batch through inp.set(...) before each graph build.
inp = edf.Value()
# Seed before creating any Param. NOTE(review): presumably edf.xavier draws from NumPy's
# global RNG, which would make the creation order below part of the reproducible state -- confirm.
np.random.seed(0)

edf.params = []
# token embedding table
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (fed through tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# normalisation scale/shift: constant init (gamma = 0.1, beta = 0), not Xavier.
# The same gamma/beta pair is passed to every gate in LSTMCellNorm.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))
# One (2, hidden_dim) slot per time step, indexed as timeHold[t]; sequences longer than
# 1000 steps would index out of range. It is a plain array, not a Param.
timeHold = np.zeros((1000,2,hidden_dim))




# hidden state -> vocabulary projection
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


                    
# LSTM step with batch normalisation on the gate pre-activations (the cell state is not normalised).
def LSTMCellNorm(xt, h, c, timeHold, test):
    """Build one LSTM time step in which every gate pre-activation goes through edf.BatchNormMom.

    Args:
        xt: node holding the current input embedding.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        timeHold: slice handed unchanged to edf.BatchNormMom (presumably this step's
            running statistics -- TODO confirm against edf).
        test: node holding the train/test flag forwarded to edf.BatchNormMom.

    Returns:
        (h_next, c_next) nodes for the new hidden and cell state.

    All four gates share the module-level gamma/beta and the same timeHold slice.
    """
    def _gate(weight, bias, activation):
        # Nodes are created in the order ConCat, VDot, BatchNormMom, Add, activation.
        joined = edf.ConCat(xt, h)
        projected = edf.VDot(joined, weight)
        normalised = edf.BatchNormMom(projected, gamma, beta, timeHold, test)
        return activation(edf.Add(normalised, bias))

    forget_gate = _gate(Wf, bf, edf.Sigmoid)
    input_gate = _gate(Wi, bi, edf.Sigmoid)
    output_gate = _gate(Wo, bo, edf.Sigmoid)
    candidate = _gate(Wc, bc, edf.Tanh)

    kept = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(kept, written)
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Unroll the normalised LSTM over the padded batch currently stored in ``inp.value``.

    Resets ``edf.components`` and builds a fresh graph: at step t the token column t is
    embedded, passed through LSTMCellNorm, projected with V and soft-maxed; the log-loss of
    the column t+1 tokens is collected per step.

    Returns:
        (loss, score): ``loss`` is the edf.MeanwithMask node over the per-step log-losses,
        masked where the target token index is 0; ``score`` is the list of per-step
        SoftMax nodes.

    NOTE(review): the flag passed to LSTMCellNorm is always edf.Value(False), including
    when this is called from Eval -- confirm that is the intended normalisation mode.
    """
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []

    for t in range(T - 1):
        current = edf.Value(tokens[:, t])
        xt = edf.Reshape(edf.Embed(current, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))
        target = edf.Value(tokens[:, t + 1])
        logloss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow a (B, t+1) loss matrix one column per step.
        loss = logloss if t == 0 else edf.ConCat(loss, logloss)
        score.append(p)

    # 1 where the target token is non-zero, 0 on padding.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Sum the negative log-probabilities assigned to the true next tokens of the current batch.

    Args:
        score: list with one node per time step; each ``.value`` is a (batch, vocab)
            array of probabilities.

    Returns:
        Total negative log-likelihood over the targets ``inp.value[:, 1:]``, skipping
        positions whose target index is 0 and positions whose probability is exactly 0.
    """
    # Stack to (T, B, V) and reorder to (B, T, V).
    probs = np.stack([node.value for node in score], axis=0).transpose(1, 0, 2)
    n_seq, n_step = probs.shape[0], probs.shape[1]

    targets = inp.value[:, 1:]
    keep = np.zeros((n_seq, n_step), dtype=np.int32)
    keep[targets != 0] = 1

    # Pick, for every (sequence, step), the probability of the target token by
    # indexing into the flattened array: position * vocab_size + token.
    flat_probs = probs.reshape(-1)
    flat_targets = np.int32(targets.reshape(-1))
    n_targets = len(flat_targets)
    stride = len(flat_probs) // n_targets
    chosen = flat_probs[np.int32(np.arange(n_targets) * stride + flat_targets)]
    chosen = chosen.reshape(n_seq, n_step)

    return -np.sum(np.log(chosen[np.nonzero(chosen * keep)]))

def Predict(max_step, prefix):
    """Greedily generate up to ``max_step`` tokens, starting from ``prefix``.

    The whole graph is built first and evaluated with a single edf.Forward(): for the first
    len(prefix) steps the input is the prefix token, afterwards it is the ArgMax node of the
    previous step. LSTMCellNorm is called with edf.Value(True) as its flag and timeHold[t]
    as its statistics slice, so ``max_step`` must not exceed the length of timeHold.

    Args:
        max_step: number of time steps to unroll.
        prefix: sequence of token indices to force-feed (must be non-empty).

    Returns:
        List of the token values fed in at each step, cut after the first '}' token if
        one occurs.
    """
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    fed = []
    n_prefix = len(prefix)

    for t in range(max_step):
        if t < n_prefix:
            pred = edf.Value(prefix[t])
        # Otherwise `pred` is still the ArgMax node produced at the end of the last step.
        fed.append(pred)

        xt = edf.Reshape(edf.Embed(pred, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, timeHold[t], edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    edf.Forward()

    tokens = [node.value for node in fed]
    stop = utils.to_index('}')

    if stop in tokens:
        return tokens[:tokens.index(stop) + 1]
    return tokens

def Eval(data, cnt):
    """Run the model forward over ``data`` in mini-batches and report perplexity and loss.

    Args:
        data: sequence of examples, sliced into chunks of ``batch`` items.
        cnt: divisor for the summed negative log-likelihood (presumably the number of
            predicted tokens in ``data`` -- TODO confirm against utils).

    Returns:
        (perp, avg_loss): exp(total CalPerp / cnt) and the unweighted mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    total_nll = 0.
    loss_sum = 0.

    for start in starts:
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_sum += loss.value
        total_nll += CalPerp(score)

    return np.exp(total_nll / cnt), loss_sum / len(starts)


############################################### training loop #####################################################

# Consecutive mini-batches of `batch` folded sequences; the visiting order is reshuffled per epoch.
batches = range(0, len(data_train), batch)
minbatches = [data_train[idx:idx + batch] for idx in batches]

epoch = 30
prefix = 'the agreements bring'

# Baseline validation numbers and a sample before any update.
perp, loss = Eval(data_valid, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print(utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    # One clipped SGD step per mini-batch, visited in shuffled order.
    for k, batch_id in enumerate(perm):
        minbatch = minbatches[batch_id]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime) / 60.

    perp, loss = Eval(data_valid, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # Sample some text from the current weights.
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print(utils.to_string(generation))

    # This cell keeps no checkpoint: an improvement only updates the running best, and a
    # non-improvement only decays the learning rate (weights are not rolled back).
    if loss < best_loss:
        best_loss = loss
    else:
        eta *= decay


Initial: Perplexity: 50.17347 Avg loss = 3.93221
Initial generated sentence 
the agreements bringwx*//655.77#*b//655.77#*b//655.77#*b//655.77#*b//655.77#*b//655.77#*b//65e   *bff*b//655.77}
Epoch 0: Perplexity: 9.30988 Avg loss = 2.24059 [35.608 mins]
Epoch 0: generated sentence 
the agreements bring the the the the the the the the the the the the the the the the the ther and the ther and the ther and the ther and the ther and the ther the the the the the the the the the the the the the the the the ther the the the ther and the ther the the ther ther ther ther ther ther ther ther ther ther ther ther ther ther ther therare thar the ther ther fore thare thar ther fore fofint
Epoch 1: Perplexity: 8.07657 Avg loss = 2.09762 [40.758 mins]


  self.value = 1. / (1. + np.exp(-self.x.value))
  x_neg_exp = np.exp(-self.x.value)
  self.value = (x_exp - x_neg_exp)/(x_exp + x_neg_exp)


Epoch 1: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 2: Perplexity: 6.71031 Avg loss = 1.91137 [38.617 mins]
Epoch 2: generated sentence 
the agreements bring@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Epoch 3: Perplexity: 5.58694 Avg loss = 1.72640 [41.392 mins]
Epoch 3: generat

  x_exp = np.exp(self.x.value)





KeyboardInterrupt: 

In [5]:
# Re-import the edf module so that edits to edf.py are picked up without restarting the kernel.
# NOTE(review): bare `reload` is a builtin only on Python 2; Python 3 needs `importlib.reload`.
reload(edf)

<module 'edf' from 'edf.py'>

In [7]:
# Batch-normalised LSTM using edf.BatchNormMomNT on folded data; unlike the earlier cells
# no per-time-step statistics array is created or passed to the normalisation node.
hidden_dim = 200                 # width of the embedding, hidden state and cell state
n_vocab = utils.n_vocab
batch = 50                       # sequences per mini-batch
parameters = []                  # ordered list of the Params created below
model = 'model_LSTM_Norm7.pkl'   # NOTE: not read or written anywhere in this cell
eta = 0.5                        # learning rate handed to edf.SGD
decay = 0.9                      # factor applied to eta when validation loss fails to improve

# Fold each split with fold_data (pairs sequence i with sequence len-1-i).
data_train = fold_data(train_data)
data_valid = fold_data(valid_data)
data_test = fold_data(test_data)

# Input placeholder; filled with a padded batch through inp.set(...) before each graph build.
inp = edf.Value()
# Seed before creating any Param. NOTE(review): presumably edf.xavier draws from NumPy's
# global RNG, which would make the creation order below part of the reproducible state -- confirm.
np.random.seed(0)

edf.params = []
# token embedding table
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

# forget gate
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# candidate cell value (fed through tanh in LSTMCellNorm)
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output gate
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# normalisation scale/shift: constant init (gamma = 0.1, beta = 0), not Xavier.
# The same gamma/beta pair is passed to every gate in LSTMCellNorm.
gamma = edf.Param(np.ones((hidden_dim))/10.0)
beta = edf.Param(np.zeros((hidden_dim)))





# hidden state -> vocabulary projection
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V, gamma, beta])


                    
# LSTM step with batch normalisation on the gate pre-activations (the cell state is not normalised).
def LSTMCellNorm(xt, h, c, test):
    """Build one LSTM time step in which every gate pre-activation goes through edf.BatchNormMomNT.

    Args:
        xt: node holding the current input embedding.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.
        test: node holding the train/test flag forwarded to edf.BatchNormMomNT.

    Returns:
        (h_next, c_next) nodes for the new hidden and cell state.

    All four gates share the module-level gamma/beta.
    """
    def _gate(weight, bias, activation):
        # Nodes are created in the order ConCat, VDot, BatchNormMomNT, Add, activation.
        joined = edf.ConCat(xt, h)
        projected = edf.VDot(joined, weight)
        normalised = edf.BatchNormMomNT(projected, gamma, beta, test)
        return activation(edf.Add(normalised, bias))

    forget_gate = _gate(Wf, bf, edf.Sigmoid)
    input_gate = _gate(Wi, bi, edf.Sigmoid)
    output_gate = _gate(Wo, bo, edf.Sigmoid)
    candidate = _gate(Wc, bc, edf.Tanh)

    kept = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(kept, written)
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next


def BuildModel():
    """Unroll the normalised LSTM over the padded batch currently stored in ``inp.value``.

    Resets ``edf.components`` and builds a fresh graph: at step t the token column t is
    embedded, passed through LSTMCellNorm, projected with V and soft-maxed; the log-loss of
    the column t+1 tokens is collected per step.

    Returns:
        (loss, score): ``loss`` is the edf.MeanwithMask node over the per-step log-losses,
        masked where the target token index is 0; ``score`` is the list of per-step
        SoftMax nodes.

    NOTE(review): the flag passed to LSTMCellNorm is always edf.Value(False), including
    when this is called from Eval -- confirm that is the intended normalisation mode.
    """
    edf.components = []

    tokens = inp.value
    B, T = tokens.shape[0], tokens.shape[1]
    h = edf.Value(np.zeros((B, hidden_dim)))
    c = edf.Value(np.zeros((B, hidden_dim)))

    score = []

    for t in range(T - 1):
        current = edf.Value(tokens[:, t])
        xt = edf.Reshape(edf.Embed(current, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, edf.Value(False))
        p = edf.SoftMax(edf.VDot(h, V))
        target = edf.Value(tokens[:, t + 1])
        logloss = edf.Reshape(edf.LogLoss(edf.Aref(p, target)), (B, 1))

        # Grow a (B, t+1) loss matrix one column per step.
        loss = logloss if t == 0 else edf.ConCat(loss, logloss)
        score.append(p)

    # 1 where the target token is non-zero, 0 on padding.
    masks = (tokens[:, 1:] != 0).astype(np.int32)
    loss = edf.MeanwithMask(loss, edf.Value(masks))

    return loss, score
    
    
def CalPerp(score):
    """Sum the negative log-probabilities assigned to the true next tokens of the current batch.

    Args:
        score: list with one node per time step; each ``.value`` is a (batch, vocab)
            array of probabilities.

    Returns:
        Total negative log-likelihood over the targets ``inp.value[:, 1:]``, skipping
        positions whose target index is 0 and positions whose probability is exactly 0.
    """
    # Stack to (T, B, V) and reorder to (B, T, V).
    probs = np.stack([node.value for node in score], axis=0).transpose(1, 0, 2)
    n_seq, n_step = probs.shape[0], probs.shape[1]

    targets = inp.value[:, 1:]
    keep = np.zeros((n_seq, n_step), dtype=np.int32)
    keep[targets != 0] = 1

    # Pick, for every (sequence, step), the probability of the target token by
    # indexing into the flattened array: position * vocab_size + token.
    flat_probs = probs.reshape(-1)
    flat_targets = np.int32(targets.reshape(-1))
    n_targets = len(flat_targets)
    stride = len(flat_probs) // n_targets
    chosen = flat_probs[np.int32(np.arange(n_targets) * stride + flat_targets)]
    chosen = chosen.reshape(n_seq, n_step)

    return -np.sum(np.log(chosen[np.nonzero(chosen * keep)]))

def Predict(max_step, prefix):
    """Greedily generate up to ``max_step`` tokens, starting from ``prefix``.

    The whole graph is built first and evaluated with a single edf.Forward(): for the first
    len(prefix) steps the input is the prefix token, afterwards it is the ArgMax node of the
    previous step. LSTMCellNorm is called with edf.Value(True) as its flag.

    Args:
        max_step: number of time steps to unroll.
        prefix: sequence of token indices to force-feed (must be non-empty).

    Returns:
        List of the token values fed in at each step, cut after the first '}' token if
        one occurs.
    """
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    fed = []
    n_prefix = len(prefix)

    for t in range(max_step):
        if t < n_prefix:
            pred = edf.Value(prefix[t])
        # Otherwise `pred` is still the ArgMax node produced at the end of the last step.
        fed.append(pred)

        xt = edf.Reshape(edf.Embed(pred, C2V), [-1, hidden_dim])
        h, c = LSTMCellNorm(xt, h, c, edf.Value(True))
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    edf.Forward()

    tokens = [node.value for node in fed]
    stop = utils.to_index('}')

    if stop in tokens:
        return tokens[:tokens.index(stop) + 1]
    return tokens

def Eval(data, cnt):
    """Run the model forward over ``data`` in mini-batches and report perplexity and loss.

    Args:
        data: sequence of examples, sliced into chunks of ``batch`` items.
        cnt: divisor for the summed negative log-likelihood (presumably the number of
            predicted tokens in ``data`` -- TODO confirm against utils).

    Returns:
        (perp, avg_loss): exp(total CalPerp / cnt) and the unweighted mean of the
        per-batch loss values.
    """
    starts = range(0, len(data), batch)
    total_nll = 0.
    loss_sum = 0.

    for start in starts:
        inp.set(utils.make_mask(data[start:start + batch]))
        loss, score = BuildModel()
        edf.Forward()
        loss_sum += loss.value
        total_nll += CalPerp(score)

    return np.exp(total_nll / cnt), loss_sum / len(starts)


############################################### training loop #####################################################

# Consecutive mini-batches of `batch` folded sequences; the visiting order is reshuffled per epoch.
batches = range(0, len(data_train), batch)
minbatches = [data_train[idx:idx + batch] for idx in batches]

epoch = 30
prefix = 'the agreements bring'

# Baseline validation numbers and a sample before any update.
perp, loss = Eval(data_valid, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
best_loss = loss
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print(utils.to_string(generation))


for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist()
    stime = time()

    # One clipped SGD step per mini-batch, visited in shuffled order.
    for k, batch_id in enumerate(perm):
        minbatch = minbatches[batch_id]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)

    duration = (time() - stime) / 60.

    perp, loss = Eval(data_valid, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

    # Sample some text from the current weights.
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print(utils.to_string(generation))

    # This cell keeps no checkpoint: an improvement only updates the running best, and a
    # non-improvement only decays the learning rate (weights are not rolled back).
    if loss < best_loss:
        best_loss = loss
    else:
        eta *= decay


Initial: Perplexity: 50.17347 Avg loss = 3.93221
Initial generated sentence 
the agreements bringwx0   @@>dmal77#*b8ffdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3fdx<p3
Epoch 0: Perplexity: 9.30988 Avg loss = 2.24059 [35.878 mins]
Epoch 0: generated sentence 
the agreements bringtattitittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotittotitto
Epoch 1: Perplexity: 8.07657 A

In [10]:
def LSTMCell(xt, h, c):
    """Build one plain (un-normalised) LSTM time step.

    Args:
        xt: node holding the current input embedding.
        h: node holding the previous hidden state.
        c: node holding the previous cell state.

    Returns:
        (h_next, c_next) nodes for the new hidden and cell state.
    """
    def _gate(weight, bias, activation):
        # Nodes are created in the order ConCat, VDot, Add, activation.
        joined = edf.ConCat(xt, h)
        projected = edf.VDot(joined, weight)
        return activation(edf.Add(projected, bias))

    forget_gate = _gate(Wf, bf, edf.Sigmoid)
    input_gate = _gate(Wi, bi, edf.Sigmoid)
    output_gate = _gate(Wo, bo, edf.Sigmoid)
    candidate = _gate(Wc, bc, edf.Tanh)

    kept = edf.Mul(forget_gate, c)
    written = edf.Mul(input_gate, candidate)
    c_next = edf.Add(kept, written)
    h_next = edf.Mul(output_gate, edf.Tanh(c_next))

    return h_next, c_next

def PredictNew(max_step, prefix):
    """Greedily generate up to ``max_step`` tokens from ``prefix`` using the plain LSTMCell.

    The whole graph is built first and evaluated with a single edf.Forward(): for the first
    len(prefix) steps the input is the prefix token, afterwards it is the ArgMax node of the
    previous step.

    Args:
        max_step: number of time steps to unroll.
        prefix: sequence of token indices to force-feed (must be non-empty).

    Returns:
        List of the token values fed in at each step, cut after the first '}' token if
        one occurs.
    """
    edf.components = []

    h = edf.Value(np.zeros((1, hidden_dim)))
    c = edf.Value(np.zeros((1, hidden_dim)))

    fed = []
    n_prefix = len(prefix)

    for t in range(max_step):
        if t < n_prefix:
            pred = edf.Value(prefix[t])
        # Otherwise `pred` is still the ArgMax node produced at the end of the last step.
        fed.append(pred)

        xt = edf.Reshape(edf.Embed(pred, C2V), [-1, hidden_dim])
        h, c = LSTMCell(xt, h, c)
        p = edf.SoftMax(edf.VDot(h, V))
        pred = edf.ArgMax(p)

    edf.Forward()

    tokens = [node.value for node in fed]
    stop = utils.to_index('}')

    if stop in tokens:
        return tokens[:tokens.index(stop) + 1]
    return tokens

# Sample text from the plain LSTMCell using whatever parameter values are currently in memory.
# NOTE(review): if those values come from a batch-normalised training run, this cell applies
# them without the normalisation step -- confirm that is intended.
prefix = 'the agreements bring'
prefix_ids = utils.to_idxs(prefix)
generation = PredictNew(400, prefix_ids)
print("Generated sentence ")
print(utils.to_string(generation))

Generated sentence 
the agreements bringoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervoververvorvervov
