# Chinese Poetry Genarator - V1 -With RNN

In [1]:
import sys
sys.path.append("../")
sys.path.append("../../")

In [2]:
import jieba as jb
import numpy as np
# from rnn_utils import *
from nlp_utils import load_char_embeddings, load_corpus

In [45]:
def clip(gradients, maxValue):
    '''
    Clips the gradients' values between minimum and maximum.
    
    Arguments:
    gradients -- a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- everything above this number is set to this number, and everything less than -maxValue is set to -maxValue
    
    Returns: 
    gradients -- a dictionary with the clipped gradients.
    '''
    
    dWaa, dWax, dWya, db, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['db'], gradients['dby']
   
    ### START CODE HERE ###
    # clip to mitigate exploding gradients, loop over [dWax, dWaa, dWya, db, dby]. (≈2 lines)
    for gradient in [dWax, dWaa, dWya, db, dby]:
        np.clip(gradient,-maxValue,maxValue,out = gradient)
    ### END CODE HERE ###
    
    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "db": db, "dby": dby}
    
    return gradients


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def smooth(loss, cur_loss):
    return loss * 0.999 + cur_loss * 0.001

def print_sample(sample_ix, ix_to_char):
    txt = ''.join(ix_to_char[str(ix)] for ix in  sample_ix)
    # txt = txt[0].upper() + txt[1:]  # capitalize first character 
    print ('%s' % (txt, ), end='')

def get_initial_loss(vocab_size, seq_length):
    return -np.log(1.0/vocab_size)*seq_length


def initialize_parameters(n_a, n_x, n_y):
    """
    Initialize parameters with small random values
    
    Returns:
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    """
    np.random.seed(1)
    Wax = np.random.randn(n_a, n_x)*0.01 # input to hidden
    Waa = np.random.randn(n_a, n_a)*0.01 # hidden to hidden
    Wya = np.random.randn(n_y, n_a)*0.01 # hidden to output
    b = np.zeros((n_a, 1)) # hidden bias
    by = np.zeros((n_y, 1)) # output bias
    
    parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b,"by": by}
    
    return parameters

def rnn_step_forward(parameters, a_prev, x):
    
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    a_next = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b) # hidden state
    p_t = softmax(np.dot(Wya, a_next) + by) # unnormalized log probabilities for next chars # probabilities for next chars 
    
    return a_next, p_t

def rnn_step_backward_old(dy, gradients, parameters, x, a, a_prev):
    
#     print("dy",dy.shape)
#     print("dby",gradients['dby'].shape)
   
#     print("dWax",gradients['dWax'].shape) 
#     print("dWaa",gradients['dWaa'].shape)
#     print("da_next",gradients['da_next'].shape)
    
    m = x.shape[1]
    gradients['dWya'] += 1/m*np.sum(np.dot(dy, a.T), axis=1, keepdims=True)
    gradients['dby'] += 1/m*np.sum(dy, axis=1, keepdims=True)
    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next'] # backprop into h
    daraw = (1 - a * a) * da # backprop through tanh nonlinearity
#     print("daraw", daraw.shape)
    gradients['db'] += 1/m*np.sum(daraw, axis=1, keepdims=True)
#     print("db",gradients['db'].shape)
    gradients['dWax'] += 1/m*np.sum(np.dot(daraw, x.T), axis=1, keepdims=True)
    gradients['dWaa'] += 1/m*np.sum(np.dot(daraw, a_prev.T), axis=1, keepdims=True)
    gradients['da_next'] = np.dot(parameters['Waa'].T, daraw)
    
#     print("da",da.shape)
    
    return gradients

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev, beta=0.99):
    
#     print("dy",dy.shape)
#     print("dby",gradients['dby'].shape)
   
#     print("dWax",gradients['dWax'].shape) 
#     print("dWaa",gradients['dWaa'].shape)
#     print("da_next",gradients['da_next'].shape)
    
    m = x.shape[1]
    gradients['dWya'] += beta*gradients['dWya'] + (1- beta) * 1/m*np.sum(np.dot(dy, a.T), axis=1, keepdims=True)
    gradients['dby'] += beta *gradients['dby'] +(1-beta) * 1/m*np.sum(dy, axis=1, keepdims=True)
    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next'] # backprop into h
    daraw = (1 - a * a) * da # backprop through tanh nonlinearity
#     print("daraw", daraw.shape)
    gradients['db'] += beta *gradients['db'] +(1-beta) * 1/m*np.sum(daraw, axis=1, keepdims=True)
#     print("db",gradients['db'].shape)
    gradients['dWax'] += beta * gradients['dWax']+(1-beta) * 1/m*np.sum(np.dot(daraw, x.T), axis=1, keepdims=True)
    gradients['dWaa'] += beta * gradients['dWaa']+(1-beta) * 1/m*np.sum(np.dot(daraw, a_prev.T), axis=1, keepdims=True)
    gradients['da_next'] = beta *  gradients['da_next']+(1-beta) * np.dot(parameters['Waa'].T, daraw)
    
#     print("da",da.shape)
    
    return gradients

def update_parameters(parameters, gradients, lr):

    parameters['Wax'] += -lr * gradients['dWax']
    parameters['Waa'] += -lr * gradients['dWaa']
    parameters['Wya'] += -lr * gradients['dWya']
    parameters['b']  += -lr * gradients['db']
    parameters['by']  += -lr * gradients['dby']
    return parameters

def rnn_forward(X, Y, a0, parameters, vocab_size = 27):
    
    m = X[0].shape[1]
    # Initialize x, a and y_hat as empty dictionaries
    
    a, y_hat = {}, {}
    
    a[-1] = np.copy(a0)
    
    # initialize your loss to 0
    loss = 0
    
    for t in range(len(X)):
        
        # Set x[t] to be the one-hot vector representation of the t'th character in X.
        # if X[t] == None, we just have x[t]=0. This is used to set the input for the first timestep to the zero vector. 
        # x[t] = np.zeros((vocab_size,1)) 
        # if (X[t] != None):
        #     x[t][X[t]] = 1
        
        # Run one step forward of the RNN
        a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], X[t])
        
        # Update the loss by substracting the cross-entropy term of this time-step from it.
        dloss = 0
        
        for i in range(m):
            dloss += 1/m* np.log(y_hat[t][Y[t][i],0])
        loss -= dloss
        
    cache = (y_hat, a, X)
        
    return loss, cache

def rnn_backward(X, Y, parameters, cache):
    # Initialize gradients as an empty dictionary
    gradients = {}
    
    # Retrieve from cache and parameters
    (y_hat, a, x) = cache
    m = x[0].shape[1]
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    
    # each one should be initialized to zeros of the same dimension as its corresponding parameter
    gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
    gradients['db'], gradients['dby'] = np.zeros_like(b), np.zeros_like(by)
    gradients['da_next'] = np.zeros_like(a[0])
    
    ### START CODE HERE ###
    # Backpropagate through time
    for t in reversed(range(len(X))):
        dy = np.copy(y_hat[t])
        for i in range(m):
             dy[Y[t][i]][i] -= 1
        
       
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])
    ### END CODE HERE ###
    
    return gradients, a

In [31]:
char_to_index, index_to_char, index_to_vec = load_char_embeddings("../../data/embedding/sogou/char_embedding_plus_punctuation")

In [32]:
vocab_len = len(char_to_index) + 2
emb_dim = index_to_vec['0'].shape
print("vocab_len = ", vocab_len, "emb_dim = ", emb_dim)

vocab_len =  6902 emb_dim =  (300,)


In [33]:
# embedding fitures are between -1.685101 1.671867
# append EOS and UNK to embedding matrixz
np.random.seed(ord('E'))
char_to_index['<EOS>'] = 6900
index_to_char['6900'] = '<EOS>'
index_to_vec['6900'] = np.random.rand(300,)

np.random.seed(ord('U'))
char_to_index['<UNK>'] = 6901
index_to_char['6901'] = '<UNK>'
index_to_vec['6901'] = np.random.rand(300,)

#define /n as EOS
char_to_index['\n'] = 6900

In [34]:
len(index_to_vec)

6902

In [35]:
tangshis = np.load("../../data/chinese-poetry-master/tangshi_v1_5yan.npy")

In [36]:
tangshis[0]

'秦川雄帝宅，函谷壮皇居。'

In [37]:
# dataset= tangshis
# index = 5
# batch_size = 4
# mini_batch = []
# Y = []
# for c in range(12):
#     char_batch = []
#     char_batch_ix = []
#     for i in range(batch_size): 
#         ix = char_to_index['<EOS>'] #初始化而已
#         try:
#             ix = char_to_index[dataset[index+i][c]]
#         except KeyError:
#             ix = char_to_index['<UNK>'] 
#         char_batch_ix.append(ix)
#         char_batch.append(index_to_vec[str(ix)])
#     Y.append(char_batch_ix)
#     mini_batch.append(char_batch)
# Y.append([char_to_index['<EOS>']]*4)
# Y

In [38]:
def save_params(parameters):
    np.save("saves/parameters_"+str(parameters['endpoint'])+'.npy', parameters)

In [39]:
def sample(parameters, char_to_ix, ix_to_char, ix_to_vec, seed, fixed_chars = None, padding = False ):
    # Retrieve parameters and relevant shapes from "parameters" dictionary
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]
    
    ### START CODE HERE ###
    # Step 1: Create the one-hot vector x for the first character (initializing the sequence generation). (≈1 line)
    x = np.zeros((300,1))
    # Step 1': Initialize a_prev as zeros (≈1 line)
    a_prev = np.zeros((n_a,1))
    
    # Create an empty list of indices, this is the list which will contain the list of indices of the characters to generate (≈1 line)
    indices = []
    
    # Idx is a flag to detect a newline character, we initialize it to -1
    idx = -1
    
    # Loop over time-steps t. At each time-step, sample a character from a probability distribution and append 
    # its index to "indices". We'll stop if we reach 50 characters (which should be very unlikely with a well 
    # trained model), which helps debugxging and prevents entering an infinite loop. 
    counter = 0
    newline_character = char_to_ix['\n']
    
    while (idx != newline_character and counter != 12):
       
        # Step 2: Forward propagate x using the equations (1), (2) and (3)
        
        a = np.tanh(np.dot( Wax, x) + np.dot(Waa, a_prev) + b)
        z = np.dot(Wya, a) + by
        y = softmax(z)
        
        
        # for grading purposes
        np.random.seed(counter+seed) 
        # Step 3: Sample the index of a character within the vocabulary from the probability distribution y
        if fixed_chars!=None and counter<len(fixed_chars):
            idx = char_to_ix[fixed_chars[counter]]
        else:
            while True:
                idx = np.random.choice(range(vocab_size),p = y.ravel())
                if not padding or idx != char_to_index["<EOS>"]:

                    break
            
        # Append the index to "indices"
        indices.append(idx)
        
        # Step 4: Overwrite the input character as the one corresponding to the sampled index.
        x = index_to_vec[str(idx)].reshape(300,1)
        
        # Update "a_prev" to be "a"
        a_prev = a
        
        # for grading purposes
        seed += 1
        counter +=1
        
    ### END CODE HERE ###

    if (counter == 50):
        indices.append(char_to_ix['\n'])
    
    return indices

In [40]:
def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    # Forward propagate through time (≈1 line)
    loss, cache = rnn_forward(X, Y, a_prev, parameters)
    
    # Backpropagate through time (≈1 line)
    gradients, a = rnn_backward(X, Y, parameters, cache)
    
    # Clip your gradients between -5 (min) and 5 (max) (≈1 line)
    gradients = clip(gradients, 5)
    
    # Update parameters (≈1 line)
    parameters = update_parameters(parameters, gradients, learning_rate)
    
    return loss, gradients, a[len(X)-1]
    

In [76]:
def model(dataset,index_to_char, char_to_index, index_to_vec, num_iterations = 120000, n_a = 64,gen_samples = 5,parameters = None, batch_size = 64,learning_rate= 0.01):
    # Retrieve n_x and n_y from vocab_size
    n_x, n_y = 300, len(char_to_index)
    
    # Initialize parameters
    if not parameters:
        parameters = initialize_parameters(n_a, n_x, n_y)
        parameters['endpoint'] = 0
        parameters['time'] = []
    
    # Initialize loss (this is required because we want to smooth our loss, don't worry about it)
    # loss = get_initial_loss(vocab_size, dino_names)
    
    # Build list of all dinosaur names (training examples).
    
    # Shuffle list of all dinosaur names
    np.random.seed(0)
    np.random.shuffle(dataset)
    
    # Initialize the hidden state of your LSTM
    a_prev = np.zeros((n_a, batch_size))
    j = 0
    import time
    time_start=time.time()
    epoch_size = len(dataset)
    try:
    # Optimization loop
        for j in range(parameters['endpoint'],parameters['endpoint'] + num_iterations):

            ### START CODE HERE ###

            # Use the hint above to define one training example (X,Y) (≈ 2 lines)
            index = j*batch_size%len(dataset)
            
            
            

            X = []
            Y = []
            
            # 讲汉字转换为向量输入和索引输出，并处理未知字符。生成输入矩阵
            X.append(np.zeros((300,batch_size)))
            for c in range(12):
                char_batch = []
                char_batch_ix = []
                for i in range(batch_size): 
                    ix = char_to_index['<EOS>'] #初始化而已
                    try:
                        ix = char_to_index[dataset[index+i][c]]
                    except KeyError:
                        ix = char_to_index['<UNK>'] 
                        
                    char_batch_ix.append(ix)
                    char_batch.append(index_to_vec[str(ix)])
                    
                X.append(np.array(char_batch).T)
                Y.append(char_batch_ix)
            
            Y.append([char_to_index["<EOS>"]]*batch_size)

            

            # Perform one optimization step: Forward-prop -> Backward-prop -> Clip -> Update parameters
            # Choose a learning rate of 0.01
            loss, gradients, a_prev = optimize(X, Y, a_prev, parameters, learning_rate = learning_rate)

            # Use a latency trick to keep the loss smooth. It happens here to accelerate the training.
            # loss = smooth(loss, curr_loss)

            # Every 2000 Iteration, generate "n" characters thanks to sample() to check if the model is learning properly
            
            print("进度:{}/{} epoches".format(j*batch_size,epoch_size), end="\r")
             
            if j*batch_size % 512 == 0:
                print("\n耗时",round(time.time()-time_start,3),"s")
                print('Iteration: %d, Loss: %7f' % (j*batch_size, loss) + '\n')

                # The number of dinosaur names to print
                seed = 0
                for s in range(gen_samples):
                    
                    # Sample indices and print them
                    sampled_indices = sample(parameters, char_to_index, index_to_char, index_to_vec, seed)
                    print_sample(sampled_indices, index_to_char)
                    print('')
                    seed += 1  # To get the same result for grading purposed, increment the seed by one. 

                print('\n')
    except KeyboardInterrupt:
        pass
    finally:
        
        parameters['endpoint'] = j
        time_end=time.time()
        parameters['time'].append(round(time_end-time_start,3))
        save_params(parameters)
         
    return parameters

In [79]:
 parameters= np.load("saves/parameters_190454.npy")[()]

In [74]:
# parameters['time'] 

In [80]:
parameters=model(tangshis, index_to_char, char_to_index, index_to_vec, num_iterations = 10000,batch_size=512,parameters=parameters, learning_rate=0.05)

进度:97512448/154162 epoches
耗时 1.859 s
Iteration: 97512448, Loss: 75.148441

兴住朽旌浑，因今小帝仙。
旧兵无不在，浩荡无人地。
昨烛尘蓬潞，落金烟隔深。
形应日为者，徒此从不终。
<EOS>


进度:97512960/154162 epoches
耗时 3.815 s
Iteration: 97512960, Loss: 75.225317

兴住忻夷，疾去平生。<EOS>
旧兵无不在，浩荡长成分。
阿勘悲幽揫，乡去皆归期。
形自后为此，闲作不有疑。
<EOS>


进度:97513472/154162 epoches
耗时 5.659 s
Iteration: 97513472, Loss: 74.049158

兴海嘶壑，启所拟问。<EOS>
旧兵无不在，浩荡无人地。
昨侍悲幽舜，何由寺田谁。
形自后为此，闲作不有疑。
<EOS>


进度:97513984/154162 epoches
耗时 7.79 s
Iteration: 97513984, Loss: 74.331152

兴住忻袍绮，应叶带池萍。
旧徐如有一，虚隐受地三。
阿敛辨鹿瑟，断电遍烟园。
形自后为老，闲生不有语。
<EOS>


进度:97514496/154162 epoches
耗时 10.193 s
Iteration: 97514496, Loss: 76.141449

兴投谅钓宦，所合共逢妆。
旧兵无不在，始验作分名。
阿甸卿衔涩，复自逢云药。
形自他为此，闲作不有疑。
<EOS>


进度:97515008/154162 epoches
耗时 12.264 s
Iteration: 97515008, Loss: 76.183639

兴拟诏卒谬，所招近汉仁。
旧兵无不在，始欢发行时。
昨贻辞雁浦，似此知载山。
形五日将图，闲方不对营。
<EOS>


进度:97515520/154162 epoches
耗时 14.368 s
Iteration: 97515520, Loss: 76.763320

兴德夙蚌槿，几马只何言。
旧钟近月在，尝晓自我多。
阿鲤尘枕隐，彩景雪额节。
形自就一均，倚岛月中落。
<EOS>


进度:97516032/154162 epoches
耗时 16.756 s
I

In [34]:
save_params(parameters)

In [39]:
# 生成 诗句
seed = 330
for name in range(5):
                
    # Sample indices and print them
    sampled_indices = sample(parameters, char_to_index, index_to_char, index_to_vec, seed, fixed_chars="", padding = False)
    print_sample(sampled_indices, index_to_char)
    seed += 1  # To get the same result for grading purposed, increment the seed by one. 
      
    print('\n')

春冀近推好，日自何由流。

曙星。<EOS>

闰来心名六三卷浮。<EOS>

青素。<EOS>

此道上华日，翻管许闲漫。

