In [58]:
import sys
# Add the parent and grandparent directories (relative to the current working
# directory) to the module search path.
# NOTE(review): presumably this is what makes the project's helper modules
# importable; no such import is visible in this part of the notebook.
sys.path.append("../")
sys.path.append("../../")

In [59]:
import numpy as np

In [None]:
def clip(gradients, maxValue):
    """
    Clamp every gradient entry into the closed interval [-maxValue, maxValue].

    The arrays stored in `gradients` are modified in place (np.clip writes its
    result back into the array it was given).

    Arguments:
    gradients -- dictionary holding the arrays "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- clipping threshold; values above it become maxValue and values
                below -maxValue become -maxValue

    Returns:
    gradients -- a new dictionary containing exactly those five keys, each
                 referring to the same (now clipped) array object
    """
    keys = ('dWaa', 'dWax', 'dWya', 'db', 'dby')

    # Look up all five entries before touching anything, so a missing key
    # raises before any array has been modified.
    found = {key: gradients[key] for key in keys}

    # Clipping limits exploding gradients.
    for key in ('dWax', 'dWaa', 'dWya', 'db', 'dby'):
        np.clip(found[key], -maxValue, maxValue, out=found[key])

    return found


def softmax(x):
    """
    Softmax of `x`, normalised along axis 0.

    The global maximum of `x` is subtracted before exponentiating; this keeps
    np.exp from overflowing and does not change the result.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    denom = exps.sum(axis=0)
    return exps / denom

def smooth(loss, cur_loss):
    """Blend the running loss with the current loss (weights 0.999 and 0.001)."""
    keep_weight, new_weight = 0.999, 0.001
    return loss * keep_weight + cur_loss * new_weight

def print_sample(sample_ix, ix_to_char):
    """
    Print the characters for a sequence of indices, without a trailing newline.

    Arguments:
    sample_ix -- iterable of indices
    ix_to_char -- mapping whose keys are the indices converted with str()
    """
    chars = [ix_to_char[str(ix)] for ix in sample_ix]
    print(''.join(chars), end='')

def get_initial_loss(vocab_size, seq_length):
    """
    Cross-entropy of a uniform prediction over `vocab_size` symbols,
    multiplied by `seq_length`.
    """
    uniform_prob = 1.0 / vocab_size
    return -np.log(uniform_prob) * seq_length


def initialize_parameters(n_a, n_x, n_y):
    """
    Build the parameter dictionary: small random weights and zero biases.

    The global NumPy random generator is re-seeded with 1 on every call, so
    the returned weights are the same for the same sizes.

    Returns:
    parameters -- python dictionary containing:
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        b --  hidden bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)
    """
    np.random.seed(1)
    scale = 0.01

    # The draw order (Wax, Waa, Wya) determines the values, so it is kept fixed.
    weight_shapes = (
        ("Wax", (n_a, n_x)),
        ("Waa", (n_a, n_a)),
        ("Wya", (n_y, n_a)),
    )
    parameters = {}
    for name, shape in weight_shapes:
        parameters[name] = np.random.randn(*shape) * scale

    parameters["b"] = np.zeros((n_a, 1))
    parameters["by"] = np.zeros((n_y, 1))

    return parameters

def rnn_step_forward(parameters, a_prev, x):
    """
    Run one forward time step of the RNN cell.

    Arguments:
    parameters -- dictionary with the entries "Waa", "Wax", "Wya", "by", "b"
    a_prev -- hidden state from the previous time step
    x -- input for this time step

    Returns:
    a_next -- new hidden state, tanh(Wax.x + Waa.a_prev + b)
    p_t -- softmax(Wya.a_next + by), the probabilities for this step
    """
    Waa, Wax, Wya, by, b = (parameters[key] for key in ('Waa', 'Wax', 'Wya', 'by', 'b'))

    pre_activation = np.dot(Wax, x) + np.dot(Waa, a_prev) + b
    a_next = np.tanh(pre_activation)

    # Scores before normalisation, then the probabilities.
    scores = np.dot(Wya, a_next) + by
    p_t = softmax(scores)

    return a_next, p_t

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """
    Backpropagate through one time step and accumulate into `gradients`.

    The batch size m is read from x.shape[1]. Parameter gradients are averaged
    over the batch (factor 1/m); the gradient handed to the previous time step
    ('da_next') is kept per column.

    Arguments:
    dy -- gradient of the loss with respect to the output scores of this step
          (assumed shape (n_y, m))
    gradients -- dictionary with the accumulators "dWya", "dby", "db", "dWax",
                 "dWaa" and the incoming hidden-state gradient "da_next";
                 the accumulators are updated in place
    parameters -- dictionary providing "Wya" and "Waa"
    x -- input of this time step, shape (n_x, m)
    a -- hidden state of this time step (assumed shape (n_a, m))
    a_prev -- hidden state of the previous time step (assumed shape (n_a, m))

    Returns:
    gradients -- the same dictionary, with "da_next" replaced by the gradient
                 flowing into the previous hidden state
    """
    m = x.shape[1]
    inv_m = 1 / m

    # dWya is the (n_y, n_a) product dy.a^T. It must not be summed over an
    # axis: a (n_y, 1) row sum would broadcast the same value into every column.
    gradients['dWya'] += inv_m * np.dot(dy, a.T)
    gradients['dby'] += inv_m * np.sum(dy, axis=1, keepdims=True)

    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next']  # backprop into the hidden state
    daraw = (1 - a * a) * da  # backprop through the tanh nonlinearity

    # Reduce over the batch axis so the (n_a, 1) bias accumulator also works
    # for m > 1; for m == 1 this equals adding daraw directly.
    gradients['db'] += inv_m * np.sum(daraw, axis=1, keepdims=True)
    gradients['dWax'] += inv_m * np.dot(daraw, x.T)
    gradients['dWaa'] += inv_m * np.dot(daraw, a_prev.T)

    gradients['da_next'] = np.dot(parameters['Waa'].T, daraw)
    return gradients

def update_parameters(parameters, gradients, lr):
    """
    Apply one gradient-descent step, updating the entries of `parameters` in place.

    Arguments:
    parameters -- dictionary with "Wax", "Waa", "Wya", "b", "by"
    gradients -- dictionary with "dWax", "dWaa", "dWya", "db", "dby"
    lr -- learning rate

    Returns:
    parameters -- the same dictionary that was passed in
    """
    pairs = (
        ('Wax', 'dWax'),
        ('Waa', 'dWaa'),
        ('Wya', 'dWya'),
        ('b', 'db'),
        ('by', 'dby'),
    )
    for param_key, grad_key in pairs:
        parameters[param_key] += -lr * gradients[grad_key]
    return parameters

def rnn_forward(X, Y, a0, parameters, vocab_size = 27):
    """
    Run the RNN over the whole sequence and accumulate the cross-entropy loss.

    Arguments:
    X -- sequence of inputs; X[t] is passed unchanged to rnn_step_forward
    Y -- sequence of target indices; Y[t] selects the row of the step's probabilities
    a0 -- initial hidden state (copied, not modified)
    parameters -- parameter dictionary forwarded to rnn_step_forward
    vocab_size -- not used by this function

    Returns:
    loss -- sum over time steps of -log(probability of the target)
    cache -- tuple (y_hat, a, X): y_hat and a are dictionaries keyed by the
             time step, and a[-1] holds the copy of a0
    """
    # The initial state is stored under key -1 so that a[t-1] is valid at t == 0.
    a = {-1: np.copy(a0)}
    y_hat = {}

    loss = 0

    for t in range(len(X)):
        hidden, probs = rnn_step_forward(parameters, a[t-1], X[t])
        a[t] = hidden
        y_hat[t] = probs

        # Cross-entropy term of this step (column 0 of the probabilities).
        loss -= np.log(probs[Y[t],0])

    cache = (y_hat, a, X)

    return loss, cache

def rnn_backward(X, Y, parameters, cache):
    """
    Backpropagate through time over the whole sequence.

    Arguments:
    X -- input sequence; only its length is used here
    Y -- sequence of target indices
    parameters -- dictionary with "Waa", "Wax", "Wya", "by", "b"
    cache -- tuple (y_hat, a, x), as returned by rnn_forward

    Returns:
    gradients -- dictionary with "dWax", "dWaa", "dWya", "db", "dby", "da_next"
    a -- the hidden-state dictionary taken from the cache
    """
    (y_hat, a, x) = cache

    # One zero accumulator per parameter, with the parameter's shape.
    gradients = {}
    for grad_key, param_key in (('dWax', 'Wax'), ('dWaa', 'Waa'), ('dWya', 'Wya'),
                                ('db', 'b'), ('dby', 'by')):
        gradients[grad_key] = np.zeros_like(parameters[param_key])
    gradients['da_next'] = np.zeros_like(a[0])

    # Walk the time steps from last to first.
    for t in range(len(X) - 1, -1, -1):
        # Gradient of the loss w.r.t. the output scores: probabilities minus
        # one at the target row.
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])

    return gradients, a



# 将唐诗 json 数据按句提取 方便简单模型处理

In [45]:
# NOTE(review): `load_corpus` is neither defined nor imported in the visible part
# of this notebook, so this cell fails on a fresh kernel unless it comes from elsewhere.
# It reads a Song-dynasty file (poet.song.0.json) while the following cells work on
# poet.tang.* files, and `raw_data` is not used in the visible cells - confirm it is needed.
raw_data = load_corpus("../../data/chinese-poetry-master/json/poet.song.0.json")

In [46]:
import json

In [47]:
# Load the first Tang-poetry JSON file.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform default codec, which can fail on non-UTF-8 systems.
with open("../../data/chinese-poetry-master/json/poet.tang.0.json", encoding="utf-8") as f:
    tangshi0 = json.load(f)

In [48]:
from zhconv import convert

In [49]:
# Load the first Tang-poetry JSON file into `tangshi`.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform default codec, which can fail on non-UTF-8 systems.
with open("../../data/chinese-poetry-master/json/poet.tang.0.json", encoding="utf-8") as f:
    tangshi = json.load(f)

In [50]:
# Show the 'paragraphs' entry (the list of lines) of the first poem.
tangshi[0]['paragraphs']

['秦川雄帝宅，函谷壯皇居。', '綺殿千尋起，離宮百雉餘。', '連甍遙接漢，飛觀迥凌虛。', '雲日隱層闕，風煙出綺疎。']

In [51]:
def generate_dataset_for_V1(json_dir="../../data/chinese-poetry-master/json",
                            num_files=58,
                            output_path="../../data/chinese-poetry-master/tangshi_v1.npy"):
    """
    Collect every line of the Tang poems and save them as one NumPy file.

    Reads the files `poet.tang.<i*1000>.json` for i in range(num_files) from
    `json_dir`, converts each entry of every poem's 'paragraphs' list with
    convert(..., 'zh-cn'), and saves the resulting list with np.save.

    Arguments:
    json_dir -- directory containing the poet.tang.*.json files
    num_files -- number of consecutive files to read (0, 1000, 2000, ...)
    output_path -- destination of the saved array

    The defaults reproduce the paths and file count this notebook was written for.
    """
    tangshi_set_in_sentence = []
    for i in range(num_files):
        path = json_dir + "/poet.tang." + str(i * 1000) + ".json"
        # JSON text is UTF-8; passing the encoding avoids depending on the
        # platform default codec.
        with open(path, encoding="utf-8") as f:
            tangshis = json.load(f)
        for shi in tangshis:
            for sentence in shi['paragraphs']:
                tangshi_set_in_sentence.append(convert(sentence, 'zh-cn'))
    np.save(output_path, tangshi_set_in_sentence)


# Build the sentence-level dataset: reads the poet.tang.*.json files and writes tangshi_v1.npy.
generate_dataset_for_V1()
            
        

# 唐诗分为五言和七言

In [10]:
# Reload the saved sentences. Note that this rebinds `tangshi`, which earlier
# held the parsed JSON of a single file.
tangshi = np.load("../../data/chinese-poetry-master/tangshi_v1.npy")

In [17]:
# Empty containers for the five-character and seven-character sentences.
tangshi_5yan, tangshi_7yan = [], []

In [18]:
# Split the sentences by length: longer than 12 characters goes to the 7-character
# set, everything else to the 5-character set. (A five-character couplet line is
# 5 + 1 + 5 + 1 = 12 characters including punctuation.)
# NOTE(review): lines of any other length are not filtered out; they land in
# whichever bucket the threshold puts them.
# Both lists are rebuilt from scratch so that re-running this cell does not
# append every sentence a second time.
tangshi_7yan = [sentence for sentence in tangshi if len(sentence) > 12]
tangshi_5yan = [sentence for sentence in tangshi if len(sentence) <= 12]

In [21]:
# Save the two sentence lists as separate .npy files.
np.save("../../data/chinese-poetry-master/tangshi_v1_5yan.npy", tangshi_5yan)
np.save("../../data/chinese-poetry-master/tangshi_v1_7yan.npy", tangshi_7yan)

# 读取字嵌入向量，添加标点符号嵌入向量并存储

In [24]:
from zhon.hanzi import punctuation

In [51]:
# NOTE(review): `load_char_embeddings` is neither defined nor imported in the visible
# part of this notebook. Judging by how the results are used below, it presumably
# returns three dicts (char -> index, index -> char, index -> vector) - confirm
# against its definition.
char_to_index, index_to_char, index_to_vec = load_char_embeddings("../../data/embedding/sogou")

In [43]:
# NOTE(review): `load_words_and_embeddings` is neither defined nor imported in the
# visible part of this notebook. Presumably `words[k]` and `embeddings[k]` belong
# together - confirm against its definition.
embeddings, words = load_words_and_embeddings("../../data/embedding/sogou/") 

In [44]:
# Drop the first entry of `words`, then display the result.
# NOTE(review): this cell is not idempotent - every re-run removes one more entry.
# NOTE(review): `embeddings` is not sliced here. If it was aligned with the
# unsliced `words`, later lookups of embeddings[index] by position in `words`
# are off by one - confirm how load_words_and_embeddings builds the two arrays.
words = words[1:]
words

array(['，', '的', '。', ..., '林展海', '８２０２３３３０', '西萨'], dtype='<U25')

In [45]:
# Number of characters in the vocabulary before punctuation is added.
len(char_to_index)

6892

In [52]:
# Containers for the punctuation entries that will be added to the vocabulary.
c_to_ix_plus = {}
ix_to_c_plus = {}
ix_to_v_plus = {}

# First free index. Derived from the current vocabulary size rather than
# hard-coded (it was the literal 6892, the size shown by len(char_to_index)).
# NOTE(review): assumes the existing indices are 0 .. len(char_to_index) - 1.
i = len(char_to_index)

In [53]:
# Collect punctuation marks that are missing from the character vocabulary and
# give each the next free index, together with its embedding vector.
# A set is used so that only exact single-character matches count: a substring
# test against the string would also accept "" and multi-character tokens such as "!?".
# NOTE(review): embeddings[index] assumes `words` and `embeddings` are aligned
# position by position; `words` had its first entry removed in an earlier cell.
punctuation_marks = set(",.!?，。？！")
for index, word in enumerate(words):
    # The c_to_ix_plus check keeps a repeated token from receiving two indices.
    if word in punctuation_marks and word not in char_to_index and word not in c_to_ix_plus:
        c_to_ix_plus[word] = i
        ix_to_c_plus[str(i)] = word
        ix_to_v_plus[str(i)] = embeddings[index]
        i = i + 1

In [54]:
# Merge the collected punctuation entries into the three lookup dicts (in place).
char_to_index.update(c_to_ix_plus)
index_to_char.update(ix_to_c_plus)
index_to_vec.update(ix_to_v_plus)

In [55]:
# Vocabulary size after adding the punctuation entries.
len(char_to_index)

6900

In [56]:
# Persist the extended lookup dicts.
# np.save stores a dict as a pickled object array, so reading these files back
# requires np.load(..., allow_pickle=True) followed by .item() to get the dict.
np.save("../../data/embedding/sogou/char_embedding_plus_punctuation/char_to_index.npy", char_to_index)
np.save("../../data/embedding/sogou/char_embedding_plus_punctuation/index_to_char.npy", index_to_char)
np.save("../../data/embedding/sogou/char_embedding_plus_punctuation/index_to_vec.npy", index_to_vec)