In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import jieba

In [2]:
path = 'data/poetry'

In [3]:
def load_data(path):
    file = path
    datasets = pd.read_csv(file, header=0)
    cats = datasets.cat
    labels = datasets.label
    reviews = datasets.review
    rows = np.random.randint(0, len(datasets), len(datasets))
    return np.array(cats[rows]), np.array(reviews[rows]), np.array(labels[rows])

In [4]:
f = open(path)
a = f.read().split('\n')

In [5]:
print (a)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
a[0]

'首春:寒随穷律变，春逐鸟声开。初风飘带柳，晚雪间花梅。碧林青旧竹，绿沼翠新苔。芝田初雁去，绮树巧莺来。'

In [7]:
aa = a[0]

In [8]:
pun = [':', '，', '。', '!']
def cutreviews(texts):
    sent_words = []
    for i in range(len(texts)):
        sent_word = list(jieba.lcut(texts[i]))
#         print (sent_word)
        _sent_words = []
        for x in sent_word:
            if x not in pun:
                _sent_words += x
        
        _sent_word = ''.join(str(x) for x in _sent_words)
        sent_words.append(_sent_word)
    return sent_words  

In [9]:
datasets = cutreviews(a)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.508 seconds.
Prefix dict has been built succesfully.


In [10]:
print (type(datasets))

<class 'list'>


In [11]:
datasets[0]

'首春寒随穷律变春逐鸟声开初风飘带柳晚雪间花梅碧林青旧竹绿沼翠新苔芝田初雁去绮树巧莺来'

## Create Model

In [12]:
import torch
import torch.nn as nn

In [13]:
class PoetryModel(nn.Module):
    def __init__(self, vocab, embedding_dim, hidden_dim):
        super(PoetryModel, self).__init__()
        self.hidden_dim = hidden_dim
        n_vocab = len(vocab)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=2)
        self.linear = nn.Linear(self.hidden_dim, len(vocab))
    
    def forward(self, input, hidden=None):
        batch_size, seq_len = input.size()
#        print (seq_len)
        if hidden is None:
#            h_0 = torch.zeros(2, batch_size, self.hidden_dim)
#            c_0 = torch.zeros(2, batch_size, self.hidden_dim)
            h_0 = input.data.new(2, 1, self.hidden_dim).fill_(0).float()
            #print (h_0)
            c_0 = input.data.new(2, 1, self.hidden_dim).fill_(0).float()
            #print (c_0)
        else:
            h_0, c_0 = hidden
        # size: (seq_len,batch_size,embeding_dim)
        embeds = self.embeddings(input)
        # output size: (seq_len,batch_size,hidden_dim)
        output, hidden = self.lstm(embeds, (h_0, c_0))
        output = self.linear(output.view(seq_len * batch_size, -1))
        return output, hidden

## Preparing for training

In [14]:
import numpy as np

In [15]:
def randomChoice(data):
    if isinstance(data, list):
        data = np.array(data)
    rows = np.random.randint(0, len(data), len(data))
    return data[rows]

In [16]:
def randomchoiceone(data):
    data = np.array(data)
    return data[np.random.randint(0, len(data)-1)]

In [17]:
ran_datasets = randomChoice(datasets)

In [18]:
ran_datasets[1]

'永淳中童谣新禾不入箱新麦不入场迨及八九月狗吠空垣墙'

In [19]:
ran_dataset = randomchoiceone(datasets)

In [20]:
ran_dataset

'咏史诗夷门六龙冉冉骤朝昏魏国贤才杳不存唯有侯嬴在时月夜来空自照夷门'

In [21]:
datatest = ran_datasets[:2]

In [22]:
datatest[0]

'九日九日重阳节开门有菊花不知来送酒若个是陶家'

## Make a vocabulary

In [23]:
def vocabb(data: np.ndarray):
    vocabdict = []
    for i in range(len(data)):
        row = data[i]
        for word in data[i]:
            if word not in vocabdict:
                vocabdict.append(word)
    num = range(0, len(vocabdict)-1)
    
    return dict(zip(vocabdict, num))

In [24]:
vocabl = vocabb(ran_datasets)

In [25]:
def vocabre(vocab: dict):
    return dict((v,k) for k, v in vocab.items())

## One-hot vector for category

In [26]:
def chartoindex(char, vocab):
    num = vocab[char]
    return num

In [27]:
def chartotensor(char, vocab):
    tensor = torch.zeros(1, len(vocab))
    tensor[0][chartoindex(char, vocab)] = 1
    return tensor

In [28]:
def inputtensor(line, vocab):
    tensor = torch.zeros(len(line), 1, len(vocab))
    chars = []
    for li in range(len(line)):
        char = line[li]
        tensor[li][0][vocab[char]] = 1
        chars.append(char)
    return tensor, chars

In [29]:
def targettensor(line, vocab):
    char_indexes = [vocab[line[li]] for li in range(1, len(line))]
    char_indexes.append(len(line))
    return torch.LongTensor(char_indexes)

In [30]:
def randomtraingingexample(data, vocab):
    line = randomchoiceone(data)
    input_line_tensor, chars = inputtensor(line, vocab)
    target_line_tensor = targettensor(line, vocab)
    return input_line_tensor, chars, target_line_tensor

## Train

In [31]:
#rnn = RNN(len(vocabl), 1000, len(vocabl)).cuda()
rnn = PoetryModel(vocabl, 256, 500).cuda()
# criterion = nn.NLLLoss()
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001

optimizer = torch.optim.Adam(rnn.parameters(), learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)

In [32]:
def train_test(input_line_tensor, target_line_tensor):
    target_line_tensor.unsqueeze_(-1)
    print (target_line_tensor)

In [33]:
input_line_tensor,chars, target_line_tensor = randomtraingingexample(ran_datasets, vocabl)

In [34]:
print (chars)

['奉', '酬', '袭', '美', '苦', '雨', '四', '声', '重', '寄', '三', '十', '二', '句', '平', '去', '声', '乌', '蟾', '俱', '沈', '光', '昼', '夜', '恨', '暗', '度', '何', '当', '乘', '云', '螭', '面', '见', '上', '帝', '诉', '臣', '言', '阴', '云', '欺', '诏', '用', '利', '剑', '付', '回', '车', '诛', '群', '奸', '自', '散', '万', '籁', '怒']


In [35]:
chartoind = [[chartoindex(char, vocabl)] for char in chars]
torch.tensor(chartoind, dtype=torch.long)

tensor([[ 384],
        [ 514],
        [1982],
        [1089],
        [ 647],
        [  89],
        [ 198],
        [ 945],
        [   2],
        [ 141],
        [ 617],
        [  44],
        [ 383],
        [ 417],
        [ 266],
        [ 748],
        [ 945],
        [1057],
        [2224],
        [ 533],
        [ 667],
        [ 407],
        [ 784],
        [ 547],
        [ 123],
        [ 653],
        [ 971],
        [ 146],
        [ 425],
        [ 913],
        [ 160],
        [4127],
        [1282],
        [ 520],
        [ 261],
        [ 242],
        [3923],
        [1059],
        [1247],
        [  61],
        [ 160],
        [2494],
        [1851],
        [ 513],
        [2443],
        [1629],
        [3228],
        [ 402],
        [1403],
        [2967],
        [ 341],
        [2107],
        [  93],
        [ 452],
        [  79],
        [ 940],
        [2996]])

In [36]:
def train(chars, target_line_tensor):
    chartoind = [[chartoindex(char, vocabl)] for char in chars]
    input_tensor = torch.tensor(chartoind, dtype=torch.long)
    input_tensor = input_tensor.cuda()
    output, hidden = rnn(input_tensor)
    loss = 0
    optimizer.zero_grad()
    l = criterion(output, target_line_tensor)
    loss += l
    optimizer.step()
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
        
    return output, loss.item() / input_line_tensor.size(0)

In [None]:
n_iters = 10000
print_every = 200
plot_every = 500
all_losses = []
total_loss = 0

for iter in range(0, n_iters+1):
    input_line_tensor,chars, target_line_tensor = randomtraingingexample(ran_datasets, vocabl)
#    input_line_tensor = input_line_tensor.cuda()
    target_line_tensor = target_line_tensor.cuda()
#     print (input_line_tensor)
#     print (target_line_tensor)
    
    output, loss = train(chars, target_line_tensor)
    total_loss += loss
    
    if iter % print_every == 0:
        print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, loss))
        
    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

(0 0%) 0.1346


In [None]:
max_lenght = 7

In [None]:
def sample(vocab, revocab, start_char='成' ):
    with torch.no_grad():
#        input = inputtensor(start_char, vocab)
        chartoind = [[chartoindex(char, vocabl)] for char in start_char]
        input_tensor = torch.tensor(chartoind, dtype=torch.long)
#        input = input.cuda()
        output_p = start_char
        
        for i in range(max_lenght):
            output, hidden = rnn(input_tensor)
            topv, topi = output.topk(1)
            topi = int(topi[0][0])
#            print (topi)
            if topi == len(vocab):
                break
            else:
                char = revocab[topi]
                output_p += char
    return output_p 

In [None]:
def samples(vocab,revocab, start_chars='天上'):
    for start_char in start_chars:
        print (sample(vocab, revocab, start_char))

In [None]:
samples(vocabl,re_vocab, start_chars='天上人间')