## 讀取文字資料

In [2]:
import keras
import numpy as np

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))

Using TensorFlow backend.


Corpus length: 600893


## 將字元向量化

In [4]:
# Vectorizing sequences of characters

# 從文章中取得序列長度
maxlen = 60

# 按步驟數進行隨機採樣
step = 3

# 保留句子的序列
sentences = []

# 保留下一個字元
next_chars = []

for i in range(0, len(text) - maxlen, step):
    # 根據step，每次取出 maxlen 的序列為 train data
    sentences.append(text[i: i + maxlen])
    # 取出序列的下一個字元當 targets
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

# 產生詞彙表
chars = sorted(list(set(text)))
print('Unique characters:', len(sentences))
# 利用字典對映 char 和索引 index
char_indices = dict((char, chars.index(char)) for char in chars)

# 使用one-hot 編碼
# x：訓練樣本
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
# y：目標樣本
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):        # 對每個句子
    for t, char in enumerate(sentence):         # 對每個詞
        x[i, t, char_indices[char]] = 1         # 將訓練樣本做one-hot
    y[i, char_indices[next_chars[i]]] = 1       # 將標籤做one-hot

Number of sequences: 200278
Unique characters: 200278


## 建立LSTM 模型

In [14]:
# 使用 LSTM 做下一個字元的預測

from keras import layers

model = keras.models.Sequential()
# 輸入為60 * 字串長度
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
# 以字元對應作為輸出
model.add(layers.Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

## 根據預測結果採樣

In [16]:
#temperature 低：會出現重複率比較高但是部分結構比較真實的文章。
#temperature 高：文章會比較有意義，但可能會發明一些新詞，詞的部分結構可能不完整，有的詞只有單詞的一半
#temperature=0.5：文章生成得比較好，在結構和隨機性保持平衡的情況下，產生比較好的句子。


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    # 根據temperature 計算
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    # 回傳多項式分布的概率
    probas = np.random.multinomial(1, preds, 1)
    # 回傳最大index
    return np.argmax(probas)

## 文本生成

In [None]:
import random
import sys

for epoch in range(1, 2):
    print('epoch', epoch)
    # 训訓練模型
    model.fit(x, y, batch_size=128, epochs=1)

    # 從數據裡面選一個隨機樣本
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + generated_text + '"')

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ temperature:', temperature)
        sys.stdout.write(generated_text)

        for i in range(400):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                # 對字元做one-hot
                sampled[0, t, char_indices[char]] = 1.

            # 計算預測的分佈值
            preds = model.predict(sampled, verbose=0)[0]
            # 輸出最大機率的字元 index
            next_index = sample(preds, temperature)
            # 輸出字元
            next_char = chars[next_index]
            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

## 參考資源
- https://github.com/karpathy/char-rnn
- https://www.dlology.com/blog/how-to-generate-realistic-yelp-restaurant-reviews-with-keras/
- https://github.com/Tony607/Yelp_review_generation/releases/download/V0.1/pre-trained.hdf5

## Yelp Review

In [8]:
! pip install h5py

Collecting h5py
  Downloading https://files.pythonhosted.org/packages/44/81/50a0560aac57a33c2a624d9e160735f39d7a6324e3f6f115425a1bf01dd9/h5py-2.8.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (6.0MB)
[K    100% |████████████████████████████████| 6.1MB 236kB/s ta 0:00:011
Installing collected packages: h5py
Successfully installed h5py-2.8.0
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import keras
from keras import layers
import sys
import numpy as np

text= 'this text file can be any text, as long as it contains text longer than maxlen defined below'

chars=['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']

# Dictionary mapping unique characters to their index in `chars`
char_indices = dict((char, chars.index(char)) for char in chars)
maxlen=60
step=1

model = keras.models.Sequential()
model.add(layers.LSTM(1024, input_shape=(maxlen, len(chars)),return_sequences=True))
model.add(layers.LSTM(1024, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))
model.load_weights("pre-trained.hdf5")

optimizer = keras.optimizers.Adam(lr=0.0002)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Using TensorFlow backend.


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [2]:

def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def random_reviews():
    # sample a start index
    start_index = np.random.randint(0, len(text) - maxlen - 1)
    # the initial sampled text with maxlen long
    generated_text = text[start_index: start_index + maxlen]
    print('Coming up with several reviews for you...')

    for temperature in [0.8]:
        sys.stdout.write(generated_text)

        # We generate 600 characters
        for i in range(600):
            sampled = np.zeros((1, maxlen, len(chars)))
            # Turn each char to char index.
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.
            # Predict next char probabilities
            preds = model.predict(sampled, verbose=0)[0]
            # Add some randomness by sampling given probabilities.
            next_index = sample(preds, temperature)
            # Turn char index to char.
            next_char = chars[next_index]
            # Append char to generated text string
            generated_text += next_char
            # Pop the first char in generated text string.
            generated_text = generated_text[1:]
            # Print the new generated char.
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print(generated_text)

In [3]:
random_reviews()

Coming up with several reviews for you...
 be any text, as long as it contains text longer than maxlenee<EOR

  after removing the cwd from sys.path.


>"
"<SOR>Really nice carne asada fries.  All the food was especially great.  Fresh and diligent ingredients. Patio is nice and clean ALWAYS good in a large selection of momma not so sweet and a good presentation. Prices are reasonable.<EOR>"
"<SOR>Love Tomo!!! Always fast and friendly staff.  The staff is always friendly and helpful.  It's lite and crowded on the weekends.<EOR>"
"<SOR>I ate here a couple weeks and absolutely loved it.  We ordered the Gritty and it was very good.  I had the Beef French Dip with Pesto appetizer which was so good.<EOR>"
"<SOR>Food is great, I gotta hangout ich was so good.<EOR>"
"<SOR>Food is great, I gotta hangout 
