In [None]:
import random
import sys
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.utils import np_utils
from keras import optimizers
import keras
from keras import layers

# Data processing

In [None]:
# Load data. The read-only inputs are opened in `with` blocks so their file
# handles are released as soon as the lines have been read into memory.
with open('train_reviews.txt', 'r') as reviews_file:
    train_reviews = reviews_file.readlines()
with open('train_context.txt', 'r') as context_file:
    train_cate = context_file.readlines()
with open('test_context.txt', 'r') as context_file:
    test_cate = context_file.readlines()  # seed text prefix

# Output file for the generated reviews. It is deliberately left open here:
# the generation cell writes to it and the last cell of the notebook closes it.
generated_review = open('generate.txt', 'w')

# Concat reviews & contexts, using the literal token <SEP> as separator.
# rstrip('\n') removes only a trailing newline, so a final context line that
# has no newline keeps its last character (a blind [:-1] would cut it off).
train_reviews = [
    train_cate[idx].rstrip('\n') + '<SEP> ' + review.replace('\n', '')
    for idx, review in enumerate(train_reviews)
]

df = pd.DataFrame(train_reviews, columns=['text'])
# Remove every literal '+' character from the text.
df = df.replace({r'\+': ''}, regex=True)

In [None]:
# Keep only the shorter rows (fewer than 301 characters of text): there is
# already plenty of data, and short sequences are easier to train on.
mask = df['text'].str.len().lt(301)
df = df[mask]
len(df)

3023

In [None]:
# Shuffle the order of the reviews so we don't train on 100 Subway ones in a row
short_reviews = df.sample(frac=1).reset_index(drop=True)

# NOTE(review): with sep=' ' and default quoting, to_csv wraps any row that
# contains a space in double quotes. That is presumably where the trailing `"`
# in the generated samples comes from -- confirm whether this is intended.
filename = 'short_reviews_shuffle.txt'
short_reviews.to_csv(filename, header=None, index=None, sep=' ')

# Read the corpus back through a context manager so the handle is closed.
with open(filename) as corpus_file:
    text = corpus_file.read()
print('Corpus length:', len(text))

Corpus length: 353019


In [None]:
# Sorted list of the unique characters in the corpus
chars = sorted(set(text))
print('Unique characters:', len(chars))
# Dictionary mapping each unique character to its index in `chars`.
# enumerate() yields the index directly instead of re-scanning the list with
# chars.index() for every character.
char_indices = {char: index for index, char in enumerate(chars)}

maxlen = 60  # number of characters in one input window
step = 1     # stride between the starts of consecutive windows

Unique characters: 85


In [None]:
# Display the character -> index mapping built in the previous cell.
char_indices

In [None]:
# getDataFromChunk vectorizes one chunk of text at a time, which is necessary for large data sets like the one we have.
# If you're using a sample of less than 1 million characters, you can train on the whole thing at once.

def getDataFromChunk(txtChunk, maxlen=60, step=1, char_to_index=None):
    """One-hot encode a text chunk into (window, next character) training pairs.

    Every window of `maxlen` consecutive characters, taken every `step`
    characters, becomes one input row; the character that directly follows the
    window is its target.

    Args:
        txtChunk: The text to vectorize.
        maxlen: Number of characters per input window.
        step: Distance between the starts of consecutive windows.
        char_to_index: Optional mapping from character to one-hot position,
            expected to cover indices 0..len(mapping)-1. Defaults to the
            module-level `char_indices`.

    Returns:
        A list `[X, y]` of boolean arrays: X has shape
        (n_windows, maxlen, vocab_size) and y has shape (n_windows, vocab_size).
        Both have zero rows when the chunk is not longer than `maxlen`.

    Raises:
        KeyError: If the chunk contains a character missing from the mapping.
    """
    if char_to_index is None:
        char_to_index = char_indices
    vocab_size = len(char_to_index)

    starts = range(0, len(txtChunk) - maxlen, step)
    sentences = [txtChunk[i: i + maxlen] for i in starts]
    next_chars = [txtChunk[i + maxlen] for i in starts]
    print('nb sequences:', len(sentences))
    print('Vectorization...')
    # The builtin `bool` is used as dtype: the `np.bool` alias was removed
    # from NumPy.
    X = np.zeros((len(sentences), maxlen, vocab_size), dtype=bool)
    y = np.zeros((len(sentences), vocab_size), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_to_index[char]] = 1
        # The target is set once per window, not once per character in it.
        y[i, char_to_index[next_chars[i]]] = 1
    return [X, y]

# Model training

In [None]:
# Two stacked LSTM layers followed by a softmax over the character vocabulary.
model = keras.models.Sequential()
# return_sequences=True makes the first LSTM emit one vector per timestep,
# which is the input the second LSTM expects.
model.add(layers.LSTM(1024, input_shape=(maxlen, len(chars)), return_sequences=True))
# Only the first layer of a Sequential model needs an input shape; later
# layers take theirs from the previous layer's output.
model.add(layers.LSTM(1024))
model.add(layers.Dense(len(chars), activation='softmax'))

In [None]:
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated
# and is rejected by newer Keras optimizers.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Save the weights every time the training loss reaches a new minimum, so the
# run can be left unattended, and halve the learning rate when the loss stalls.
# NOTE(review): the training cell calls fit() with epochs=1 each time;
# ReduceLROnPlateau may reset its patience counter at the start of every fit()
# call -- confirm that the learning-rate decay actually triggers.
filepath = "{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=1, min_lr=0.00001)
callbacks_list = [checkpoint, reduce_lr]

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after rescaling it by `temperature`.

    The probabilities are raised to the power 1/temperature and renormalized,
    then a single index is drawn from the resulting distribution. Lower
    temperatures concentrate the draw on the most probable index; a
    temperature of zero (or below) returns that index deterministically.

    Args:
        preds: Sequence of non-negative probabilities.
        temperature: Rescaling factor applied to the log-probabilities.

    Returns:
        The sampled index into `preds`.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Limit case: avoid dividing by zero and pick the most probable index.
        return np.argmax(preds)
    # Zero probabilities map to -inf, which exp() turns back into 0 below;
    # the divide-by-zero warning from log(0) is therefore silenced.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift so the largest value is 0: without this, small temperatures make
    # every exp() underflow to 0 and the normalization yields NaN.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Train by streaming the corpus file in fixed-size character chunks, so only
# one chunk is vectorized and fitted at a time.
# Each pass over the file is one "iteration"; range(1, 20) gives 19 passes.
# No text is sampled during training: temperature-controlled sampling (more
# temperature == more random text) is done afterwards in the generation cell.
chunk_size = 80000
for iteration in range(1, 20):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    with open("short_reviews_shuffle.txt") as f:
        for chunk in iter(lambda: f.read(chunk_size), ""):
            # A trailing chunk of `maxlen` characters or fewer produces no
            # training windows, and fit() cannot run on empty arrays: skip it.
            if len(chunk) <= maxlen:
                continue
            # Pass the window settings explicitly so they always match the
            # input shape the model was built with.
            X, y = getDataFromChunk(chunk, maxlen=maxlen, step=step)
            model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)


--------------------------------------------------
Iteration 1
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from inf to 2.17027, saving model to 01-2.1703.hdf5
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from 2.17027 to 1.43988, saving model to 01-1.4399.hdf5
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from 1.43988 to 1.28438, saving model to 01-1.2844.hdf5
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from 1.28438 to 1.18202, saving model to 01-1.1820.hdf5
nb sequences: 32959
Vectorization...

Epoch 00001: loss improved from 1.18202 to 1.14429, saving model to 01-1.1443.hdf5

--------------------------------------------------
Iteration 2
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from 1.14429 to 1.12184, saving model to 01-1.1218.hdf5
nb sequences: 79940
Vectorization...

Epoch 00001: loss improved from 1.12184 to 1.03398, saving model to 01-1.0340.hdf5
nb sequences: 79940
Vectoriza

KeyboardInterrupt: ignored

# Generating reviews

In [None]:
# Use each test context as the seed and let the model continue it.
# For every context three lines are written to `generated_review`:
# the context, the reference review, and the generated review.
with open('test_reviews.txt', 'r') as reviews_file:
    test_reviews = reviews_file.readlines()
with open('test_context.txt', 'r') as context_file:
    test_cate = context_file.readlines()

for ci, cate in enumerate(test_cate):
    seed_text = cate.strip()
    print('--- category: ' + seed_text + ' <SEP>')
    # Context window fed to the model, capped at the training window length.
    generated_text = (seed_text + ' <SEP>')[-maxlen:]
    entire_text = ''

    # Generate a review of at most 200 characters
    for i in range(200):
        # One-hot encode the current context window.
        sampled = np.zeros((1, len(generated_text), len(chars)))
        for t, char in enumerate(generated_text):
            index = char_indices.get(char)
            # A seed character never seen in the training corpus has no
            # one-hot slot; leave its row at zero instead of raising KeyError.
            if index is not None:
                sampled[0, t, index] = 1.

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, 0.8)
        next_char = chars[next_index]
        # A newline marks the end of a review in the corpus.
        if next_char == '\n':
            break

        entire_text += next_char
        # Let the window grow until it reaches `maxlen` and only then slide,
        # so the category prefix is not dropped as soon as generation starts.
        generated_text = (generated_text + next_char)[-maxlen:]

        sys.stdout.write(next_char)
        sys.stdout.flush()

    # End the console line so the next category header starts on its own line.
    print()

    generated_review.write(seed_text + '\n')
    # Normalize the line ending: the last line of the reviews file may lack a
    # newline, which would glue the reference onto the generated text.
    generated_review.write(test_reviews[ci].rstrip('\n') + '\n')
    generated_review.write(entire_text)
    generated_review.write('\n')


--- category: food positive bread <SEP>
 Our server for the unlimited meats, various served vine and wine well."--- category: service positive delivery times <SEP>
 The menu hinters were even but there are three times of Texonaze house that is...)."--- category: food positive Food <SEP>
 I really liked the spot on a whie restaurant week."--- category: food positive coffee <SEP>
 Southern I had the most Delivery."--- category: food conflict ambience positive sushi place <SEP>
 While they would love this place little loud after the meal or the rice -SESUST - deliveress offeres."--- category: anecdotes/miscellaneous positive people <SEP>
 After a terrifies on tee was their Amized at appetizer management that is a cute price, which wasn't a place further when they try the price you can just get counter service your wait anything and to--- category: food positive Japanese food <SEP>
 The portion is a bit soggyingting up on your plate so when we received expected."--- category: food positive

In [None]:
# Closing the handle also flushes any buffered text to generate.txt.
generated_review.close()