In [1]:
import random
import sys
import numpy as np
import pandas as pd
import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.utils import np_utils
from keras import optimizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
df=pd.read_csv('/scratch/kc3172/Yelp/notebook/five_star_restaurants_reviews_only.csv')
len(df)

In [None]:
#train on shorter reviews.  Already lots of data, easier to train on shorter ones too
mask = (df['text'].str.len() < 251) 
df2 = df.loc[mask]
len(df2)

In [None]:
# tt = df2.iloc[:,0].str.contains("[^(\u0000-\u007e|\u00e9|\u00f1|\u00e9|\u00fb|\u00f3|\xa0)]")
tt = df2.iloc[:,0].str.contains("[^(\u0000-\u002a|\u002c-\u007e)]")

df3 = df2[tt]
print("eliminated dataframes rows: ", len(df3))
df2 = df2.drop(list(df3.index.values))
print("current row number: ", len(df2))

In [None]:
# print(df3.values[200:300])

In [None]:
#shuffle the order of the reviews so we don't train on 100 Subway ones in a row
short_reviews=df2.sample(frac=1).reset_index(drop=True)

In [None]:
#only run this the first time, it will save a txt file on your computer
filename='short_reviews_shuffle.txt'
short_reviews.to_csv(filename, header=None, index=None, sep=' ')

In [2]:
text = open('short_reviews_shuffle.txt').read()
print('Corpus length:', len(text))

Corpus length: 71538040


In [3]:
# List of unique characters in the corpus

chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
# Dictionary mapping unique characters to their index in `chars`
char_indices = dict((char, chars.index(char)) for char in chars)

maxlen=60
step=1

Unique characters: 95


In [4]:
#This get Data From Chunk is necessary to process large data sets like the one we have
#If you're using a sample less than 1 million characters you can train the whole thing at once

def getDataFromChunk(txtChunk, maxlen=60, step=1):
    sentences = []
    next_chars = []
    for i in range(0, len(txtChunk) - maxlen, step):
        sentences.append(txtChunk[i : i + maxlen])
        next_chars.append(txtChunk[i + maxlen])
    print('nb sequences:', len(sentences))
    print('Vectorization...')
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1
    return [X, y]

In [36]:
keras.backend.clear_session()

In [37]:
model = keras.models.Sequential()
model.add(layers.LSTM(1024, input_shape=(maxlen, len(chars)),return_sequences=True))
model.add(layers.LSTM(1024, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))
model.load_weights("./saved_weights/May-5-all-01-0.6371.hdf5")
# model.load_weights("May-2-all-01-1.3136.hdf5")
# model.load_weights("May-3-all-01-1.1973.hdf5")

In [38]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 60, 1024)          4587520   
_________________________________________________________________
lstm_2 (LSTM)                (None, 1024)              8392704   
_________________________________________________________________
dense_1 (Dense)              (None, 95)                97375     
Total params: 13,077,599
Trainable params: 13,077,599
Non-trainable params: 0
_________________________________________________________________


In [39]:
optimizer = keras.optimizers.Adam(lr=0.000003)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [40]:
# this saves the weights everytime they improve so you can let it train.  Also learning rate decay
filepath="./saved_weights/May-5-all-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1,
              patience=1, min_lr=0.000001)
callbacks_list = [checkpoint, reduce_lr]

In [41]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [42]:
#This trains the model batching from the text file
#every epoch it prints out 300 characters at different "temperatures"
#temperature controls how random the characters sample: more temperature== more crazy (but often better) text
for iteration in range(1, 20):
    print()
    print('-' * 50)
    print('Iteration', iteration)
#     with open("short_reviews_shuffle.txt") as f:
#         for chunk in iter(lambda: f.read(90000), ""):
#             X, y = getDataFromChunk(chunk)
#             model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)
    
     # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('--- Generating with seed: "' + generated_text + '"')

    for temperature in [0.5, 0.8, 1.0]:
        print('------ temperature:', temperature)
        sys.stdout.write(generated_text)

        # We generate 300 characters
        for i in range(300):
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
--- Generating with seed: "t on my forehead. My family liked it too. I'm happy to find "
------ temperature: 0.5
t on my forehead. My family liked it too. I'm happy to find a good place t

  This is separate from the ipykernel package so we can avoid doing imports until


o eat in town.<EOR>"
"<SOR>This place is amazing. The service is great and the food is amazing.<EOR>"
"<SOR>This place is amazing! The place is a bit small for a trendy restaurant and in the summer is always great.  The food selection is great and the staff are welcoming and the atmosp
------ temperature: 0.8
election is great and the staff are welcoming and the atmosphere is friendly. The food is ALWAYS fresh and happy staff are !! They support local businesses and they always seem to fill you up with a smile! Food is always good and my favorite was the chicken tacos. Also, service was excellent. We will definitely be back!!<EOR>"
"<SOR>Love this place!! Chicken
------ temperature: 1.0
l definitely be back!!<EOR>"
"<SOR>Love this place!! Chicken Katsu is awesome! If you can get a pepperoni pizza burrito for 6 recent transaction.The place is criminable. You'll be happy you did.<EOR>"
"<SOR>So good! Great place to come and have the new dessert, 21 defour with multiple kids on the side. 

KeyboardInterrupt: 

In [43]:
#USE THIS TO TEST YOUR OUTPUT WHEN NOT/DONE TRAINING

# Select a text seed at random
start_index = random.randint(0, len(text) - maxlen - 1)
generated_text = text[start_index: start_index + maxlen]
print('--- Generating with seed: "' + generated_text + '"')

for temperature in [0.5, 0.8, 1.0]:
    print('------ temperature:', temperature)
    sys.stdout.write(generated_text)

        # We generate 300 characters
    for i in range(300):
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]

        generated_text += next_char
        generated_text = generated_text[1:]

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

--- Generating with seed: "steamed buns are very good as well some of the better ones a"
------ temperature: 0.5
steamed buns are very good as well some of the better ones are to die for

  This is separate from the ipykernel package so we can avoid doing imports until


.<EOR>"
"<SOR>We came in here on our way back to Wisconsin. The prices are great and the food is always fresh and delicious. My favorite is the chicken parm 

KeyboardInterrupt: 

In [None]:
# model.save_weights("May_1_model.hdf5")