In [1]:
# keras module for building LSTM 
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from sklearn.utils import shuffle
import tensorflow as tf


# set seeds for reproducability
from numpy.random import seed
seed(54)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Check if a GPU is available
print(tf.config.list_physical_devices('GPU'))

# Check if TensorFlow is using a GPU
print(tf.test.is_built_with_cuda())
print(tf.test.is_gpu_available())

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [3]:
all_reviews = []

reviews = pd.read_csv('reviews.csv')
reviews = shuffle(reviews)

#Standardise and tokenize
for column in reviews:
    reviews['text'] = reviews['text'].str.lower()   #Covert the text to lower case
    reviews['text'].str.replace('[^\w\s]','') #Remove punctuation
    reviews['text'].str.strip() #Remove whitespace
    reviews['text'].str.replace("\n", " ") #Remove escape characters
reviews = reviews['text']

#Use only a slice of the data
reviews = reviews[:1100]
print(len(reviews))
print(reviews.sample())

1100
92257    one star too many -- while are sever\r\nlori l...
Name: text, dtype: object


In [4]:
def get_sequence_of_tokens(reviews):
  # Initialize the tokenizer
  tokenizer = Tokenizer()

  # Fit the tokenizer on the texts
  tokenizer.fit_on_texts(reviews)

  # Get the total number of words
  num_words = len(tokenizer.word_index) + 1

  # Convert the texts to sequences of tokens
  sequences = []
  for line in reviews:
      token_list = tokenizer.texts_to_sequences([line])[0]
      for i in range(1, len(token_list)):
          ngram_sequence = token_list[:i+1]
          sequences.append(ngram_sequence)
  return sequences, num_words

# Get the input sequences and the total number of words
sequences, num_words = get_sequence_of_tokens(reviews)

# Print the first 10 input sequences
print(sequences[:10])

[[27, 160], [27, 160, 374], [27, 160, 374, 3275], [27, 160, 374, 3275, 212], [27, 160, 374, 3275, 212, 4], [27, 160, 374, 3275, 212, 4, 223], [27, 160, 374, 3275, 212, 4, 223, 52], [27, 160, 374, 3275, 212, 4, 223, 52, 104], [27, 160, 374, 3275, 212, 4, 223, 52, 104, 1056], [27, 160, 374, 3275, 212, 4, 223, 52, 104, 1056, 15]]


In [5]:
def gen_pad_sequences(sequences):
    # Find the maximum length of all sequences
    max_sequence_len = max([len(x) for x in sequences])

    # Pad all sequences to the maximum length
    sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))

    # Split the sequences into predictors and label
    predictors, label = sequences[:,:-1],sequences[:,-1]
    label = ku.to_categorical(label, num_classes=num_words)
    return predictors, label, max_sequence_len


predictors, label, max_sequence_len = gen_pad_sequences(sequences)

In [6]:
def create_model(max_sequence_len, num_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(num_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(num_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, num_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 817, 10)           88760     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 8876)              896476    
                                                                 
Total params: 1,029,636
Trainable params: 1,029,636
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(predictors, label, epochs=50,batch_size=30)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    tokenizer = Tokenizer()
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    predicted = model.predict(token_list, verbose=0)

    # Use the `argmax()` method to get the index of the highest predicted probability
    predicted_index = np.argmax(predicted)

    output_word = ""
    for word,index in tokenizer.word_index.items():
        if index == predicted_index:
            output_word = word
            break
    return seed_text + " " + output_word


In [None]:
review = (generate_text("For what it is", 1, model, max_sequence_len))
print (review)
for i in range (30):
    review = (generate_text(review, i, model, max_sequence_len))

print(review)

For what it is a
For what it is a little casual delivery to find a great experience and the waitress was very friendly and the service was very friendly and the staff was friendly and the staff is very
