In [1]:
# keras module for building LSTM 
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from sklearn.utils import shuffle


# set seeds for reproducability
from numpy.random import seed
seed(14)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
all_reviews = []
reviews = pd.read_csv('reviews.csv')
reviews = reviews['text']
reviews = shuffle(reviews)
reviews = reviews[:2500]

print(len(reviews))
print(reviews.sample())

2500
83167    Ray personally came out the day after I called...
Name: text, dtype: object


In [3]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(reviews):
    ## tokenization
    tokenizer.fit_on_texts(reviews)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in reviews:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(reviews)
inp_sequences[:10]

[[18, 30],
 [18, 30, 6],
 [18, 30, 6, 60],
 [18, 30, 6, 60, 181],
 [18, 30, 6, 60, 181, 11],
 [18, 30, 6, 60, 181, 11, 21],
 [18, 30, 6, 60, 181, 11, 21, 104],
 [18, 30, 6, 60, 181, 11, 21, 104, 10],
 [18, 30, 6, 60, 181, 11, 21, 104, 10, 889],
 [18, 30, 6, 60, 181, 11, 21, 104, 10, 889, 2148]]

In [4]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [5]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 923, 10)           132980    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 13298)             1343098   
                                                                 
Total params: 1,520,478
Trainable params: 1,520,478
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(predictors, label, epochs=100, verbose=5)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.