In [1]:
import os
import numpy as np
from gensim import models
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.layers import Embedding
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
import json
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Filesystem layout, expressed relative to the notebook's working directory.
BASE_DIR = '../'

# Folder for pretrained embedding files (the original note here pointed at
# http://nlp.stanford.edu/projects/glove/ as a source of pretrained vectors).
EMBEDDING_DIR = '%sembeddings/' % BASE_DIR

# NOTE(review): this evaluates to '../../data/' -- confirm the extra '../' is intended.
TEXT_DATA_DIR = '%s../data/' % BASE_DIR

# Pipe-delimited review file read by load_data(), and whether its first row is a header.
TEXT_DATA_FILE = "reviews_rt_all.csv"
HEADER = True

In [3]:
def load_data(path=None, header=None):
    """Read the pipe-delimited review file into two parallel lists.

    Every non-empty row is expected to look like ``label|text``. Only the
    first ``|`` is treated as the separator, so review texts that contain a
    pipe character themselves are kept intact instead of breaking the unpack.

    Args:
        path: File to read. Defaults to ``TEXT_DATA_FILE`` inside
            ``TEXT_DATA_DIR``.
        header: Whether the first line is a header row to skip. Defaults to
            the module-level ``HEADER`` flag.

    Returns:
        A ``(data, labels)`` tuple of lists of strings, one entry per row,
        in file order.

    Raises:
        ValueError: If a non-empty row contains no ``|`` separator.
    """
    if path is None:
        path = os.path.join(TEXT_DATA_DIR, TEXT_DATA_FILE)
    if header is None:
        header = HEADER

    data = []
    labels = []
    with open(path, "r") as f:
        if header:
            # Skip the header row; the default keeps an empty file from raising.
            next(f, None)
        for line in f:
            line = line.rstrip("\n")
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            # maxsplit=1: everything after the first pipe belongs to the text.
            temp_y, temp_x = line.split("|", 1)
            data.append(temp_x)
            labels.append(temp_y)

    return data, labels
# Read the corpus once, then replace the list of label strings with a compact int8 array.
data, labels = load_data()
labels = np.array(labels, dtype='int8')

In [4]:
# Experiment settings shared by the cells below.
RANDOM_SEED = 42            # seed intended for the train/test split
VALIDATION_SPLIT = 0.2      # fraction of samples intended to be held out
MAX_SEQUENCE_LENGTH = 50    # padded/truncated length of every token sequence
EMBEDDING_DIM = 300         # width of each row of the embedding matrix
MAX_NB_WORDS = 20000        # vocabulary cap handed to the Tokenizer

In [6]:
# Stratified hold-out split. The split size and seed come from the config
# constants (VALIDATION_SPLIT, RANDOM_SEED) rather than repeated literals, so
# changing the config actually changes the split.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    np.asarray(labels, dtype='int8'),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [7]:
def transform(tokenizer, train, test):
    """Turn raw train/test texts into fixed-length index arrays.

    Args:
        tokenizer: An already fitted tokenizer exposing ``texts_to_sequences``
            and ``word_index``.
        train: Training texts.
        test: Test texts.

    Returns:
        ``(data_train, data_test, word_index)``: the two padded arrays of
        shape (nb_samples, MAX_SEQUENCE_LENGTH) and the tokenizer's
        word -> index dictionary.
    """
    # Replace every word by its vocabulary index.
    train_ids, test_ids = [tokenizer.texts_to_sequences(texts) for texts in (train, test)]

    vocabulary = tokenizer.word_index  # word -> index mapping
    print('Found %s unique tokens.' % len(vocabulary))

    # Pad/truncate to MAX_SEQUENCE_LENGTH; note that for longer texts only the
    # last MAX_SEQUENCE_LENGTH words are kept.
    padded_train = pad_sequences(train_ids, maxlen=MAX_SEQUENCE_LENGTH)
    padded_test = pad_sequences(test_ids, maxlen=MAX_SEQUENCE_LENGTH)

    return padded_train, padded_test, vocabulary

In [9]:
# The vocabulary is learned from the training texts only and capped at
# MAX_NB_WORDS; words outside that dictionary are not used.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_train)

X_train, X_test, word_index = transform(tokenizer, data_train, data_test)

# Convert the integer class labels into one-hot rows.
y_train = to_categorical(np.asarray(labels_train))
y_test = to_categorical(np.asarray(labels_test))

print('Shape of data train tensor:', X_train.shape)
print('Shape of data test tensor:', X_test.shape)

Found 56422 unique tokens.
Shape of data train tensor: (82088, 50)
Shape of data test tensor: (20522, 50)


In [10]:
def load_w2v(fname=None):
    """Load pretrained word vectors stored in the binary word2vec format.

    Args:
        fname: Path to the vectors file. Defaults to
            ``GoogleNews-vectors-negative300.bin`` inside ``EMBEDDING_DIR``.

    Returns:
        Whatever gensim's ``KeyedVectors.load_word2vec_format`` returns for
        that file.
    """
    if fname is None:
        # Derive the location from the configured embeddings folder instead of
        # repeating the directory as a second hard-coded literal.
        fname = os.path.join(EMBEDDING_DIR, "GoogleNews-vectors-negative300.bin")
    return models.KeyedVectors.load_word2vec_format(fname, binary=True)


embeddings = load_w2v()

In [11]:
# prepare embedding matrix
# The Keras Tokenizer numbers words starting at 1 (index 0 is left for padding),
# so holding every index of a vocabulary smaller than MAX_NB_WORDS takes
# len(word_index) + 1 rows; without the +1 the last word overflows the matrix.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # Index falls outside the capped vocabulary.
        continue
    try:
        embedding_matrix[i] = embeddings.word_vec(word)
    except KeyError:
        # Words not found in the pretrained vectors stay all-zeros. Only the
        # lookup miss is caught, so unrelated errors are no longer hidden.
        continue

In [12]:
# The embedding matrix has been filled in above, so unbind the loaded word
# vectors here; presumably this is to let their memory be reclaimed.
del embeddings

In [13]:
# Training callbacks: TensorBoard logging, early stopping on val_loss, and
# checkpointing of the best model (lowest val_loss).
CHECKPOINT_PATH = "model/lstm_model.hdf5"

# Create the checkpoint folder up front. NOTE(review): ModelCheckpoint does not
# appear to create missing directories in this Keras version, in which case the
# first save would fail only after a full training epoch; creating it is
# harmless either way.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

callback_1 = TensorBoard(log_dir='./logs', histogram_freq=1, write_graph=False, write_images=True)
callback_2 = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
callback_3 = ModelCheckpoint(CHECKPOINT_PATH, monitor='val_loss', save_best_only=True, verbose=1)

In [14]:
# Lookup layer initialised from the pretrained embedding matrix and kept
# frozen (trainable=False) during training.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
# Frozen pretrained embeddings -> single LSTM -> dropout -> 2-way classifier.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout_U=0.2, dropout_W=0.2))
model.add(Dropout(0.4))
model.add(Dense(2))
# The targets are one-hot rows over two mutually exclusive classes
# (to_categorical), so the head is a softmax trained with categorical
# cross-entropy. Independent sigmoids with binary cross-entropy would treat the
# two columns as unrelated binary problems.
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', "fmeasure"])

# NOTE(review): the test split doubles as validation data, and both
# EarlyStopping and ModelCheckpoint monitor val_loss -- scores on X_test are
# therefore used for model selection and are not an unbiased final estimate.
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          batch_size=128,
          nb_epoch=100,
          callbacks=[callback_1, callback_2, callback_3])

Train on 82088 samples, validate on 20522 samples
INFO:tensorflow:Summary name embedding_1_W:0 is illegal; using embedding_1_W_0 instead.
INFO:tensorflow:Summary name embedding_1_W:0 is illegal; using embedding_1_W_0 instead.
INFO:tensorflow:Summary name lstm_1_W_i:0 is illegal; using lstm_1_W_i_0 instead.
INFO:tensorflow:Summary name lstm_1_W_i:0 is illegal; using lstm_1_W_i_0 instead.
INFO:tensorflow:Summary name lstm_1_U_i:0 is illegal; using lstm_1_U_i_0 instead.
INFO:tensorflow:Summary name lstm_1_U_i:0 is illegal; using lstm_1_U_i_0 instead.
INFO:tensorflow:Summary name lstm_1_b_i:0 is illegal; using lstm_1_b_i_0 instead.
INFO:tensorflow:Summary name lstm_1_b_i:0 is illegal; using lstm_1_b_i_0 instead.
INFO:tensorflow:Summary name lstm_1_W_c:0 is illegal; using lstm_1_W_c_0 instead.
INFO:tensorflow:Summary name lstm_1_W_c:0 is illegal; using lstm_1_W_c_0 instead.
INFO:tensorflow:Summary name lstm_1_U_c:0 is illegal; using lstm_1_U_c_0 instead.
INFO:tensorflow:Summary name lstm_1_