In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM

Using TensorFlow backend.


## Vocabulary comes from pre-trained vectors

In [2]:
# Load the word2vec vocabulary (one entry per line) and its embedding array.
vocab_fn = "GoogleNews_100K_vocab.txt"
mat_fn = "GoogleNews_100K.npy"

with open(vocab_fn, 'r') as vocab_file:
    index2word = vocab_file.read().split('\n')
print(len(index2word), "words in vocab")

embedding_mat = np.load(mat_fn)
print(embedding_mat.shape, "embedding matrix")

100000 words in vocab
(100000, 300) embedding matrix


In [3]:
# Ids 0 and 1 are reserved for NULL (0) and UNK (1), so vocabulary words
# are numbered from 2 upwards.
UNK_IND = 1
lookup_with_unk = {word: idx for idx, word in enumerate(index2word, start=2)}

# Put two all-zero rows (NULL, UNK) in front of the pre-trained vectors so
# that row numbers still line up with the shifted word ids.
n_words, n_dims = embedding_mat.shape
embeddings_with_unk = np.zeros((n_words + 2, n_dims))
embeddings_with_unk[2:] = embedding_mat
embeddings_with_unk.shape

(100002, 300)

## Load the Data and Transform It into Model Input

In [4]:
# Standard-library modules first, then the third-party tokenizer.
import re
import string

from nltk import word_tokenize

# Show the characters that the token filter treats as punctuation.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
sent_fn = "test.ft.txt"

sent_len = 30 # limit the sentence length of each input record
total_records = 200000 # limit the number of input records
count = 0

# Encoded sentences are collected in a plain list and turned into one array
# after the loop. Appending a row with np.r_ copies the whole matrix on every
# record (quadratic in the number of records) and, via np.asmatrix, replaces
# the intended int32 array with an int64 np.matrix.
encoded_rows = []
labels = []

with open(sent_fn, 'r') as sent_file:
    for sent in sent_file:
        # lower-case, and rewrite "don't" as "do not": word_tokenize would
        # otherwise yield ["do", "n't"] and our vocab does not contain "n't"
        sent = sent.lower().replace("n't"," not")
        tokens = word_tokenize(sent)
        if not tokens:
            # blank line: there is no label token to read, so skip it
            continue
        # the first token carries the label; the character at index 9 is the
        # class digit, shifted to 0 (negative) or 1 (positive)
        labels.append(int(tokens[0][9])-1)
        tokens = tokens[1:]
        # remove numbers and punctuation (note: `in string.punctuation` is a
        # substring test against the punctuation string)
        tokens_a = [token for token in tokens if not token.isdigit() and token not in string.punctuation]
        # map words to ids; anything outside the vocab becomes UNK
        tokens_id = [lookup_with_unk.get(s, UNK_IND) for s in tokens_a]
        # truncate to sent_len, then right-pad short sentences with 0 (NULL)
        tokens_id = tokens_id[:sent_len]
        tokens_id.extend([0]*(sent_len-len(tokens_id)))
        encoded_rows.append(tokens_id)
        count += 1
        if count >= total_records:
            break

# reshape keeps the (0, sent_len) shape when no record was read
X_matrix = np.array(encoded_rows, dtype=np.int32).reshape(-1, sent_len)

In [6]:
# Peek at the first two encoded sentences, then show the overall shape.
print(X_matrix[:2])
X_matrix.shape

[[  269 62311   128  8281 17245    26    47     1    13   269  6472     1
     69  1614  4503    23  8480     1    30 62311     4    79     1  4503
    153   748    17    63  4503   238]
 [   47     1    13   203    96   638 44046     4     1    96  4503    94
     15   232   174   867    13   595     5  4503    23    91   409     1
    430  2799     1    13    96    13]]


(200000, 30)

In [7]:
# Labels as a NumPy array so they can be sliced alongside X_matrix.
y = np.array(labels)
y.shape[0]

200000

In [8]:
# check balance: count of label 0, then of label 1, within the first 80%
train_labels = y[:int(0.8*total_records)]
print(int(np.sum(train_labels == 0)))
print(int(np.sum(train_labels == 1)))

79287
80713


## Build, Train, and Test the LSTM Model

In [9]:
# Row count and vector width of the embedding table (including NULL and UNK).
vocab_size = embeddings_with_unk.shape[0]
embed_size = embeddings_with_unk.shape[1]
print(vocab_size, embed_size)

100002 300


In [10]:
# build the LSTM model: embedding initialised from embeddings_with_unk,
# a 64-unit LSTM, and a single sigmoid output unit
model = Sequential([
    Embedding(vocab_size, embed_size,
              name="word_vec",
              weights=[embeddings_with_unk]),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [11]:
# train on the first 80% of the records
n_train = int(0.8*total_records)
model.fit(X_matrix[:n_train], y[:n_train],
          batch_size=1024,  # the larger the batch, the better the approximation
          epochs=3,
          verbose=1)  # verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x13c749c88>

In [12]:
# evaluate on the last 20% of the records, which were not used for training
test_start = int(0.8*total_records)
score = model.evaluate(X_matrix[test_start:], y[test_start:],
                       batch_size=1024, verbose=1)
print(score)

[0.28687794504165648, 0.87809999999999999]


In [20]:
# NOTE(review): this cell's execution count (20) is out of sequence with its
# neighbours, so the recorded score may depend on state from later cells.
split = int(0.8*total_records)

# one further epoch on the first 80% of the records
model.fit(X_matrix[:split], y[:split],
          batch_size=1024,
          epochs=1,
          verbose=1)

# re-evaluate on the last 20% of the records
score = model.evaluate(X_matrix[split:], y[split:],
                       batch_size=1024, verbose=1)
print(score)

Epoch 1/1
[0.31486054973602295, 0.87622500000000003]


## Save The Model

In [14]:
from keras.models import load_model

# write the trained model to the HDF5 file 'my_model.h5'
model.save("my_model.h5")
# it can be read back later with: model = load_model("my_model.h5")

## More Manual Tests

In [15]:
# read the model back from the HDF5 file 'my_model.h5'
model = load_model("my_model.h5")

In [16]:
sample_sents = ["This is a joke",
                "The worst product I have ever bought",
                "I like it, and I want to buy it again",
                "I am going to give a presentation.",
                "I am going to have an exam.", 
                "The new tea table looks amazing.", 
                "This is the best class ever!",
                "My mom loves it.",
                "nice weather",
                "This is the best 30 bucks that I have ever spent",
                "Complete waste of money."
               ]

# Encode the samples with the same number/punctuation filter that the data
# loading loop applies to the training records. Filtering with
# token.isalpha() instead would also drop every token containing a non-letter
# character, so the model would see inputs built differently from its
# training data.
sample_rows = []
for sent in sample_sents:
    sent = sent.lower().replace("n't"," not")
    tokens = word_tokenize(sent)
    # remove numbers and punctuation
    tokens_a = [token for token in tokens if not token.isdigit() and token not in string.punctuation]
    # map words to ids; anything outside the vocab becomes UNK
    tokens_id = [lookup_with_unk.get(s, UNK_IND) for s in tokens_a]
    # truncate to sent_len, then right-pad short sentences with 0 (NULL)
    tokens_id = tokens_id[:sent_len]
    tokens_id.extend([0]*(sent_len-len(tokens_id)))
    sample_rows.append(tokens_id)

# One array built once instead of growing a matrix row by row with np.r_;
# reshape keeps the (0, sent_len) shape for an empty sample list.
X_sample = np.array(sample_rows, dtype=np.int32).reshape(-1, sent_len)

results = model.predict(X_sample)

# scores above 0.8 are reported as positive, below 0.2 as negative,
# anything in between as neutral
for text, result in zip(sample_sents, results):
    if result > 0.8:
        print("Posi", result, text)
    elif result < 0.2:
        print("neg", result, text)
    else:
        print("neutral", result, text)

neg [ 0.00744833] This is a joke
neg [ 0.00443599] The worst product I have ever bought
Posi [ 0.97385174] I like it, and I want to buy it again
neutral [ 0.52989006] I am going to give a presentation.
neutral [ 0.71282566] I am going to have an exam.
Posi [ 0.99185044] The new tea table looks amazing.
Posi [ 0.99524188] This is the best class ever!
Posi [ 0.99224257] My mom loves it.
Posi [ 0.99034542] nice weather
Posi [ 0.9875195] This is the best 30 bucks that I have ever spent
neg [ 0.00449477] Complete waste of money.


In [17]:
# Walk one sentence through the preprocessing steps, printing each stage.
t = "I DON'T like 5 it! *_*"
t = t.lower().replace("n't"," not")
print(t)
# t is already lower-case at this point
t = word_tokenize(t)
print(t)
# tokens that survive the number/punctuation filter
print([t1 for t1 in t if not (t1.isdigit() or t1 in string.punctuation)])
# NOTE(review): ids are looked up for the unfiltered token list here, whereas
# the data-loading loop looks up only the filtered tokens.
print([lookup_with_unk.get(s, UNK_IND) for s in t])

i do not like 5 it! *_*
['i', 'do', 'not', 'like', '5', 'it', '!', '*_*']
['i', 'do', 'not', 'like', 'it', '*_*']
[4503, 60, 15, 89, 295, 17, 1, 1]


In [18]:
# Display the version of the installed Keras package.
import keras
keras.__version__

'2.0.9'