In [2]:
from keras.layers import Dot, Embedding,Activation, Input, Reshape, Flatten
from keras.layers import LSTM, Dense, Dropout
from keras.models i mport Model
import numpy as np
from nltk import word_tokenize

# Simple sentiment analysis

Sentiment analysis is a supervised task: a text (e.g. sentence, tweet) is labeled with a scalar score.

In [2]:
# got this data from a U Mich sentiment analysis kaggle competition
sent_fn = "training.txt"
#format is 1 (positive) or 0 (negative), tab, sentence
with open(sent_fn, 'r') as sent_file:
    print(sent_file.readline())

1	The Da Vinci Code book is just awesome.



In [3]:
# the data set is small - read it all
labels = []
sents = []
with open(sent_fn, 'r') as sent_file:
    for line in sent_file:
        label,sent = line.strip().split('\t')
        labels.append(int(label))
        sents.append(sent)
        
print(labels[:3])
print(sents[:3])
print(len(labels))

[1, 1, 1]
['The Da Vinci Code book is just awesome.', "this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.", 'i liked the Da Vinci Code a lot.']
7086


In [4]:
sent_lens = [len(sent.split()) for sent in sents]
# we'll take the first 20 tokens because most of our sentences are length 20 or less
sent_len = 20
sum([l<=sent_len for l in sent_lens])/float(len(sent_lens))

0.8718600056449337

# Start with the semantic space from our pre-trained, word2vec vectors

"Transfer learning" means we can take the information we learned from lots of language on the language modeling task, and apply it to this task where data is more scarse.

In [5]:
#load our word2vec vocabulary and numpy array

vocab_fn = "GoogleNews_100K_vocab.txt"
with open(vocab_fn, 'r') as vfn:
    index2word = vfn.read().split('\n')
print(len(index2word),"words in vocab")

mat_fn = "GoogleNews_100K.npy"
embedding_mat = np.load(mat_fn)
print(embedding_mat.shape,"embedding matrix")

100000 words in vocab
(100000, 300) embedding matrix


In [6]:
#add NULL (0) and UNK to our vocab
lookup_with_unk = {word:i+2 for i,word in enumerate(index2word)}
UNK_IND = 1

#add null and UNK vectors to our embedding matrix so it still lines up
embeddings_with_unk = np.zeros((embedding_mat.shape[0]+2, embedding_mat.shape[1]))
embeddings_with_unk[2:] = embedding_mat

# Convert our task dataset into numpy arrays

In [7]:
# X holds our texts, converted to vocabulary indexes
X_matrix = np.zeros((len(sents), sent_len), dtype=np.int32)
for i,sent in enumerate(sents):
    sent_tokens = word_tokenize(sent.strip().lower())
    sent_inds = [lookup_with_unk[s] if s in lookup_with_unk else UNK_IND for s in sent_tokens]
    sent_inds = sent_inds[:sent_len] #truncate if necessary
    X_matrix[i, :len(sent_inds)] = sent_inds

In [8]:
# y is our targets - here 0 and 1
y = np.asarray(labels)

#shuffle both
rng_state = np.random.get_state()
np.random.shuffle(X_matrix)
np.random.set_state(rng_state)
np.random.shuffle(y)

print(y)

[0 0 0 ... 0 1 0]


In [9]:
# build a simple recurrent network with keras

hidden_size = 16 # tune this based on performance - it's pretty small
vocab_size,embed_size = embeddings_with_unk.shape

#simplest possible model

sent_in = Input((None,), dtype="int32", name="sent_in")
# load the weights into the model
embed_layer = Embedding(vocab_size, embed_size, name="word_vec", weights=[embeddings_with_unk,])
sent_embeddings = embed_layer(sent_in)

# mask out some of the data during training to make the task harder
sent_embeddings = Dropout(0.5)(sent_embeddings)

# compose the sequence of words using a recurrent layer
sent_avg = LSTM(32)(sent_embeddings)

#add a fully-connected layer - in practice, we would want to see whether this actually improves score or not
hidden_repr = Dense(hidden_size, activation="tanh")(sent_avg)

pred = Dense(1, activation="sigmoid")(hidden_repr)
sentiment_model = Model(inputs=[sent_in], outputs=[pred,])
sentiment_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sent_in (InputLayer)         (None, None)              0         
_________________________________________________________________
word_vec (Embedding)         (None, None, 300)         30000600  
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 300)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                42624     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 30,043,769
Trainable params: 30,043,769
Non-trainable params: 0
________________________________________________________________

In [10]:
sentiment_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy",])

In [11]:
sentiment_model.fit(X_matrix,y, epochs=2, validation_split=0.2)

Train on 5668 samples, validate on 1418 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1a503a58d0>

In [12]:
# decode model - create the X
sample_sents = ["I love my dog", "I'm sick of eating baby carrots", "What's the weather tomorrow"]
sample_X = np.zeros((len(sample_sents), sent_len), dtype=np.int32)
for i,sent in enumerate(sample_sents):
    sent_tokens = sent.strip().lower().split() #lazy tokenization
    sent_inds = [lookup_with_unk[s] if s in lookup_with_unk else UNK_IND for s in sent_tokens]
    sent_inds = sent_inds[:sent_len] #truncate if necessary
    sample_X[i, :len(sent_inds)] = sent_inds

In [13]:
# predict the y - how'd we do?
predictions = sentiment_model.predict(sample_X)

for i, sample in enumerate(sample_sents):
    print("{:.03}\t{}".format(predictions[i][0], sample))

0.996	I love my dog
0.00424	I'm sick of eating baby carrots
0.99	What's the weather tomorrow
