Notes:

Reducing MAX_WORDS from 20000 to 10000 hurt accuracy and led to overfitting with the weighted embedding LSTM benchmark.

In [17]:
import keras as k
import numpy as np
import pandas as pd
from keras.layers import GRU, Dense, Activation, Embedding, Input, Dropout, LSTM
from keras.layers import Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D, BatchNormalization
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, one_hot
from keras.utils import np_utils

In [2]:
import tensorflow as tf

In [3]:
# Read the tab-separated training and test files into DataFrames.
data, test = (
    pd.read_csv(path, sep='\t')
    for path in ('train.tsv', 'test.tsv')
)

In [5]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


Preprocess data

In [7]:
# Build the vocabulary and convert every training phrase into a list of token ids.
MAX_WORDS = 20000  # vocabulary cap handed to the Tokenizer

texts = data['Phrase'].values
tokenizer = Tokenizer(num_words=MAX_WORDS)
# The tokenizer is fitted on the train phrases first, then on the test phrases,
# so the word index covers both files.
for corpus in (texts, test['Phrase'].values):
    tokenizer.fit_on_texts(corpus)
# Only the training phrases are turned into sequences here.
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 17780 unique tokens.


In [9]:
# Pad every training sequence to the length of the longest one, and encode the
# Sentiment column as categorical targets.
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in sequences)
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = np_utils.to_categorical(data['Sentiment'].values)

In [10]:
# Encode the Sentiment column as categorical targets.
# NOTE(review): this repeats the last statement of the padding cell, so it is
# redundant when the notebook is run top to bottom.
y = np_utils.to_categorical(data['Sentiment'].values)

In [7]:
# Baseline model: embedding (no pretrained weights) -> GRU -> dropout -> 5-unit softmax.
GRU_UNITS = 128
EMBEDDING_DIM = 100
model = Sequential([
    # One row per word-index entry plus one, because index 0 is used as the mask value.
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    GRU(GRU_UNITS),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Train the current `model` for three epochs on the GPU, holding out 20% of the
# samples for validation.
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=3,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Thoughts:

Balance the data set by removing a large portion of the neutral reviews.

Use a pretrained embedding matrix.

In [None]:
# LSTM model: embedding (no pretrained weights) -> LSTM with dropout -> dropout -> 5-unit softmax.
# EMBEDDING_DIM is not assigned in this cell; it must already exist in the kernel.
GRU_UNITS = 256  # name kept from the GRU cells; here it is the LSTM width

model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    LSTM(GRU_UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=3,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Force a full garbage-collection pass; as the cell's last expression, the
# return value of gc.collect() is displayed.
import gc
gc.collect()

In [11]:
# Load the GloVe vectors into a {word: float32 vector} lookup.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are its coefficients.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
# Build the weight matrix for the Embedding layer: row i holds the GloVe vector
# of the word whose tokenizer index is i.
EMBEDDING_DIM = 100
matrix_shape = (len(word_index) + 1, EMBEDDING_DIM)
embedding_matrix = np.zeros(matrix_shape)
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words missing from the GloVe lookup keep their all-zero row.
        continue
    embedding_matrix[i] = embedding_vector

In [14]:
# LSTM model whose embedding layer is initialised from embedding_matrix and
# left trainable (no trainable=False), trained for two epochs.
GRU_UNITS = 256  # name kept from the GRU cells; here it is the LSTM width
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    LSTM(GRU_UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=2,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
# LSTM model whose embedding layer is initialised from embedding_matrix and
# frozen (trainable=False), trained for ten epochs.
GRU_UNITS = 256  # name kept from the GRU cells; here it is the LSTM width
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True,
              trainable=False),
    LSTM(GRU_UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# There may be some training left to do here; try more epochs.

In [16]:
# GRU model whose embedding layer is initialised from embedding_matrix and
# left trainable, trained for three epochs.
GRU_UNITS = 256
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    GRU(GRU_UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=3,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
# Full preprocessing pipeline in a single cell, so a vocabulary-size experiment
# only needs MAX_WORDS changed here (lower it to try with fewer words).
MAX_WORDS = 20000
EMBEDDING_DIM = 100

# Tokenize: the vocabulary is fitted on the train and test phrases, while the
# sequences are built for the training phrases only.
texts = data['Phrase'].values
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
tokenizer.fit_on_texts(test['Phrase'].values)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Load the GloVe vectors into a {word: float32 vector} lookup. Each line is
# split on whitespace: first field is the word, the rest are its coefficients.
embeddings_index = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


# Row i of the matrix holds the GloVe vector of the word with tokenizer index i.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

#Pad the sequences to the max sequence length
MAX_SEQUENCE_LENGTH = max([len(i) for i in sequences])
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = np_utils.to_categorical(data['Sentiment'].values)

Found 17780 unique tokens.
Found 400000 word vectors.


In [None]:
#TODO try decay

In [20]:
# CNN variant: embedding initialised from embedding_matrix -> Conv1D(256, 3) ->
# batch normalisation -> Conv1D(256, 3) -> global max pooling -> dense head.
EMBEDDING_DIM = 100
model = Sequential()
# mask_zero is not passed to this Embedding layer, unlike in the recurrent models.
model.add(Embedding(len(word_index) + 1,EMBEDDING_DIM,weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(256,3,activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(256,3,activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.2))
# softmax rather than sigmoid: the targets are one-hot over five mutually
# exclusive classes and the loss is categorical_crossentropy, so the output
# should be a single probability distribution (as in the other models here).
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(x, y, batch_size=32, epochs=10, verbose=1, validation_split=0.2)

Train on 124848 samples, validate on 31212 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# BEST MODEL
# LSTM over an embedding initialised from embedding_matrix (left trainable),
# trained for two epochs.
UNITS = 256
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH, mask_zero=True),
    LSTM(UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(5, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
with tf.device('/gpu:0'):
    model.fit(
        x, y,
        epochs=2,
        batch_size=32,
        validation_split=0.2,
        verbose=1,
    )

Train on 124848 samples, validate on 31212 samples
Epoch 1/2

In [None]:
# Score the test set with whichever `model` is currently in the kernel.
# The test phrases go through the already-fitted tokenizer and are padded to
# the same length as the training input.
test_phrases = test['Phrase'].values
sequences_test = tokenizer.texts_to_sequences(test_phrases)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
with tf.device('/gpu:0'):
    out = model.predict(x_test)