In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import keras
import os
import re

Using TensorFlow backend.


In [2]:
RANDOM_SEED = 42
LOG_DIR = '/media/eigenstir/1TBSecondary/tbgraphs'

# Load & Read Data

In [3]:
os.listdir('data/aclImdb/')

['imdbEr.txt', 'train', 'imdb.vocab', 'test', 'README']

In [1]:
def load_directory_data(directory):
    data={}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

def load_dataset(directory):
    pos_df = load_directory_data(directory + "/pos")
    neg_df = load_directory_data(directory + "/neg")
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

In [5]:
train_df = load_dataset("data/aclImdb/train/")
test_df = load_dataset("data/aclImdb/test/")

In [6]:
X_train = train_df["sentence"]
y_train = train_df["sentiment"]

X_test = test_df["sentence"]
y_test = test_df["sentiment"]

In [7]:
X_train.head(), y_test.head()

(0    Alive<br /><br />Alive is a very entertaining ...
 1    The whole point of making this film, one of th...
 2    I managed to tape this off my satellite, but I...
 3    This movie is pretty cheesy, but I do give it ...
 4    A stuttering plot, uninteresting characters an...
 Name: sentence, dtype: object, 0    10
 1     8
 2    10
 3     1
 4     7
 Name: sentiment, dtype: object)

# One Hot Encode

In [18]:
from keras.utils import to_categorical

y_train = to_categorical(train_df["sentiment"])
y_test = to_categorical(test_df["sentiment"])

In [19]:
set(train_df['sentiment'].values)

{'1', '10', '2', '3', '4', '7', '8', '9'}

In [20]:
train_df["sentiment"][1]

'8'

In [21]:
y_train[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.], dtype=float32)

# Generate & Learn Word Embeddings

## Create Tokenizer with Keras API

In [22]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews)

In [23]:
max_length = max([len(s.split()) for s in total_reviews])

vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

In [24]:
X_train_tokens[0][:10]

[1125, 7, 7, 1125, 6, 3, 52, 427, 858, 860]

In [25]:
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding="post")
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding="post")

In [26]:
X_train_pad[0][:10]

array([1125,    7,    7, 1125,    6,    3,   52,  427,  858,  860],
      dtype=int32)

## Build Model

In [69]:
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate, Flatten
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.embeddings import Embedding

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, CuDNNGRU, CuDNNLSTM
# from tensorflow.keras.layers.embeddings import Embedding

In [70]:
vocab_size**(1/float(4))

18.825538582823786

In [71]:
# EMBEDDING_DIM = 20
EMBEDDING_DIM = 50

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(CuDNNLSTM(units=256, return_sequences=True)))
model.add(Bidirectional(CuDNNLSTM(units=256, return_sequences=True)))
model.add(Dense(1024, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Flatten())
model.add(Dense(11, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 2710, 50)          6280000   
_________________________________________________________________
spatial_dropout1d_18 (Spatia (None, 2710, 50)          0         
_________________________________________________________________
bidirectional_32 (Bidirectio (None, 2710, 512)         630784    
_________________________________________________________________
bidirectional_33 (Bidirectio (None, 2710, 512)         1576960   
_________________________________________________________________
dense_22 (Dense)             (None, 2710, 1024)        525312    
_________________________________________________________________
dense_23 (Dense)             (None, 2710, 1024)        1049600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 2775040)  

In [None]:
embedding_matrix = np.loadtxt('embedding_concat.txt', dtype=int)
words = Input(shape=(max_length,))
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
x = SpatialDropout1D(0.3)(x)
x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

hidden = concatenate([
    GlobalMaxPooling1D()(x),
    GlobalAveragePooling1D()(x),
])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
result = Dense(11, activation='sigmoid')(hidden)

model = Model(inputs=words, outputs=result)
model.compile(loss='binary_crossentropy', optimizer='adam')

# Train model

In [73]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

tbCallBack = keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
ls_sched = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))

In [74]:

model.fit(X_train_pad, y_train, batch_size=64, epochs=10, 
          validation_data=(X_test_pad, y_test), callbacks=[tbCallBack, es, mc])

Train on 25000 samples, validate on 25000 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.91456, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.91456 to 0.91829, saving model to best_model.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.91829
Epoch 00003: early stopping


<keras.callbacks.History at 0x7f15dcd74ef0>

# Load Trained Model

In [77]:
model.load_weights('best_model.h5')

In [78]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 2710, 50)          6280000   
_________________________________________________________________
spatial_dropout1d_18 (Spatia (None, 2710, 50)          0         
_________________________________________________________________
bidirectional_32 (Bidirectio (None, 2710, 512)         630784    
_________________________________________________________________
bidirectional_33 (Bidirectio (None, 2710, 512)         1576960   
_________________________________________________________________
dense_22 (Dense)             (None, 2710, 1024)        525312    
_________________________________________________________________
dense_23 (Dense)             (None, 2710, 1024)        1049600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 2775040)           0         
__________

In [None]:
model.predict()

# An Alternative - Word2Vec
Instead of training the embedding layer, we can first separetely learn word embeddings and then pass them onto the embedding layer

## Train word2vec embedding

In [None]:
import string 
from nltk.tokenize import word_tokenize
from mltk.corpus import stopwords

total_df = X_train + X_test

review_lines = list()
lines = total_df.values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    #convert to lower case
    tokens = [w.lower() for w in tokens]
    #remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    #remove non-alphabetic tokens
    words=[word for word in stripped if word is alpha()]
    #filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)

In [None]:
len(review_lines)

In [None]:
import gensim

model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, 
                               window=5, workers=4, min_count=1)
words = list(model.wv.vocab)
print("Vocabulary size: %d" % len(words))

## Test & save word2vec

In [None]:
model.wv.most_similar('horrible')

In [None]:
model.wv.save_word2vec_format('imdb_embedding_word2vec.txt', binary=False)