In [1]:
import os
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.initializers import Constant
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding

In [2]:
# Seed both NumPy (used for the train/validation shuffle) and TensorFlow (used
# for weight initialisation and batch shuffling), so the run is as repeatable
# as the backend allows. Seeding NumPy alone leaves the models unseeded.
print("Setting random seed")
np.random.seed(0)
tf.random.set_seed(0)

# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the documented replacement (tf.config.list_physical_devices needs TF >= 2.1).
if tf.config.list_physical_devices("GPU"):
    print("Using GPU..")
else:
    print("Using CPU..")

Setting random seed
Using GPU..


In [3]:
# Absolute locations of the GloVe vectors and of the IMDB train/test folders,
# resolved relative to the directory the notebook is started from.
GLOVE_DIR, TRAIN_DATA_DIR, TEST_DATA_DIR = map(
    os.path.abspath,
    (
        "./glove",
        "./aclImdb/train",
        "./aclImdb/test",
    ),
)

In [4]:
# Tunable settings shared by the preprocessing and model cells.
MAX_SEQUENCE_LENGTH = 1_000  # every review is padded/truncated to this many tokens
MAX_NUM_WORDS = 20_000       # vocabulary cap handed to the tokenizer
EMBEDDING_DIM = 100          # width of the GloVe vectors (glove.6B.100d)
VALIDATION_SPLIT = 0.2       # fraction of the training data held out for validation

In [5]:
def get_data(data_dir):
    """Read every review under ``data_dir/pos`` and ``data_dir/neg``.

    Sub-directories are visited in sorted name order (so ``neg`` before
    ``pos``) and the files inside each one in sorted name order. Any other
    entry in ``data_dir`` is ignored.

    Args:
        data_dir: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the UTF-8 decoded
        file contents, and the integer label of each (1 for ``pos``, 0 for
        ``neg``).
    """
    labels_index = {'pos': 1, 'neg': 0}
    texts = []
    labels = []
    for name in sorted(os.listdir(data_dir)):
        if name not in labels_index:
            continue
        path = os.path.join(data_dir, name)
        if not os.path.isdir(path):
            continue
        label_id = labels_index[name]
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            # Close each file as soon as it is read; the dataset has tens of
            # thousands of files and leaked handles can exhaust the OS limit.
            with open(fpath, encoding='utf8') as review_file:
                texts.append(review_file.read())
            labels.append(label_id)
    return texts, labels

# Class-name -> integer-id mapping kept at notebook scope; the model cells use
# its length to size the output layer.
labels_index = {'pos': 1, 'neg': 0}

# Load the training split first, then the test split.
(train_texts, train_labels), (test_texts, test_labels) = (
    get_data(split_dir) for split_dir in (TRAIN_DATA_DIR, TEST_DATA_DIR)
)

In [6]:
# Sanity check: show the first training review and the test review at index
# 24999, each followed by its integer label.
print(train_texts[0], train_labels[0], sep="\n")
print("------------")
print(test_texts[24999], test_labels[24999], sep="\n")

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.
0
------------
I've seen this story before but my kids haven't. Boy with troubled past joins military, faces his past, falls in love and becomes a man. The mentor this time is played perfectly by Kevin Costner; An ordinary man with common everyday problems who lives an extraordinary conviction, to save lives. After losing his team he takes a 

In [7]:
# Fit the vocabulary on the training reviews only, so nothing from the test
# set leaks into the word ranking.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)

# Convert both splits to integer sequences using that one vocabulary.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

# word_index lists every token seen while fitting, not only the ones that
# survive the num_words cap, so this count can exceed MAX_NUM_WORDS.
word_index = tokenizer.word_index
print(f"Number of unique tokens: {len(word_index)}")

Number of unique tokens: 88582


In [8]:
# Bring every review to exactly MAX_SEQUENCE_LENGTH token ids.
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer labels. The class count is passed explicitly so
# the encoding width always matches the model's output layer, even if a split
# happens to contain a single class.
# NOTE: test_labels is rebound from a list of ints to a one-hot array here, so
# re-running this cell alone would encode the already-encoded array again.
# Re-run the data-loading cell first.
trainvalid_labels = to_categorical(np.asarray(train_labels), num_classes=len(labels_index))
test_labels = to_categorical(np.asarray(test_labels), num_classes=len(labels_index))

# Shuffle before splitting: get_data returns all 'neg' reviews followed by all
# 'pos' reviews, so an unshuffled tail would contain a single class.
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)

trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])

# Slice with an explicit boundary rather than negative offsets. With zero
# validation samples, data[:-0] is empty and data[-0:] is the whole array,
# which would silently swap the two splits.
split_at = trainvalid_data.shape[0] - num_validation_samples

X_train = trainvalid_data[:split_at]
y_train = trainvalid_labels[:split_at]

x_val = trainvalid_data[split_at:]
y_val = trainvalid_labels[split_at:]

print("Splitting train data into train and validation is done")

Splitting train data into train and validation is done


In [9]:
print('Preparing embedding matrix')

# Parse the GloVe text file: each line is a token followed by its vector
# components, separated by whitespace.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))

# Row 0 is reserved for padding, hence the +1.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Copy the GloVe vector for every in-range token; tokens missing from GloVe
# keep their all-zero row.
for word, i in word_index.items():
    if i <= MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

print("Preparing of embedding matrix is done")

Preparing embedding matrix
Found 400000 word vectors in Glove embeddings.
Preparing of embedding matrix is done


In [10]:
print("Define a 1D CNN model")

# Frozen GloVe embeddings -> three Conv1D stages (the first two followed by
# local max-pooling, the last by global max-pooling) -> dense classifier.
cnnmodel = Sequential([
    embedding_layer,
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(5),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(5),
    Conv1D(128, 5, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(128, activation="relu"),
    # One softmax unit per class, matching the one-hot labels.
    Dense(len(labels_index), activation="softmax"),
])

cnnmodel.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["acc"],
)

cnnmodel.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_data=(x_val, y_val),
)

# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = cnnmodel.evaluate(test_data, test_labels)
print(f"Test accuracy with CNN: {acc}")

Define a 1D CNN model
Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Test accuracy with CNN: 0.8345999717712402


In [11]:
print("Define a 1D CNN model. Training word embeddings on the fly.")

# Same convolutional stack as the GloVe model, but the 128-dimensional
# embedding table is learned during training instead of loaded from GloVe.
cnnmodel = Sequential([
    Embedding(MAX_NUM_WORDS, 128),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(5),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(5),
    Conv1D(128, 5, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(128, activation="relu"),
    # One softmax unit per class, matching the one-hot labels.
    Dense(len(labels_index), activation="softmax"),
])

cnnmodel.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["acc"],
)

cnnmodel.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_data=(x_val, y_val),
)

# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = cnnmodel.evaluate(test_data, test_labels)
print(f"Test accuracy with CNN: {acc}")

Define a 1D CNN model. Training word embeddings on the fly.
Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Test accuracy with CNN: 0.8508800268173218
