## Classification of Movie Reviews with an Embedding Layer and a Convolutional Layer

In [1]:
import pickle
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def load_dataset(filename):
    """Load a pickled object from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object stored in the file. The driver cell in this notebook
        unpacks the result as an ``(X, y)`` pair.

    Note:
        ``pickle.load`` can execute arbitrary code while unpickling, so only
        use this on files from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises; a bare
    # open() leaves the file open until the garbage collector reclaims it.
    with open(filename, 'rb') as file:
        return pickle.load(file)

def create_vocab(docs):
    """Flatten a collection of documents into a single list of their items.

    Every document is iterated in order and its elements are appended to the
    result, so repeated items are kept and the original ordering is preserved.
    """
    return [item for doc in docs for item in doc]
    
def create_tokenizer(docs):
    """Build a Keras ``Tokenizer`` with default settings and fit it on ``docs``.

    Returns:
        The fitted tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(docs)
    return fitted

def shuffle_data(X, y, seed=None):
    """Join features and labels column-wise and shuffle the rows together.

    Args:
        X: 2-D array-like of features, one row per sample.
        y: 1-D array-like of labels, one entry per row of ``X``.
        seed: Optional seed for a repeatable permutation. With the default
            ``None`` the rows are shuffled using NumPy's global random state,
            so existing two-argument callers behave as before.

    Returns:
        A 2-D ``numpy.ndarray`` in which the last column holds the labels and
        the preceding columns hold the features, with rows in shuffled order.
    """
    X, y = np.array(X), np.array(y)
    # Append the labels as the final column so that each label stays attached
    # to its own feature row while the rows are permuted.
    X_y = np.concatenate((X, y[:, np.newaxis]), axis=1)
    if seed is None:
        np.random.shuffle(X_y)
    else:
        # A private generator gives a repeatable order without disturbing the
        # global random state.
        np.random.RandomState(seed).shuffle(X_y)
    return X_y

def encode_pad_documents(tokenizer, max_length, docs):
    """Turn documents into integer sequences and pad them to a common length.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        docs: Documents to encode.

    Returns:
        The result of ``pad_sequences`` with padding added after each
        sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

def define_model(vocab_size, max_length):
    """Assemble and compile the embedding + 1-D convolution classifier.

    Layer stack, in order: a 100-dimensional embedding over ``vocab_size``
    indices with input length ``max_length``; a ReLU ``Conv1D`` with 32
    filters and kernel size 8; max pooling with pool size 2; flatten; a
    10-unit ReLU dense layer; and a single sigmoid output unit.

    Returns:
        The model compiled with binary cross-entropy loss, the Adam optimizer
        and an accuracy metric.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential()
    network.add(layers.Embedding(vocab_size, 100, input_length=max_length))
    network.add(layers.Conv1D(filters=32, kernel_size=8, activation='relu'))
    network.add(layers.MaxPooling1D(pool_size=2))
    network.add(layers.Flatten())
    network.add(layers.Dense(10, activation='relu'))
    network.add(layers.Dense(1, activation='sigmoid'))
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=["accuracy"],
    )
    return network


In [4]:
# Run configuration
SEED = 42       # seed for NumPy's and TensorFlow's global random generators
N_TRAIN = 1700  # shuffled training rows used for fitting; the remainder is the validation set

# Seed before any shuffling or weight initialisation so that the row order is
# repeatable and run-to-run variation in training is reduced.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# load the train and test sets
X_train, y_train = load_dataset('movie_reviews_train.pkl')
X_test, y_test = load_dataset('movie_reviews_test.pkl')
X = X_train + X_test

# Create tokenizer
# NOTE(review): the tokenizer and max_length are derived from the train AND
# test documents, so test-set vocabulary is visible during preprocessing.
# Confirm this is intended.
tokenizer = create_tokenizer(X)
# +1 because Keras word indices start at 1; index 0 is reserved (used here as padding).
vocab_size = len(tokenizer.word_index) + 1
# NOTE(review): assumes each doc is a token sequence so len(doc) is a token
# count; for raw strings this would be a character count. TODO confirm.
max_length = max(len(doc) for doc in X)

# encoding to padded document
X_train = encode_pad_documents(tokenizer, max_length, X_train)
X_test = encode_pad_documents(tokenizer, max_length, X_test)

# Shuffle data (labels travel as the last column of each row)
train = shuffle_data(X_train, y_train)
test = shuffle_data(X_test, y_test)

# split train set into train and valid set
train, valid = train[:N_TRAIN], train[N_TRAIN:]
# Separate the feature columns from the trailing label column again.
X_train, y_train = train[:, :-1], train[:, -1:]
X_valid, y_valid = valid[:, :-1], valid[:, -1:]
X_test, y_test = test[:, :-1], test[:, -1:]

# train model and evaluate
model = define_model(vocab_size, max_length)
model.fit(X_train, y_train, 
            epochs=10, 
            verbose=1, 
            validation_data=(X_valid, y_valid), 
            )
model.save('model_movie_review_embedding_cnn.h5')
# Prints the values returned by evaluate (loss followed by accuracy for this model).
print(model.evaluate(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.40335407853126526, 0.8700000047683716]
