## Movie Review Classification with multichannel model
A standard model for document classification uses an Embedding layer as input, followed by a one-dimensional convolutional neural network, a pooling layer, and then a prediction output layer. The kernel size in the convolutional layer is the number of words grouped together by each convolution step to produce a single feature value. A multi-channel convolutional neural network uses multiple versions of this standard model, mainly differing in kernel size. This approach lets the model process the document at several n-gram sizes at the same time.

In [34]:
import pickle
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path to a pickle file.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def create_vocab(docs):
    """Flatten ``docs`` into one list holding every item of every document.

    Items are kept in order and duplicates are not removed.
    """
    return [item for doc in docs for item in doc]
    
def create_tokenizer(docs):
    """Return a Keras ``Tokenizer`` (default settings) fitted on ``docs``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(docs)
    return fitted

def shuffle_data(X, y):
    """Shuffle samples and labels together, returning one combined array.

    ``y`` is appended to ``X`` as an extra last column so that each label
    stays attached to its row while the rows are shuffled.

    Args:
        X: 2-D array-like of samples, one row per sample.
        y: 1-D array-like of labels, one per row of ``X``.

    Returns:
        A new 2-D array whose last column holds the labels and whose rows
        are in random order. Features and labels share a single dtype
        because they live in the same array.
    """
    features = np.array(X)
    labels = np.array(y)
    # Turn the label vector into a single column so it can sit next to X.
    label_column = labels[:, np.newaxis]
    combined = np.concatenate((features, label_column), axis=1)
    # In-place shuffle along the first axis, i.e. whole rows are permuted.
    np.random.shuffle(combined)
    return combined

def encode_pad_documents(tokenizer,max_length, docs):
    """Encode ``docs`` as integer sequences and pad them to ``max_length``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (e.g. a fitted
            Keras ``Tokenizer``).
        max_length: Length passed to ``pad_sequences`` as ``maxlen``.
        docs: Documents to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

def _build_channel(vocab_size, length, kernel_size):
    """Build one Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten branch.

    Returns:
        A ``(input_tensor, flattened_output)`` pair for the branch.
    """
    inputs = keras.Input(shape=(length,))
    embedding = keras.layers.Embedding(vocab_size, 100)(inputs)
    conv = keras.layers.Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = keras.layers.Dropout(0.5)(conv)
    pool = keras.layers.MaxPooling1D(pool_size=2)(drop)
    flat = keras.layers.Flatten()(pool)
    return inputs, flat


def define_model(vocab_size, length, kernel_sizes=(4, 6, 8)):
    """Build and compile the multi-channel CNN binary classifier.

    One identical branch is created per entry of ``kernel_sizes``; the
    branches differ only in the Conv1D kernel size. Their flattened outputs
    are concatenated and fed to a small dense head ending in a single
    sigmoid unit.

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim`` of each
            Embedding layer).
        length: Length of each padded input sequence.
        kernel_sizes: Conv1D kernel size for each channel. The default
            reproduces the original three-channel model (4, 6 and 8).

    Returns:
        A compiled ``keras.models.Model`` that takes one input per kernel
        size (in the same order) and is compiled with binary cross-entropy
        loss, the Adam optimizer and the accuracy metric.
    """
    # Channels are built in order so layer creation order matches the
    # kernel_sizes sequence.
    channels = [
        _build_channel(vocab_size, length, kernel_size)
        for kernel_size in kernel_sizes
    ]
    channel_inputs = [channel_input for channel_input, _ in channels]
    channel_outputs = [channel_output for _, channel_output in channels]
    # merge
    merged = keras.layers.concatenate(channel_outputs)
    # FCN
    dense1 = keras.layers.Dense(10, activation='relu')(merged)
    outputs = keras.layers.Dense(1, activation='sigmoid')(dense1)
    model = keras.models.Model(inputs=channel_inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Let's load the train and test sets, which have already been cleaned.

In [37]:
# Load the train and test sets; each pickle unpacks into a (documents, labels) pair.
X_train, y_train = load_dataset('movie_reviews_train.pkl')
X_test, y_test = load_dataset('movie_reviews_test.pkl')
# Combined corpus, used below to fit the tokenizer and find the longest document.
# Assumes both sides are Python lists so that `+` concatenates them -- TODO confirm.
# NOTE(review): test documents therefore influence the vocabulary and the
# padding length -- confirm this is intended.
X = X_train + X_test

# Create the tokenizer from the combined corpus.
tokenizer = create_tokenizer(X)
# One more than the number of known words; presumably this leaves room for
# index 0, which is not assigned to any word -- verify against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1
# Longest document in the combined corpus.
# NOTE(review): len(doc) counts tokens only if documents are token sequences;
# for raw strings it would count characters -- confirm the pickle format.
max_length = max([len(doc) for doc in X])

# Encode documents as integer sequences padded to max_length.
X_train = encode_pad_documents(tokenizer, max_length, X_train)
X_test = encode_pad_documents(tokenizer, max_length, X_test)

# Shuffle each set. shuffle_data returns ONE array per set, with the labels
# appended as the last column.
train = shuffle_data(X_train, y_train)
test = shuffle_data(X_test, y_test)

# Split the shuffled train rows: first 1700 for training, the rest for validation.
# NOTE(review): 1700 is hard-coded; the validation set is empty if the train
# set has 1700 rows or fewer.
train, valid = train[:1700], train[1700:]
# Separate features (all but the last column) from labels (last column, kept
# 2-D by the `-1:` slice).
X_train, y_train = train[:, :-1], train[:, -1:]
X_valid, y_valid = valid[:, :-1], valid[:, -1:]
X_test, y_test = test[:, :-1], test[:, -1:]

# Build the model, then train and evaluate it.
model = define_model(vocab_size, max_length)
# Stop training once val_accuracy has not improved for 5 epochs and restore
# the weights from the best epoch.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                    min_delta=0,
                                                    patience=5,
                                                    mode='max',
                                                    restore_best_weights=True)
# The same matrix is passed three times because the model has three inputs,
# one per channel.
model.fit([X_train,X_train,X_train], y_train, 
            epochs=10, 
            verbose=1, 
            validation_data=([X_valid,X_valid,X_valid], y_valid),
            batch_size=16,
            callbacks = [earlystop_callback]
            )
# Save the trained model to an .h5 file.
model.save('model_movie_review_multichannel.h5')
# Prints the evaluation result on the test set: loss followed by accuracy
# (the metric set in define_model).
print(model.evaluate([X_test,X_test,X_test], y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.3751285672187805, 0.8600000143051147]
