## Classification of Movie Reviews with a Simple Neural Network

In [107]:
import pickle
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(0)
tf.random.set_seed(0)

def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file (the notebook unpacks it
        as a ``(docs, labels)`` pair).

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the content is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The context manager guarantees the handle is closed even when
    # unpickling fails, instead of leaking it until garbage collection.
    with open(filename, 'rb') as file:
        return pickle.load(file)

def create_vocab(docs):
    """Flatten ``docs`` into one list holding every item of every document.

    Items keep their original order and duplicates are preserved.
    """
    return [item for doc in docs for item in doc]
    
def create_tokenizer(docs):
    """Return a Keras ``Tokenizer`` (default settings) fitted on ``docs``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(docs)
    return fitted

def shuffle_data(X, y):
    """Append ``y`` to ``X`` as an extra last column and shuffle the rows.

    Both inputs are converted to NumPy arrays first. Rows are shuffled in
    place with NumPy's global random generator, so every row keeps its own
    label.

    Returns:
        The combined, row-shuffled array (features first, label last).
    """
    features = np.array(X)
    labels = np.array(y)
    # Turn the labels into a column so they can be joined along axis 1.
    label_column = np.expand_dims(labels, axis=1)
    combined = np.concatenate((features, label_column), axis=1)
    np.random.shuffle(combined)
    return combined

def define_model(input_shape):
    """Build and compile the fully connected binary classifier.

    The network is an input of width ``input_shape`` followed by three
    ReLU ``Dense`` layers (256, 128 and 64 units), each followed by
    ``BatchNormalization``, and a single sigmoid output unit.

    Args:
        input_shape: number of input features per sample.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    layers = tf.keras.layers
    stack = [layers.Input(shape=(input_shape,))]
    # Hidden blocks are created in order, widest first.
    for units in (256, 128, 64):
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.BatchNormalization())
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = tf.keras.models.Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=["accuracy"])
    return model

def get_callbacks():
    """Create the callbacks passed to ``model.fit``.

    Returns:
        list: a ``LearningRateScheduler`` followed by an ``EarlyStopping``
        callback (monitoring ``val_accuracy``, patience 20,
        ``restore_best_weights=True``).
    """
    def hold_then_decay(epoch, lr):
        # Pass the rate through unchanged while epoch < 5; afterwards
        # return 90% of the rate that was passed in.
        return lr if epoch < 5 else lr*0.90

    stop_settings = dict(
        monitor='val_accuracy',
        min_delta=0,
        patience=20,
        mode='max',
        restore_best_weights=True,
    )
    callbacks = tf.keras.callbacks
    return [
        callbacks.LearningRateScheduler(schedule=hold_then_decay),
        callbacks.EarlyStopping(**stop_settings),
    ]


In [108]:
# Number of shuffled training rows used for fitting; the remaining rows are
# held out as the validation set.
N_TRAIN_ROWS = 1700

train_docs, train_labels = load_dataset('movie_reviews_train.pkl')
test_docs, test_labels = load_dataset('movie_reviews_test.pkl')

# NOTE(review): the tokenizer vocabulary is fitted on train AND test
# documents, so test-set words shape the feature space. Consider fitting on
# train_docs only if a strictly held-out estimate is required.
docs = train_docs+test_docs
tokenizer = create_tokenizer(docs)
X_train = tokenizer.texts_to_matrix(train_docs, mode='binary')
X_test = tokenizer.texts_to_matrix(test_docs, mode='binary')

# Shuffle data (each result holds the features with the label as last column)
train = shuffle_data(X_train, train_labels)
test = shuffle_data(X_test, test_labels)

# split train set in train and valid set
assert len(train) > N_TRAIN_ROWS, "validation split would be empty"
train, valid = train[:N_TRAIN_ROWS], train[N_TRAIN_ROWS:]
X_train, y_train = train[:, :-1], train[:, -1:]
X_valid, y_valid = valid[:, :-1], valid[:, -1:]
X_test, y_test = test[:, :-1], test[:, -1:]

# obtain the model and fit it
model = define_model(input_shape=X_train.shape[-1])
call_backs = get_callbacks()
model.fit(X_train, y_train, 
            epochs=100, 
            verbose=0, 
            validation_data=(X_valid, y_valid), 
            batch_size = 128, 
            callbacks=call_backs)
model.save('model_movie_review_simpleNN.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy: ', accuracy*100)


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 256)               11817472  
_________________________________________________________________
batch_normalization_29 (Batc (None, 256)               1024      
_________________________________________________________________
dense_40 (Dense)             (None, 128)               32896     
_________________________________________________________________
batch_normalization_30 (Batc (None, 128)               512       
_________________________________________________________________
dense_41 (Dense)             (None, 64)                8256      
_________________________________________________________________
batch_normalization_31 (Batc (None, 64)                256       
_________________________________________________________________
dense_42 (Dense)             (None, 1)               

In [141]:
import re
import string
from nltk.corpus import stopwords

def clean_doc(docs):
    """Tokenize a review and drop punctuation, non-alphabetic tokens and stop words.

    Args:
        docs: the raw review text as a single string.

    Returns:
        list of str: the surviving tokens in their original order. Case is
        left unchanged, and stop-word matching is case-sensitive.
    """
    # Split on any run of whitespace. Splitting on ' ' alone leaves newlines
    # and tabs glued to words (e.g. 'great\nmovie'), which isalpha() then
    # rejects, silently dropping words from multi-line reviews.
    tokens = docs.split()
    # remove (ASCII) punctuation characters from every token
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(strip_punctuation) for token in tokens]
    # keep purely alphabetic tokens: drops numbers, empty strings and tokens
    # still carrying other symbols (e.g. curly quotes)
    tokens = [token for token in tokens if token.isalpha()]
    # remove English stop words (NLTK list)
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def predict_sentiment(review, tokenizer, model, vocab):
    """Classify ``review`` as ``'POSITIVE'`` or ``'NEGATIVE'``.

    The review is cleaned with ``clean_doc``, restricted to tokens found in
    ``vocab``, encoded as a binary matrix by ``tokenizer`` and scored by
    ``model``; the score is rounded to pick the label.

    Args:
        review: raw review text.
        tokenizer: object providing ``texts_to_matrix``.
        model: object providing ``predict``.
        vocab: container of known words used to filter the tokens.

    Returns:
        str: ``'NEGATIVE'`` when the rounded score equals 0, else ``'POSITIVE'``.
    """
    known_words = ' '.join(token for token in clean_doc(review) if token in vocab)
    features = tokenizer.texts_to_matrix([known_words], mode='binary')
    score = model.predict(features, verbose=0)[0,0]
    return 'NEGATIVE' if round(score) == 0 else 'POSITIVE'



In [142]:

# Reload both corpora; the labels are discarded because only the documents
# are needed to build the vocabulary.
train_docs, _ = load_dataset('movie_reviews_train.pkl')
test_docs, _ = load_dataset('movie_reviews_test.pkl')
docs = train_docs + test_docs

# Every distinct item found in the documents (presumably word tokens).
vocab = {word for doc in docs for word in doc}

text = """It’s a piece on two of my favorite films of 2017, “Lady Bird” and
        “Call Me By Your Name”, and about how their very different modes of storytelling 
        speak to the different sorts of stories we tell ourselves. Objectively, I don’t know 
        if this is my best work in terms of pure style and craft, but I do think it’s the most 
        emblematic in terms of what I value in cinema. I think every film is, in some way, 
        a treatise on how certain memories are remembered, and I think cinema matters 
        partly because the best examples of it are prisms through which the human experience 
        is refracted.Above everything else, every movie has to begin with a good story, and 
        the greatest stories are the ones that mirror not just life, but the ways in which 
        life is distorted and restructured through the process of remembering. Every aspect 
        of a film, from its screenplay on down, must add something to the film’s portrayal 
        of remembering, and “Lady Bird” and “Call Me By Your Name” accomplish this organic 
        unity of theme with such charm yet in such distinct ways, that they were the perfect 
        counterpoints to each other, as well as the perfect stand-ins for cinema as a whole, 
        for me."""

# Score the review with the tokenizer and model created in the training cell.
sentiment = predict_sentiment(text, tokenizer, model, vocab)
print(sentiment, ' ')

POSITIVE  


In [143]:
# Second example review, scored with the same tokenizer, model and vocab.
text = """Judging by the movie's enduring popularity, the message that stupidity is 
        redemption is clearly what a lot of Americans want to hear."""
sentiment = predict_sentiment(text, tokenizer, model, vocab)
print(sentiment, ' ')

NEGATIVE  
