## Sentiment Analysis with RNN

In [3]:
import tensorflow as tf
from tensorflow import keras
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [4]:
def load_data(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object, exactly as it was stored.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def create_tokenizer(texts):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``texts``.

    Args:
        texts: The documents used to build the tokenizer's vocabulary.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(texts)
    return fitted

def encode_texts(tokenizer, max_length, texts):
    """Convert ``texts`` to integer sequences padded to a common length.

    Args:
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Length passed to ``pad_sequences`` as ``maxlen``.
        texts: The documents to encode.

    Returns:
        The result of ``pad_sequences``: the encoded sequences, padded at
        the end (``padding='post'``) up to ``max_length``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_length,
        padding='post',
    )

def shuffle_data(X, y, seed=None):
    """Join features and labels into one array and shuffle its rows.

    Keeping the labels as extra trailing column(s) of the same array
    guarantees that each row stays paired with its label while shuffling.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        y: Labels, either 1-D of length n_samples or 2-D with n_samples rows.
        seed: Optional seed. When given, a dedicated NumPy generator is used
            so the shuffle is reproducible and leaves the global RNG alone.
            When ``None`` (default) the global ``np.random`` state is used,
            as before.

    Returns:
        A new array of shape (n_samples, n_features + label_columns) with
        the rows in shuffled order and the labels in the last column(s).
        Because both parts live in one array they share a single dtype.

    Raises:
        ValueError: If ``X`` and ``y`` cannot be concatenated column-wise
            (raised by ``np.concatenate``).
    """
    X, y = np.asarray(X), np.asarray(y)
    # Accept both flat label vectors and labels already shaped as columns.
    if y.ndim == 1:
        y = y[:, np.newaxis]
    X_y = np.concatenate((X, y), axis=1)
    if seed is None:
        np.random.shuffle(X_y)
    else:
        np.random.default_rng(seed).shuffle(X_y)
    return X_y

def define_model(vocab_size, embedding_dim=100, lstm_units=20, dropout=0.3):
    """Build the (uncompiled) stacked-LSTM binary classifier.

    Architecture: Embedding (with zero-masking) -> LSTM returning the full
    sequence -> LSTM returning its last output -> single sigmoid unit.

    Args:
        vocab_size: Vocabulary size used to size the embedding table.
        embedding_dim: Width of the embedding vectors.
        lstm_units: Number of units in each LSTM layer.
        dropout: Input dropout rate applied in both LSTM layers.

    Returns:
        A ``keras.models.Sequential`` model; the caller compiles and fits it.
    """
    model = keras.models.Sequential([
        # mask_zero=True makes the downstream layers skip the zero padding.
        # NOTE(review): the +100 slack on top of vocab_size is kept from the
        # original; its purpose is not evident here - confirm it is needed.
        keras.layers.Embedding(input_dim=vocab_size+100,
                               output_dim=embedding_dim,
                               mask_zero=True),
        # No input_shape here: this is not the first layer, so the LSTM
        # takes its input shape from the embedding output.
        keras.layers.LSTM(lstm_units, return_sequences=True, dropout=dropout),
        keras.layers.LSTM(lstm_units, dropout=dropout),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [5]:
# Seed the global RNGs so the shuffle (and therefore the train/validation
# split) is repeatable on Restart & Run All; TensorFlow's global seed is set
# as well for its random ops.
SEED = 42
N_TRAIN = 1700  # rows kept for training; the remaining rows are validation
np.random.seed(SEED)
tf.random.set_seed(SEED)

X_train, y_train = load_data('dataset/movie_reviews_train.pkl')
X_test, y_test = load_data('dataset/movie_reviews_test.pkl')
X = X_train+X_test

# NOTE(review): the tokenizer is fit on train + test together, so the test
# vocabulary is visible during preprocessing - confirm this is intended.
tokenizer = create_tokenizer(X)
vocab_size = len(tokenizer.word_index)+1
# Measure the longest document in tokens, not via len(doc): for plain-string
# documents len(doc) counts characters and would pad far beyond what is needed.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(X))

X_train_encoded = encode_texts(tokenizer, max_length, X_train)
X_test_encoded = encode_texts(tokenizer, max_length, X_test)

# Shuffle data; shuffle_data returns features and labels in a single array
# with the label stored in the last column.
X_train_encoded = shuffle_data(X_train_encoded, y_train)

# split train set into train and valid set
train, valid = X_train_encoded[:N_TRAIN], X_train_encoded[N_TRAIN:]
# Slice with -1: (not -1) so the labels stay 2-D, shape (n, 1).
X_train, y_train = train[:, :-1], train[:, -1:]
X_valid, y_valid = valid[:, :-1], valid[:, -1:]

model = define_model(vocab_size)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10,
                    validation_data=(X_valid, y_valid),
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Score the trained model on the held-out test set; the displayed list holds
# the loss followed by the metrics configured at compile time.
model.evaluate(x=X_test_encoded, y=np.array(y_test))



[0.9177277088165283, 0.8199999928474426]

In [9]:
# Persist the trained model to an HDF5 file in the working directory.
# NOTE(review): only the model is saved here; the fitted tokenizer is not
# written anywhere in this notebook - confirm it is persisted elsewhere if
# the model is to be reused on new text.
model.save(filepath='sentiment_analysis_RNN.h5')