In [1]:
from zipfile import ZipFile
filename = "txt_sentoken.zip"
with ZipFile(filename, 'r') as zip:
  zip.extractall()
  print('Done')

Done


In [3]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

In [4]:
# load doc into memory
def load_doc(filename):
  # open the file as read only
  file = open(filename, 'r')
  # read all text
  text = file.read()
  # close the file
  file.close()
  return text

In [5]:
# turn a doc into clean tokens
def clean_doc(doc):
  # split into tokens by white space
  tokens = doc.split()
  # prepare regex for char filtering
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  # remove punctuation from each word
  tokens = [re_punc.sub('', w) for w in tokens]
  # remove remaining tokens that are not alphabetic
  tokens = [word for word in tokens if word.isalpha()]
  # filter out stop words
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in tokens if not w in stop_words]
  # filter out short tokens
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

In [6]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
  # load the doc
  doc = load_doc(filename)
  # clean doc
  tokens = clean_doc(doc)
  # filter by vocab
  tokens = [w for w in tokens if w in vocab]
  return ' '.join(tokens)

In [7]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
  lines = list()
  # walk through all files in the folder
  for filename in listdir(directory):
  # skip any reviews in the test set
    if is_train and filename.startswith('cv9'):
      continue
    if not is_train and not filename.startswith('cv9'):
      continue
    path = directory + '/' + filename
    # load and clean the doc
    line = doc_to_line(path, vocab)
    # add to list
    lines.append(line)
  return lines

In [8]:
# load and clean a dataset
def load_clean_dataset(vocab, is_train):
  # load documents
  neg = process_docs('txt_sentoken/neg', vocab, is_train)
  pos = process_docs('txt_sentoken/pos', vocab, is_train)
  docs = neg + pos
  # prepare labels
  labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
  return docs, labels

In [9]:
# fit a tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtrain.shape, Xtest.shape)

(1800, 14781) (200, 14781)


In [14]:
n_words = Xtest.shape[1]

In [15]:
n_words

14781

In [16]:
# define the model
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

def define_model(n_words):
  # define network
  model = Sequential()
  model.add(Dense(50, input_shape=(n_words,), activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  # compile network
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summarize defined model
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [17]:
model = define_model(n_words)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                739100    
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 739,151
Trainable params: 739,151
Non-trainable params: 0
_________________________________________________________________


In [19]:
# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

In [25]:
import numpy as np
ytrain = np.array(ytrain)
ytest = np.array(ytest)

In [27]:
# fit model
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
57/57 - 1s - loss: 0.6913 - accuracy: 0.6189 - 1s/epoch - 20ms/step
Epoch 2/10
57/57 - 0s - loss: 0.6803 - accuracy: 0.8311 - 472ms/epoch - 8ms/step
Epoch 3/10
57/57 - 0s - loss: 0.6601 - accuracy: 0.8206 - 468ms/epoch - 8ms/step
Epoch 4/10
57/57 - 0s - loss: 0.6292 - accuracy: 0.8967 - 465ms/epoch - 8ms/step
Epoch 5/10
57/57 - 0s - loss: 0.5879 - accuracy: 0.9306 - 482ms/epoch - 8ms/step
Epoch 6/10
57/57 - 0s - loss: 0.5452 - accuracy: 0.9206 - 490ms/epoch - 9ms/step
Epoch 7/10
57/57 - 0s - loss: 0.4965 - accuracy: 0.9394 - 481ms/epoch - 8ms/step
Epoch 8/10
57/57 - 1s - loss: 0.4506 - accuracy: 0.9489 - 519ms/epoch - 9ms/step
Epoch 9/10
57/57 - 0s - loss: 0.4068 - accuracy: 0.9511 - 481ms/epoch - 8ms/step
Epoch 10/10
57/57 - 0s - loss: 0.3673 - accuracy: 0.9578 - 468ms/epoch - 8ms/step


<keras.callbacks.History at 0x7ff12b7f2650>

In [28]:
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 88.000000


In [29]:
# Predict new review
# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, model):
  # clean
  tokens = clean_doc(review)
  # filter by vocab
  tokens = [w for w in tokens if w in vocab]
  # convert to line
  line = ' '.join(tokens)
  # encode
  encoded = tokenizer.texts_to_matrix([line], mode='binary')
  # predict sentiment
  yhat = model.predict(encoded, verbose=0)
  # retrieve predicted percentage and label
  percent_pos = yhat[0,0]
  if round(percent_pos) == 0:
    return (1-percent_pos), 'NEGATIVE'
  return percent_pos, 'POSITIVE'

In [30]:
# positive review
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# negative review
text = 'This is a bad movie, not recommend'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (99.999%)
Review: [This is a bad movie, not recommend]
Sentiment: NEGATIVE (99.971%)
