In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/tony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The product review corpus contains reviews scored as positive and negative opinions. Pre-process
your text, prepare the review examples for training and evaluation. Implement, train and evaluate a
neural network that can classify an input review to either a positive or a negative class. You are
free to choose any neural network/deep learning technique taught in the Chapter “Deep Learning
for NLP”, e.g., multi-layer perceptron, LSTM, bi-directional LSTM, etc. You should design
appropriate experiments to evaluate your classifier’s classification accuracy based on 5-fold cross
validation (CV).

In [2]:
# Set up useful dictionary mappings
fileNames = {} # Dictionary to fetch file name based on docID (docID : fileName)
docTerms = {} # Dictionary to fetch terms of a doc based on fileName (fileName : [terms_list]) where terms_list includes duplicates
review_data = [] # Stores review samples

# Set up PlaintextCorpusReader object to read all txt files in product_reviews folder
data = None
path = "product_reviews"
data = PlaintextCorpusReader(path, '.*.txt')

# Initialise dictionaries
documents = data.fileids()
documents.remove("README.txt")
for i in range(0,len(documents)):
  docID = i+1
  file_name = documents[i]
  fileNames.update({docID : file_name})


Pre-processing the text

In [3]:
from nltk.stem import snowball
import regex as re
import string

def filter_words(words: list) -> list:
    """Clean a list of tokens

    Args:
     words: the list of words derived from a sentence

    Returns:
     list of cleaned tokens from the list of words
    """
    final_words = words
    # Use only lowercase characters
    final_words = [word.lower() for word in words]
    
    # Remove stopwords
    stop_words = stopwords.words('english')
    final_words = [word for word in final_words if word not in stop_words]
    
    # Remove punctation in between words
    final_words = [word.translate(str.maketrans('', '', string.punctuation)) for word in final_words]
    
    # Remove anything that isn't alphabetic
    final_words = [word for word in final_words if word.isalpha() == True]
    
    # Remove strings with length 1
    final_words = [word for word in final_words if len(word) > 1]

    # Stemming using Snowball Stemmer (Porter2)
    stemmer = snowball.SnowballStemmer('english')
    final_words = [stemmer.stem(word) for word in final_words]
    return final_words

def filter_review(review: str) -> str:
    """Clean a review by removing tags and keeping text only

    Args:
     review: the string review

    Returns:
     list of cleaned tokens from the review
    """
    # The comment of a review appears after the ## tag in each document
    try:
        splitting_index = review.index('##')
    except: # Some reviews aren't tagged with ## before the the review's comment so look directly after the rating tag
        splitting_index = review.rfind(']')
    comment = review[splitting_index+2:]
    return " ".join(filter_words(comment.split()))
    
    
def preprocess_document(document: str):
    """Preprocess a single document to classify a review as negative/positive

    Args:
     document: the raw string of the document
    """
    raw_text = data.raw(document)
    lines = raw_text.splitlines()

    for line in lines:
        res = re.search('\[(?:\+|\-)\d\]',line) # Search for any tags in the form [+n] or [-n] in the review
        if res is not None: 
            review = filter_review(line)
            # Generate tagged data based on the rating tag (class 1 = positive, class 0 = negative)
            if '+' in res.group():
                review_data.append([review,1])
            else:
                review_data.append([review,0])

# Pre-process each document
for doc in documents:
    preprocess_document(doc)
    

Preparing the review examples for training and evaluation

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np

# Prepare review examples
reviews = [review for review,_ in review_data]
labels = [label for _,label in review_data]

# Only used to observe the positive/negative sample split
good_reviews = [review for review in review_data if review[1]==1]
bad_reviews = [review for review in review_data if review[1]==0]

print("Sample number:",len(reviews))
print("Positive:",len(good_reviews))
print("Negative:",len(bad_reviews))



Sample number: 2090
Positive: 1344
Negative: 746


In [5]:
import scipy
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers
from keras import losses
from keras.preprocessing.sequence import pad_sequences
from collections import Counter
from sklearn.metrics import confusion_matrix, classification_report
from keras.utils.vis_utils import plot_model

def count_words(data):
    """Count the total number of unique words in the dataset

    Args:
     data: a list containing the tokenised reviews
     
    Returns:
     the total number of unique words in the dataset
    """
    total_counter = set()
    for sent in data:
        words = sent.split()
        for word in words:
            total_counter.add(word)
    return len(total_counter)
        

def model_and_evaluate(training_data,test_data,EPOCHS=8,BATCH_SIZE=32,VERBOSE=False,MODEL_TYPE='GRU',DIM_SIZE=32):
    """Train a model using given training data and evaluate it against the training data

    Args:
     training_data: a list containing the training data samples
     test_data: a list containing the test data samples
     EPOCHS: number of epochs to fit the model
     BATCH_SIZE: batch size to fit the model
     VERBOSE: flag to print out intermediate outputs of the model
     
    Returns:
     the model and the accuracy of the model
    """   
    
    # Setup train and test data alongside their labels
    train_reviews = [review for review,_ in training_data]
    train_labels = np.array([label for _,label in training_data],dtype=np.int8)
    test_reviews = [review for review,_ in test_data]
    test_labels = np.array([label for _,label in test_data],dtype=np.int8)
    
    # Update internal vocabulary based on reviews
    num_words = count_words(train_reviews)
    max_length = 20
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_reviews)
    
    # Transform text into sequence of integers based on vocabulary mapping
    train_seq = tokenizer.texts_to_sequences(train_reviews)
    test_seq = tokenizer.texts_to_sequences(test_reviews)
    
    # Pad sequences to standardise length of vectors
    train_padded = pad_sequences(train_seq,maxlen=max_length,padding="post",truncating="post")
    test_padded = pad_sequences(test_seq,maxlen=max_length,padding="post",truncating="post")
    
    model = None
    if MODEL_TYPE == 'LSTM':    
        # Create a model that has an Embedding and Bidirectional LSTM layer
        model = Sequential([
            layers.Embedding(num_words,DIM_SIZE,input_length=max_length),
            layers.Bidirectional(layers.LSTM(DIM_SIZE, recurrent_dropout=0.2)),
            layers.Dense(1,activation="sigmoid")
        ])
    else:
        # Create a model that has an Embedding and Bidirectional GRU layer
        model = Sequential([
            layers.Embedding(num_words,DIM_SIZE,input_length=max_length),
            layers.Bidirectional(layers.GRU(DIM_SIZE, recurrent_dropout=0.2)),
            layers.Dense(1,activation="sigmoid")
        ])       

    # Configure the model for training
    model.compile(loss="binary_crossentropy",optimizer='adam',metrics=["accuracy"])

    # Fit the model
    history = model.fit(
        train_padded,
        train_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=VERBOSE)
    
    # Evaluate the model
    loss, accuracy = model.evaluate(test_padded,test_labels)
    
    # Construct confusion matrix
    y_pred = model.predict(test_padded)
    result = classification_report(test_labels,np.rint(y_pred),target_names=['negative','positive'])
    print(result)
    plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    return model, accuracy

Using TensorFlow backend.


In [7]:
import random

# Shuffle the data
random.shuffle(review_data)

# Set up training and test data
index = int(len(review_data)*0.8)
training_data = review_data[:index]
test_data = review_data[index:]

model, accuracy = model_and_evaluate(training_data,test_data,VERBOSE=True,BATCH_SIZE=64,EPOCHS=12,DIM_SIZE=16)
print("Test accuracy:",accuracy)




Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
              precision    recall  f1-score   support

    negative       0.60      0.62      0.61       140
    positive       0.81      0.79      0.80       278

    accuracy                           0.73       418
   macro avg       0.70      0.70      0.70       418
weighted avg       0.74      0.73      0.73       418

Test accuracy: 0.7320574162679426


Designing and implementing experiments to evaluate classification accuracy based on 5-fold cross validation.

In [20]:
import random
random.shuffle(review_data)

k = 5

# Split dataset into 5 groups
groups = np.array_split(review_data, k) 

# Set up reviews and labels for those reviews
reviews = [review for review,_ in review_data]
labels = [label for _,label in review_data]

# 5-fold cross validation
accuracies = []
for i in range (0,k):
    # Select the i'th group to be the test data
    test_data = groups[i]
    # Set up the other groups to be the training data
    other_groups = list(range(0,k))
    other_groups.remove(i)
    training_data = [review for j in other_groups for review in groups[j]]
    # Model and evaluate
    model, accuracy = model_and_evaluate(training_data,test_data,BATCH_SIZE=64,EPOCHS=12,DIM_SIZE=16)
    accuracies.append(accuracy)
    
print("5-fold accuracy:",np.mean(accuracies))

              precision    recall  f1-score   support

    negative       0.63      0.66      0.64       147
    positive       0.81      0.79      0.80       271

    accuracy                           0.74       418
   macro avg       0.72      0.72      0.72       418
weighted avg       0.75      0.74      0.75       418

              precision    recall  f1-score   support

    negative       0.62      0.53      0.57       161
    positive       0.73      0.79      0.76       257

    accuracy                           0.69       418
   macro avg       0.67      0.66      0.66       418
weighted avg       0.69      0.69      0.69       418

              precision    recall  f1-score   support

    negative       0.68      0.70      0.69       155
    positive       0.82      0.81      0.81       263

    accuracy                           0.77       418
   macro avg       0.75      0.75      0.75       418
weighted avg       0.77      0.77      0.77       418

              preci