In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/tony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The product review corpus contains reviews scored as positive and negative opinions. Pre-process
your text, prepare the review examples for training and evaluation. Implement, train and evaluate a
neural network that can classify an input review to either a positive or a negative class. You are
free to choose any neural network/deep learning technique taught in the Chapter “Deep Learning
for NLP”, e.g., multi-layer perceptron, LSTM, bi-directional LSTM, etc. You should design
appropriate experiments to evaluate your classifier’s classification accuracy based on 5-fold cross
validation (CV).

In [2]:
# Set up useful dictionary mappings
fileNames = {} # Dictionary to fetch file name based on docID (docID : fileName)
docTerms = {} # Dictionary to fetch terms of a doc based on fileName (fileName : [terms_list]) where terms_list includes duplicates
termFreq = {} # Dictionary to fetch TF of a term
review_data = [] # Stores review samples

# Set up PlaintextCorpusReader object to read all txt files in product_reviews folder
data = None
path = "product_reviews"
data = PlaintextCorpusReader(path, '.*.txt')

# Initialise dictionaries
documents = data.fileids()
documents.remove("README.txt")
for i in range(0,len(documents)):
  docID = i+1
  file_name = documents[i]
  fileNames.update({docID : file_name})


Pre-processing the text

In [3]:
from nltk.stem import snowball
import regex as re
import string

def filter_words(words: list) -> list:
    """
    Returns a list of words that are:
    - lowercase
    - not stopwords
    - length > 1
    - have no punctation in each word
    - stemmed
    """
    # Use only lowercase characters
    lowercase_words = [word.lower() for word in words]
    
    # Remove stopwords
    stop_words = stopwords.words('english')
    no_stop_words = [word for word in lowercase_words if word not in stop_words]
    
    # Remove strings with length 1
    long_words = [word for word in no_stop_words if len(word) > 1]
    
    # Remove punctation in between words
    nopunct_words = [word.translate(str.maketrans('', '', string.punctuation)) for word in long_words]
    
    # Stemming using Snowball Stemmer (Porter2)
    stemmer = snowball.SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in nopunct_words]
    return stemmed_words

def filter_review(review: str) -> str:
    """
    Returns a string review that:
    - does not include the rating in the review
    - is lexically processed
    """
    # The comment of a review appears after the ## tag in each document
    try:
        splitting_index = review.index('##')
    except: # Some reviews aren't tagged with ## before the the review's comment so look directly after the rating tag
        splitting_index = review.rfind(']')
    comment = review[splitting_index+2:]
    return " ".join(filter_words(comment.split()))
    
    
def preprocess_document(document: str):
    raw_text = data.raw(document)
    lines = raw_text.splitlines()

    for line in lines:
        res = re.search('\[(?:\+|\-)\d\]',line) # Search for any tags in the form [+n] or [-n] in the review
        if res is not None: 
            review = filter_review(line)
            # Generate tagged data based on the rating tag (class 1 = positive, class 0 = negative)
            if '+' in res.group():
                review_data.append([review,1])
            else:
                review_data.append([review,0])

# Pre-process each doucment
for doc in documents:
    preprocess_document(doc)


Preparing the review examples for training and evaluation

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

cv = CountVectorizer()
reviews = [review for review,_ in review_data]

X = cv.fit_transform(reviews)
y = np.array([label for _,label in review_data])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1672, 3319)
(418, 3319)
(1672,)
(418,)


In [5]:
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from keras import losses

model = tf.keras.Sequential([
  layers.Embedding(X_train.shape[1], 16),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

epochs = 10
history = model.fit(
    X_train,
    y_train,
    epochs=10)


2022-11-23 21:52:59.990012: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-23 21:53:00.596651: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-23 21:53:00.596706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          53104     
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 53,121
Trainable params: 53,121
Non-trainable params: 0
____________________________________________________

2022-11-23 21:53:01.411219: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-23 21:53:01.442351: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-11-23 21:53:01.442377: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-23 21:53:01.442721: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neur

InvalidArgumentError: {{function_node __wrapped__SerializeManySparse_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[2] = [0,1960] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]