In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# For file reading
import os
from os import listdir
# For pre-processing 
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rubyli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rubyli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Task 1 - Step 1

In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

In [3]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
res = read_data(path)
doc = ""
for a in res:
    for b in a:
        doc += b
print(res[0][:5])
# print(doc)

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n', "Diaper Champ[+2]##So far (3 weeks), we've had no problems with the Diaper Champ at all.\n"]


In [4]:
def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\w+(?:[-']\w+)*   #word-internal hyphens
                    |'
                    |\[
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(document, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        tokenList = list(filter(lambda word: word not in stopwords.words("english"),tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList  

In [5]:
producedDoc = process_document(doc)
producedDoc[:20]

['work',
 'work',
 'great',
 'odor',
 'us',
 'regular',
 'bag',
 "can't",
 'complain',
 'diaper',
 'champ',
 'far',
 '3',
 'week',
 "we've",
 'problem',
 'diaper',
 'champ',
 'diaper',
 'contains']

In [6]:
# Find the 50 most frequently occurred words
# Get the frequency of each word
word_frequencies = FreqDist(producedDoc)
word_frequencies

FreqDist({'2': 467, '1': 355, 'u': 347, 'phone': 343, 'use': 339, 'one': 331, 'router': 330, 'ipod': 320, 'player': 309, 'camera': 308, ...})

In [7]:
# Sort the dictionary by frequency
sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)

# Select the top 50 words
target_words = [item[0] for item in sorted_frequencies[:50]]
print(target_words)

['2', '1', 'u', 'phone', 'use', 'one', 'router', 'ipod', 'player', 'camera', '3', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'problem', 'feature', 'good', 'zen', 'quality', 'would', 'also', 'sound', 'computer', 'software', 'really', "i've", 'picture', 'micro', 'well', 'take', 'thing', 'even', 'easy', "i'm", 'need', 'used', 'first', 'bag', 'much', 'want', 'champ', 'better', 'p', 'look', 'creative']


# Task 1 - Step 2

In [8]:
import random

# Sample half of the target words
sample_size = len(target_words) // 2
sample = random.sample(target_words, sample_size)
# Create pseudowords for the sampled target words
madeups = [word[::-1] for word in sample]
# Replace the sampled occurrences of the target words with their pseudowords
pseudowords = target_words
for i, word in enumerate(target_words):
    if word in sample:
        # Find the index of the word to be replace
        replacement_index = sample.index(word)
        # Replace the word with the corresponding word from the replacement list
        pseudowords[i] = madeups[replacement_index]

In [9]:
print(pseudowords)

['2', '1', 'u', 'enohp', 'esu', 'eno', 'retuor', 'ipod', 'reyalp', 'aremac', '3', 'teg', 'yrettab', 'diaper', 'tcudorp', 'work', 'ekil', 'taerg', 'time', 'melborp', 'feature', 'doog', 'zen', 'quality', 'would', 'also', 'sound', 'retupmoc', 'software', 'really', "i've", 'picture', 'micro', 'well', 'ekat', 'gniht', 'even', 'ysae', "m'i", 'need', 'used', 'tsrif', 'bag', 'hcum', 'tnaw', 'champ', 'retteb', 'p', 'look', 'evitaerc']


# Task 1 - Step 3

In [10]:
# Create a corpus containing the target words and pseudowords
corpus = target_words + pseudowords
print(len(corpus))

100


In [11]:
def word_count(str):
    counts = dict()
    words = str.split()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

In [12]:
# Get all the sentences in all documents
sentences = []
for doc in res:
    for sen in doc:
        sentences.append(sen)

print(sentences[:4])

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n']


In [13]:
# Loop through each sentence and count the frequency of each word
from collections import Counter

sen_count = []
for i, sentence in enumerate(sentences):
    counts = dict()
    words = process_document(sentence)
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    sen_count.append(counts)
print(sen_count[:10])

[{}, {'work': 2, 'great': 1, 'odor': 1, 'us': 1, 'regular': 1, 'bag': 1}, {"can't": 1, 'complain': 1}, {}, {'diaper': 2, 'champ': 2, 'far': 1, '3': 1, 'week': 1, "we've": 1, 'problem': 1}, {'diaper': 2, 'contains': 1, 'smell': 1, "baby's": 1, 'use': 1, 'kind': 1, 'bag': 1, 'inside': 1}, {'also': 1, 'sprinkled': 1, 'baking': 2, 'soda': 2, 'bottom': 1, 'diaper': 1, 'champ': 1, 'help': 1, 'absorb': 1, 'odor': 1, 'every': 1, 'awhile': 1, 'empty': 1, 'old': 1, 'replace': 1}, {'odor': 2, ',refills': 1, 'know': 1, "we'll": 1, 'run': 1, 'trouble': 1, 'road': 1, 'far': 1, 'complaint': 1, "i'm": 1, 'happy': 1, 'buy': 1, 'refill': 1}, {}, {'started': 1, 'diaper': 1, 'genie': 1, 'new': 1, 'parent': 1}]


In [14]:
import numpy as np

# Construct Nxd array based on the word-sentence frequency
pre_M = []
for word in corpus:
    word_freq = []
    for sentence in sen_count:
        tempfreq = sentence.get(word)
        if tempfreq != None:
            word_freq.append(tempfreq)
        else:
            word_freq.append(0)
    pre_M.append(word_freq)
M = np.array(pre_M)
print(M[0][30:50])

[1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]


In [15]:
print(M.shape)

(100, 4584)


In [16]:
print(type(M))

<class 'numpy.ndarray'>


In [18]:
def featureVector(target_words: list, pseudowords: list, res: list) -> np.array:
    
    # Create a corpus containing the target words and pseudowords
    corpus = target_words + pseudowords
    
    # Get all the sentences in all documents
    sentences = []
    for doc in res:
        for sen in doc:
            sentences.append(sen)
    
    # Loop through each sentence and count the frequency of each word
    sen_count = []
    for i, sentence in enumerate(sentences):
        counts = dict()
        words = process_document(sentence)
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        sen_count.append(counts)
        
    # Construct Nxd array based on the word-sentence frequency
    pre_M = []
    for word in corpus:
        word_freq = []
        for sentence in sen_count:
            tempfreq = sentence.get(word)
            if tempfreq != None:
                word_freq.append(tempfreq)
            else:
                word_freq.append(0)
        pre_M.append(word_freq)
    M = np.array(pre_M)
    
    return M