In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import random

import numpy as np
from collections import Counter
# For bag of word model
!pip install gensim
import gensim
from gensim.models import Word2Vec

# For Cluster
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList 


def process_reviews_str(res: list) -> list:
   # merge all reviews
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    return producedDoc

def process_reviews_list(res: list) -> list:
    # store the processed doc in list of sentences 
    producedDoc = []
    for a in res:
        for b in a:
            # Pre-process documents        
            producedDoc.append(process_document(b))
    return producedDoc


def get_top50(producedDoc: list) -> list:
    '''Find the 50 most frequently occurred words'''
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words

def pseudowords(target_words: list, processed_corpus: list) -> list:
    # Construct a dic to store all the words' position
    position_list = []
    for sen in processed_corpus:
        pos_dic = {}
        for i, word in enumerate(sen):
            pos_dic[i] = word
        position_list.append(pos_dic)

    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    
    # Create reversed words for the sampled target words
    madeups = [word[::-1] for word in sample]
    
    # Replace the sampled reversed words in the target words
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
            
    # Get the position of all the sample words
    # sample position
    sPos = []
    for s in sample:
        # sentence position
        re_pos = []
        for i, sentence in enumerate(processed_corpus):
            # word position
            indices = [j for j, word in enumerate(sentence) if word == s]
            re_pos.append(indices)
        sPos.append(re_pos)
    
    # Randomly generate the index of the word to be replace
    re_corpus = processed_corpus[:]
    for s, sam in enumerate(sPos):
        for i, sent in enumerate(sam):
            if sent != []:
                # replace half of its occurence in corpus with their reversed words
                sSize = len(sent) // 2
                sIndex = random.sample(sent, sSize)
                for ind in sIndex:
                    re_corpus[i][ind] = madeups[s]
    
    return pseudowords, re_corpus

def featureVector(corpus_pos: list, sample: str) -> np.array:
    # Vectorize sentences
    example_sequence = []
  
    for word in sample:
        sequence = []
        for sentence in corpus_pos:
            word_counts = Counter(sentence)
            if word in sentence:
                count = word_counts[word]
            else:
                count = 0
            sequence.append(count)
        example_sequence.append(sequence)

    M = np.array(example_sequence)
    
    # svd
    u, s, v = np.linalg.svd(M)
    M = np.dot(u, np.diag(s))
    
    # normalisation
    M = M / np.linalg.norm(M, axis=1)[:, None]
    return M

def getClusterProb(X: np.array) -> int:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    km = KMeans(n_clusters=50).fit(X)
    
    labels = km.labels_
    labels.tolist()
    
    accuracy = (np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50])
    # print("target words label: " + str(labels[50:100]))
    # print("pesudo words label: " + str(labels[0:50]))
    # print("accuracy: " + str(accuracy))
        
    return accuracy

In [3]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"

# Reading documents
res = read_data(path)
processed_str = process_reviews_str(res)
processed_list = process_reviews_list(res)

target_words = get_top50(processed_str)
accuracy = []
for i in range(10):
    pseudo_words, new_corpus = pseudowords(target_words, processed_list)
    sample = target_words + pseudo_words
    M = featureVector(new_corpus, sample)
    prob = getClusterProb(M)
    accuracy.append(prob)
print("Accuracy: ", accuracy)
print("Mean: ", np.mean(accuracy))
print("Standard Deviation ", np.std(accuracy))

Accuracy:  [0.92, 0.92, 0.94, 0.9, 0.92, 0.94, 0.88, 0.92, 0.92, 0.86]
Mean:  0.9119999999999999
Standard Deviation  0.023999999999999997
