In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import random

import numpy as np
from collections import Counter
# For bag of word model
!pip install gensim
import gensim
from gensim.models import Word2Vec

# For Cluster
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




# Task 1 - Step 1 - Pre-processing words and get top 50 frequent words

In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

In [3]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
res = read_data(path)
doc = ""
for a in res:
    for b in a:
        doc += b
print(res[0][:5])
# print(doc)

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n', "Diaper Champ[+2]##So far (3 weeks), we've had no problems with the Diaper Champ at all.\n"]


In [4]:
from nltk import word_tokenize
def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList

def process_reviews_str(res: list) -> list:
   # merge all reviews
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    return producedDoc

def process_reviews_list(res: list) -> list:
    # merge all reviews
    producedDoc = []
    for a in res:
        for b in a:
            # Pre-process documents        
            producedDoc.append(process_document(b))
    return producedDoc

def get_top50(producedDoc: list) -> list:
    # Find the 50 most frequently occurred words
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words

# Task 1 - Step 2 - Sample and Pseudowords

In [17]:
import random

def pseudowords(target_words: list, processed_corpus: list) -> list:
    # Construct a dic to store all the words' position
    position_list = []
    for sen in processed_corpus:
        pos_dic = {}
        for i, word in enumerate(sen):
            pos_dic[i] = word
        position_list.append(pos_dic)
        # print(position_list)

    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    
    # Create pseudowords for the sampled target words
    madeups = [word[::-1] for word in sample]
    
    # Replace the sampled reversed words in the target words
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
            
    # Get the position of all the sample words
    # sample position
    sPos = []
    for s in sample:
        # sentence position
        re_pos = []
        for i, sentence in enumerate(processed_corpus):
            # word position
            indices = [j for j, word in enumerate(sentence) if word == s]
            # inSen = []
            # for j, word in enumerate(sentence):
            #     if word == s:
            #         inSen.append(j)
            # re_pos.append(inSen)
            re_pos.append(indices)
        sPos.append(re_pos)
    
    # print("sPos(sample):\n ",len(sPos))
    # print("re_pos(sentence):\n ",len(sPos[1]))
    # print("inSen(word):\n ",len(sPos[1][1]))

    
    # Randomly generate the index of the word to be replace
    re_corpus = processed_corpus[:]
    for s, sam in enumerate(sPos):
        for i, sent in enumerate(sam):
            if sent != []:
                # replace half of its occurence in corpus with their reversed words
                sSize = len(sent) // 2
                sIndex = random.sample(sent, sSize)
                for ind in sIndex:
                    re_corpus[i][ind] = madeups[s]
    
    return pseudowords, re_corpus

In [19]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"

# Reading documents
res = read_data(path)
processed_str = process_reviews_str(res)
processed_list = process_reviews_list(res)

target_words = get_top50(processed_str)
pseudo_words, new_corpus = pseudowords(target_words, processed_list)

# print("Target Words: \n",target_words)
# print("Pseudo Words: \n",pseudo_words)
# print("Processed List: \n", processed_list)
print("New corpus: \n", new_corpus)

# sample = target_words + pseudo_words

# print(new_corpus.index("use"))

New corpus: 


In [16]:
# pseudo_words, new_corpus = pseudowords(target_words, processed_list)

In [None]:
# print(processed_list[:100])

# Task 1 - Step 3 - Feature Vector

In [23]:
def featureVector(corpus_pos: list, sample: str) -> np.array:
    # Vectorize sentences
    example_sequence = []
  
    for word in sample:
        sequence = []
        for sentence in corpus_pos:
            word_counts = Counter(sentence)
            if word in sentence:
                count = word_counts[word]
            else:
                count = 0
            sequence.append(count)
        example_sequence.append(sequence)

    M = np.array(example_sequence)  
    print(M.shape)
    
    # svd
    # u, s, v = np.linalg.svd(positive_skip_grams)
    u, s, v = np.linalg.svd(M)
    M = np.dot(u, np.diag(s))
    
    # norm
    M = M / np.linalg.norm(M, axis=1)[:, None]
    return M

# def featureVector(processed_list: list, sample: str) -> np.array:
#     pList = []
#     for sentence in processed_list:
#         for word in sentence:
#             pList.append(word)

#     print(pList[200:500])
#     sg_model = Word2Vec(pList, min_count = 1, vector_size = 100, window = 5, sg = 1) 
#     # cbow_model = Word2Vec(processed_list, min_count = 1, vector_size = 100, window = 5)
    
#     M = []
#     for s in sample:
#         vec = sg_model.wv[s]
#         M.append(vec)
    
#     return M

In [24]:
sample = target_words + pseudo_words
M = featureVector(new_corpus, sample)

print(M)

(100, 4584)
[[ 2.32829626e-01 -1.65693671e-01  3.16579061e-02 ... -1.36683838e-47
  -1.44561428e-33 -3.58145104e-33]
 [ 8.35311501e-01  5.27377745e-01 -9.35417464e-02 ...  5.79680518e-33
   2.15200381e-18 -2.71931313e-18]
 [ 2.64841320e-01 -1.70772611e-01  4.54337280e-02 ... -2.64449288e-47
  -7.35907800e-33 -6.30865828e-33]
 ...
 [ 2.21123281e-02 -1.19926229e-02  4.61618289e-03 ... -8.97787994e-47
  -6.10135273e-32 -1.83627443e-32]
 [ 1.03453381e-01 -4.88297788e-02  1.70320517e-02 ... -5.88758573e-33
   5.51996625e-17 -4.39618223e-17]
 [ 4.73181096e-02 -7.88100297e-02 -3.13683762e-02 ...  1.34223680e-46
   1.24879673e-32 -1.18396540e-32]]


# Task 1 - Step 4 - input matrix X put them into 50 cluster

In [25]:
def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    # kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)
    km = KMeans(n_clusters=50).fit(X)
    
    # Cluster the words using the feature matrix
    # clusters = km.fit_predict(X)
    performance = []
    
    labels = km.labels_
    labels.tolist()
    performance.append((np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50]))
    print("target words: " + str(labels[50:100]))
    print("pesudo words: " + str(labels[0:50]))
    print("Performance: " + str(performance[0]))
        
    return performance

In [26]:
getCluster(M)

target words: [39 11 32 16 17 23  9 22 26  0 12 10  7 47 43 25 37 40 38  5 21 19  1 18
 36 13 48 14  5 45  6 29 44 42 28 41 24  8 31  2 34  0  9 27 20  3 30 33
 15  4]
pesudo words: [39 11 32 16 17 23  9 22 26  0 12 10  7 47 43 25 37 40 38  5 21 19  1 18
 36 13 48 14  5 49  6 29 35 42 28 46 24  8 31  2 34  0  9 27 20  3 30 33
 15  4]
Performance: 0.94


[0.94]

In [27]:
print(M.shape)

(100, 100)


In [28]:
clust = getCluster(M)
print(clust)

target words: [ 1 25 29 32  7 22 18 17 40  2 30  6 13 43 37 31 38 36 44  9 15 19 16  3
 34 10 42 11  9 46 33 27 49  0 24 41  4 39 48  8 28 45 18 20 26  5 14 23
 12 21]
pesudo words: [ 1 25 29 32  7 22 18 17 40  2 30  6 13 43 37 31 38 36 44  9 15 19 16  3
 34 10 42 11  9 46 33 27 35  0 24 47  4 39 48  8 28 45 18 20 26  5 14 23
 12 21]
Performance: 0.96
[0.96]


# Task 1 - Step 5

In [None]:
# cluster_size = 50
# temp_count = 0
# for i in range(cluster_size):
#     if(target_cluster[i] == pseudo_cluster[i]):
#         temp_count += 1
# p = temp_count/cluster_size
# print(p)

In [None]:
# print(type(clust))

In [None]:
# def getProbability(cluster: np.array) -> int:
#     cluster_size = len(cluster) // 2 
#     temp_count = 0
#     for i in range(cluster_size):
#         if(target_cluster[i] == pseudo_cluster[i]):
#             temp_count += 1
#     p = temp_count/cluster_size
#     return p
# getProbability(clust)