In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import time

from collections import Counter
import numpy as np
from sklearn.cluster import KMeans


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rubyli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rubyli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Task 1 - Step 1 - Pre-processing words and get top 50 frequent words

In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

In [32]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
res = read_data(path)
doc = ""
for a in res:
    for b in a:
        doc += b
print(res[0][:5])
# print(doc)

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n', "Diaper Champ[+2]##So far (3 weeks), we've had no problems with the Diaper Champ at all.\n"]


In [33]:
from nltk import word_tokenize
def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        
        # #Remove number
        # tokenList = ''.join([i for i in tokenList if not i.isdigit()])
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]


        return tokenList 
    
#     text = document.lower()
    
#     # remove numbers
#     text_nonum = re.sub(r'\d+', ' ', text)
    
#     text_p = "".join([char for char in text_nonum if char not in string.punctuation])
    
#     words = word_tokenize(text_p)
    
#     stop_words = stopwords.words('english')
#     stop_words.append("im")
#     stop_words.append("ive")
    
#     filtered_words = [word for word in words if word not in stop_words]
    
#     lemma = WordNetLemmatizer()
#     tokenList = [lemma.lemmatize(word) for word in filtered_words]
    
#     return tokenList
    

In [34]:
producedDoc = process_document(doc)
print(producedDoc[:50])

['work', 'work', 'great', 'odor', 'us', 'regular', 'bag', 'complain', 'diaper', 'champ', 'far', 'week', 'problem', 'diaper', 'champ', 'diaper', 'contains', 'smell', 'baby', 'diaper', 'use', 'kind', 'bag', 'inside', 'also', 'sprinkled', 'baking', 'soda', 'bottom', 'diaper', 'champ', 'help', 'absorb', 'odor', 'every', 'awhile', 'empty', 'old', 'baking', 'soda', 'replace', 'odor', 'refill', 'know', 'run', 'trouble', 'road', 'odor', 'far', 'complaint']


In [35]:
# Find the 50 most frequently occurred words
# Get the frequency of each word
word_frequencies = FreqDist(producedDoc)
word_frequencies

FreqDist({'use': 353, 'phone': 351, 'one': 337, 'router': 337, 'ipod': 329, 'camera': 322, 'player': 313, 'get': 274, 'battery': 264, 'diaper': 231, ...})

In [36]:
# Sort the dictionary by frequency
sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)

# Select the top 50 words
target_words = [item[0] for item in sorted_frequencies[:50]]
print(target_words)

['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price']


# Task 1 - Step 2 - Sample and Pseudowords

In [37]:
import random

# Sample half of the target words
sample_size = len(target_words) // 2
sample = random.sample(target_words, sample_size)
# Create pseudowords for the sampled target words
madeups = [word[::-1] for word in sample]
# Replace the sampled occurrences of the target words with their pseudowords
pseudowords = target_words[:]
for i, word in enumerate(target_words):
    if word in sample:
        # Find the index of the word to be replace
        replacement_index = sample.index(word)
        # Replace the word with the corresponding word from the replacement list
        pseudowords[i] = madeups[replacement_index]

In [38]:
print(pseudowords)

['esu', 'enohp', 'eno', 'router', 'dopi', 'aremac', 'reyalp', 'get', 'yrettab', 'repaid', 'tcudorp', 'krow', 'like', 'taerg', 'time', 'erutaef', 'melborp', 'good', 'quality', 'nez', 'dluow', 'also', 'dnuos', 'retupmoc', 'software', 'picture', 'llew', 'really', 'micro', 'ekat', 'ysae', 'thing', 'neve', 'tsrif', 'used', 'deen', 'creative', 'gab', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'cisum', 'norton', 'little', 'price']


# Task 1 - Step 3 - Feature Vector

In [39]:
# Create a corpus containing the target words and pseudowords
corpus = target_words + pseudowords
print(len(corpus))

100


In [40]:
def word_count(str):
    counts = dict()
    words = str.split()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

In [41]:
# Get all the sentences in all documents
sentences = []
for doc in res:
    for sen in doc:
        sentences.append(sen)

print(sentences[:4])

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n']


In [42]:
# Loop through each sentence and count the frequency of each word
from collections import Counter

sen_count = []
for i, sentence in enumerate(sentences):
    counts = dict()
    words = process_document(sentence)
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    sen_count.append(counts)
print(sen_count[:10])

[{}, {'work': 2, 'great': 1, 'odor': 1, 'us': 1, 'regular': 1, 'bag': 1}, {'complain': 1}, {}, {'diaper': 2, 'champ': 2, 'far': 1, 'week': 1, 'problem': 1}, {'diaper': 2, 'contains': 1, 'smell': 1, 'baby': 1, 'use': 1, 'kind': 1, 'bag': 1, 'inside': 1}, {'also': 1, 'sprinkled': 1, 'baking': 2, 'soda': 2, 'bottom': 1, 'diaper': 1, 'champ': 1, 'help': 1, 'absorb': 1, 'odor': 1, 'every': 1, 'awhile': 1, 'empty': 1, 'old': 1, 'replace': 1}, {'odor': 2, 'refill': 2, 'know': 1, 'run': 1, 'trouble': 1, 'road': 1, 'far': 1, 'complaint': 1, 'happy': 1, 'buy': 1}, {}, {'started': 1, 'diaper': 1, 'genie': 1, 'new': 1, 'parent': 1}]


In [43]:
import numpy as np

# Construct Nxd array based on the word-sentence frequency
pre_M = []
for word in corpus:
    word_freq = []
    for sentence in sen_count:
        tempfreq = sentence.get(word)
        if tempfreq != None:
            word_freq.append(tempfreq)
        else:
            word_freq.append(0)
    pre_M.append(word_freq)
M = np.array(pre_M)
print(M[0][30:50])

[0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0]


In [44]:
print(M.shape)

(100, 4584)


In [45]:
print(type(M))

<class 'numpy.ndarray'>


In [46]:
def featureVector(target_words: list, pseudowords: list, res: list) -> np.array:
    
    # Create a corpus containing the target words and pseudowords
    corpus = target_words + pseudowords
    
    # Get all the sentences in all documents
    sentences = []
    for doc in res:
        for sen in doc:
            sentences.append(sen)
    
    # Loop through each sentence and count the frequency of each word
    sen_count = []
    for i, sentence in enumerate(sentences):
        counts = dict()
        words = process_document(sentence)
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        sen_count.append(counts)
        
    # Construct Nxd array based on the word-sentence frequency
    pre_M = []
    for word in corpus:
        word_freq = []
        for sentence in sen_count:
            tempfreq = sentence.get(word)
            if tempfreq != None:
                word_freq.append(tempfreq)
            else:
                word_freq.append(0)
        pre_M.append(word_freq)
    M = np.array(pre_M)
#     svd
    U, s, V = np.linalg.svd(M)
    M = np.dot(U, np.diag(s))
    
#     norm
    
    M = M / np.linalg.norm(M, axis=1)[:, None]
    
    
    return M


In [47]:
M = featureVector(target_words, pseudowords, res)
print(M)

[[-3.50681333e-01 -1.21673651e-01  1.55185114e-01 ...  2.34154045e-55
   5.24427312e-62  9.64246028e-34]
 [-1.99720589e-01 -1.66879368e-01 -1.18660664e-02 ...  3.85995237e-50
   8.67333780e-58  8.37067678e-34]
 [-3.24555618e-01 -8.88820602e-02  1.10482516e-01 ... -7.51707403e-50
  -1.97543893e-57  7.63837020e-33]
 ...
 [-4.77342020e-02 -2.42668421e-02  4.05259002e-02 ... -5.50077773e-39
   1.66961395e-45  4.61104882e-18]
 [-1.20461506e-01 -5.98516000e-02  3.91142245e-02 ... -2.39880034e-39
   2.99778527e-46  2.91166907e-18]
 [-1.77816733e-01  1.62346111e-02 -8.13168738e-03 ...  8.15615897e-39
  -1.48553449e-45 -1.78754266e-17]]


# Task 1 - Step 4 - input matrix X put them into 50 cluster

In [48]:
# import nltk
# from nltk.cluster import KMeansClusterer

# X = M
# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeansClusterer instance with the specified number of clusters
# clusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, avoid_empty_clusters=True)

# # Cluster the words using the feature matrix
# clusters = clusterer.cluster(X, assign_clusters=True)

# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
#     print(f"Word {i+1} is in cluster {cluster}.")

In [49]:
from sklearn.cluster import KMeans

# X = M

# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeans instance with the specified number of clusters
# kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

# # Cluster the words using the feature matrix
# clusters = kmeans.fit_predict(X)

# visualClusters = {}
# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
# #    print(f"Word {i+1} is in cluster {cluster}.")
#     visualClusters[corpus[i]] = cluster

In [50]:
# def getCluster(X: np.array) -> dict:
#     # Set the number of clusters to 50
#     num_clusters = 50

#     # Create a KMeans instance with the specified number of clusters
#     kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

#     # Cluster the words using the feature matrix
#     clusters = kmeans.fit_predict(X)

#     visualClusters = {}
#     # Print the cluster assignments for each word
#     for i, cluster in enumerate(clusters):
#     #    print(f"Word {i+1} is in cluster {cluster}.")
#         visualClusters[corpus[i]] = cluster
#     return visualClusters

In [51]:
def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    # kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)
    km = KMeans(n_clusters=50).fit(X)
    
    # Cluster the words using the feature matrix
    # clusters = km.fit_predict(X)
    performance = []
    
    labels = km.labels_
    labels.tolist()
    performance.append((np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50]))
    print("target words: " + str(labels[50:100]))
    print("pesudo words: " + str(labels[0:50]))
    print("Performance: " + str(performance[0]))
        
    return performance

In [52]:
getCluster(M)

target words: [47 25 44 17 45 32 35  4 33 33 48 14 11 49  8 40  0 12  3 41 43 37 39  0
 16 30 34 10  2 17 28 24 36  0  6 46 19 31 18 13  9 23  5 21  1 15 29 26
 20  7]
pesudo words: [27 38 27 17 42 30  5  4 38 23 26 22 11  3  8 38 26 12  3  2  4 37  3 16
 16 30 22 10  2 30 27 24  0 24  6  4 19 23 18 13  9 23  5 21  1 15 42 26
 20  7]
Performance: 0.5


[0.5]

In [53]:
print(M.shape)

(100, 100)


In [54]:
clust = getCluster(M)
print(clust)

target words: [12 12 22 14 43 39 45 15 38 38 32 34  3 37 21 31  0  1  2  0 11 33 41 49
 19 10 30  9  4 47 40 20 44 27 28 13 25 42 36 26 17  8  6 18 24 23  0  7
  5 35]
pesudo words: [29 48 46 14 46 10  6 15 48  8  7 16  3  2 21 48  7  1  2  4 15 33  2 19
 19 10 16  9  4 10 29 20 15 20 28 15 25  8 36 26 17  8  6 18 24 23 46  7
  5 35]
Performance: 0.5
[0.5]


# Task 1 - Step 5

In [55]:
# target_cluster = clust[:50]
# pseudo_cluster = clust[50:]
# print(target_cluster,len(target_cluster),"\n",pseudo_cluster, len(pseudo_cluster))

[0.5] 1 
 [] 0


In [56]:
print(pseudowords, len(pseudowords))

['esu', 'enohp', 'eno', 'router', 'dopi', 'aremac', 'reyalp', 'get', 'yrettab', 'repaid', 'tcudorp', 'krow', 'like', 'taerg', 'time', 'erutaef', 'melborp', 'good', 'quality', 'nez', 'dluow', 'also', 'dnuos', 'retupmoc', 'software', 'picture', 'llew', 'really', 'micro', 'ekat', 'ysae', 'thing', 'neve', 'tsrif', 'used', 'deen', 'creative', 'gab', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'cisum', 'norton', 'little', 'price'] 50


In [57]:
print(target_words, len(target_words))

['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price'] 50


In [58]:
# cluster_size = 50
# temp_count = 0
# for i in range(cluster_size):
#     if(target_cluster[i] == pseudo_cluster[i]):
#         temp_count += 1
# p = temp_count/cluster_size
# print(p)

IndexError: list index out of range

In [None]:
# print(type(clust))

In [None]:
# def getProbability(cluster: np.array) -> int:
#     cluster_size = len(cluster) // 2 
#     temp_count = 0
#     for i in range(cluster_size):
#         if(target_cluster[i] == pseudo_cluster[i]):
#             temp_count += 1
#     p = temp_count/cluster_size
#     return p
# getProbability(clust)