In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import time

from collections import Counter
import numpy as np
from sklearn.cluster import KMeans


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rubyli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rubyli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Task 1 - Step 1 - Pre-processing words and get top 50 frequent words

In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

In [3]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
res = read_data(path)
doc = ""
for a in res:
    for b in a:
        doc += b
print(res[0][:5])
# print(doc)

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n', "Diaper Champ[+2]##So far (3 weeks), we've had no problems with the Diaper Champ at all.\n"]


In [4]:
from nltk import word_tokenize
def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList

In [42]:
def get_top50(res: list) -> list:
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    # Find the 50 most frequently occurred words
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words
target_words = get_top50(res)
print("Target Words: \n",target_words)

Target Words: 
 ['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price']


# Task 1 - Step 2 - Sample and Pseudowords

In [43]:
import random

def pseudowords(target_words: list) -> list:
    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    # Create pseudowords for the sampled target words
    madeups = [word[::-1] for word in sample]
    # Replace the sampled occurrences of the target words with their pseudowords
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
    return pseudowords

In [44]:
pseudo_words = pseudowords(target_words)
print(pseudo_words)

['esu', 'phone', 'eno', 'retuor', 'dopi', 'aremac', 'reyalp', 'teg', 'yrettab', 'repaid', 'tcudorp', 'work', 'like', 'great', 'emit', 'erutaef', 'melborp', 'good', 'ytilauq', 'zen', 'would', 'also', 'dnuos', 'computer', 'software', 'picture', 'well', 'yllaer', 'micro', 'ekat', 'easy', 'gniht', 'neve', 'first', 'desu', 'need', 'creative', 'gab', 'hcum', 'tnaw', 'better', 'pmahc', 'mp', 'look', 'go', 'size', 'cisum', 'norton', 'little', 'price']


# Task 1 - Step 3 - Feature Vector

In [10]:
# Create a corpus containing the target words and pseudowords
corpus = target_words + pseudowords
print(len(corpus))

100


In [73]:
# Get all the sentences in all documents
sentences = []
for doc in res:
    for i, sen in enumerate(doc):
        pos = {}
        process_sen = process_document(sen)
        for j, word in enumerate(process_sen):
            key = j
            if word == "":
                pos[key] = "null"
            else:
                pos[key] = word
        sentences.append(pos)
print(sentences[:10])
print(len(sentences))
print(len(sentences[1].keys()))

[{}, {0: 'work', 1: 'work', 2: 'great', 3: 'odor', 4: 'us', 5: 'regular', 6: 'bag'}, {0: 'complain'}, {}, {0: 'diaper', 1: 'champ', 2: 'far', 3: 'week', 4: 'problem', 5: 'diaper', 6: 'champ'}, {0: 'diaper', 1: 'contains', 2: 'smell', 3: 'baby', 4: 'diaper', 5: 'use', 6: 'kind', 7: 'bag', 8: 'inside'}, {0: 'also', 1: 'sprinkled', 2: 'baking', 3: 'soda', 4: 'bottom', 5: 'diaper', 6: 'champ', 7: 'help', 8: 'absorb', 9: 'odor', 10: 'every', 11: 'awhile', 12: 'empty', 13: 'old', 14: 'baking', 15: 'soda', 16: 'replace'}, {0: 'odor', 1: 'refill', 2: 'know', 3: 'run', 4: 'trouble', 5: 'road', 6: 'odor', 7: 'far', 8: 'complaint', 9: 'happy', 10: 'buy', 11: 'refill'}, {}, {0: 'started', 1: 'diaper', 2: 'genie', 3: 'new', 4: 'parent'}]
4584
7


In [74]:
for va in sentences[1].values():
    print(va)

work
work
great
odor
us
regular
bag


In [75]:
# Loop through each sentence and count the frequency of each word
from collections import Counter

# window_size = 5
# sen_count = []
# for tword in corpus:
#     count = 0
#     for i, dic in enumerate(sentences):
#         if dic!={} and tword in dic.values():
#             numkeys = len(sentences[1].keys())
#             if numkeys >= window_size:
#                 pos = dic.get(tword)
#                 for j in range(window_size):
#                     pos 
                    
            

In [14]:
import numpy as np

# Construct Nxd array based on the word-sentence frequency
pre_M = []
for word in corpus:
    word_freq = []
    for sentence in sen_count:
        tempfreq = sentence.get(word)
        if tempfreq != None:
            word_freq.append(tempfreq)
        else:
            word_freq.append(0)
    pre_M.append(word_freq)
M = np.array(pre_M)
print(M[0][30:50])

[0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
print(M.shape)

(100, 4584)


In [32]:
print(type(M))

<class 'numpy.ndarray'>


In [33]:
def featureVector(target_words: list, pseudowords: list, res: list) -> np.array:
    
    # Create a corpus containing the target words and pseudowords
    corpus = target_words + pseudowords
    
    # Get all the sentences in all documents
    sentences = []
    for doc in res:
        for sen in doc:
            sentences.append(sen)
    
    # Loop through each sentence and count the frequency of each word
    sen_count = []
    for i, sentence in enumerate(sentences):
        counts = dict()
        words = process_document(sentence)
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        sen_count.append(counts)
        
    # Construct Nxd array based on the word-sentence frequency
    pre_M = []
    for word in corpus:
        word_freq = []
        for sentence in sen_count:
            tempfreq = sentence.get(word)
            if tempfreq != None:
                word_freq.append(tempfreq)
            else:
                word_freq.append(0)
        pre_M.append(word_freq)
    M = np.array(pre_M)
    # svd
    u,s,v = np.linalg.svd(M)
    M = np.dot(u, np.diag(s))
    
    # normalization
    M = M / np.linalg.norm(M, axis=1)[:, None]
    
    
    return M


In [34]:
M = featureVector(target_words, pseudowords, res)
print(M)

[[ 2.94517363e-01 -1.09916877e-01  1.09754768e-01 ... -1.19649893e-57
   8.56668817e-57  0.00000000e+00]
 [ 1.86513270e-01  6.82420224e-02  4.01085331e-02 ...  6.39723485e-52
   1.82490552e-51  0.00000000e+00]
 [ 2.86161762e-01 -1.20454719e-01  6.98359679e-02 ...  1.45793383e-51
  -1.34945098e-50  0.00000000e+00]
 ...
 [ 3.26530072e-02 -1.78876092e-02  1.12007960e-02 ...  2.37221014e-42
  -1.44430190e-41  0.00000000e+00]
 [ 1.22920048e-01  2.86932862e-02  3.87232786e-02 ...  1.41558380e-41
  -8.53435940e-41  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  1.00000000e+00]]


# Task 1 - Step 4 - input matrix X put them into 50 cluster

In [35]:
# import nltk
# from nltk.cluster import KMeansClusterer

# X = M
# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeansClusterer instance with the specified number of clusters
# clusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, avoid_empty_clusters=True)

# # Cluster the words using the feature matrix
# clusters = clusterer.cluster(X, assign_clusters=True)

# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
#     print(f"Word {i+1} is in cluster {cluster}.")

In [36]:
from sklearn.cluster import KMeans

# X = M

# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeans instance with the specified number of clusters
# kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

# # Cluster the words using the feature matrix
# clusters = kmeans.fit_predict(X)

# visualClusters = {}
# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
# #    print(f"Word {i+1} is in cluster {cluster}.")
#     visualClusters[corpus[i]] = cluster

In [37]:
# def getCluster(X: np.array) -> dict:
#     # Set the number of clusters to 50
#     num_clusters = 50

#     # Create a KMeans instance with the specified number of clusters
#     kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

#     # Cluster the words using the feature matrix
#     clusters = kmeans.fit_predict(X)

#     visualClusters = {}
#     # Print the cluster assignments for each word
#     for i, cluster in enumerate(clusters):
#     #    print(f"Word {i+1} is in cluster {cluster}.")
#         visualClusters[corpus[i]] = cluster
#     return visualClusters

In [38]:
def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    # kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)
    km = KMeans(n_clusters=50).fit(X)
    
    # Cluster the words using the feature matrix
    # clusters = km.fit_predict(X)
    performance = []
    
    labels = km.labels_
    labels.tolist()
    performance.append((np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50]))
    print("target words: " + str(labels[50:100]))
    print("pesudo words: " + str(labels[0:50]))
    print("Performance: " + str(performance[0]))
        
    return performance

In [39]:
getCluster(M)

target words: [34 45 40 12  9  6 29 39 24 15  0 17  8  0 19 16 26 27  4  3 42 25 28 38
  1 35 36  0  3 23 33 21 31  5  2 30 32 44 48 11 41 44  7 10 22 49 22 13
 18 14]
pesudo words: [37 16  9 12  9  6  7  0 24 15 13 17  8 28 46 16 20 27  4  3  0 25 28  1
  1 23 17  8  3 23 37  8  0  5  2  0 32 15 47 11 47 15  7 10 43 10 22 13
 18 12]
Performance: 0.5


[0.5]

In [40]:
print(M.shape)

(100, 100)


In [41]:
clust = getCluster(M)
print(clust)

target words: [49 45 29 13  4  3 18 34 17 11 40  8 25 46 26  9 33 43  6  1 35 21  5  7
 12  7 37  0  1 23 31 42 30 22  0 39 20 38 47 14 16 32 41 15 27 36  2 24
 19 10]
pesudo words: [28  9  4 13  4  3  7 44 17 11 24  8 25  5  0  9  7 43  6  1 44 21  5 12
 12 23  8 25  1 23 28 25 44 22  0 44 20 11 48 14 48 11 41 15  7 15  2 24
 19 13]
Performance: 0.5
[0.5]


# Task 1 - Step 5

In [26]:
# target_cluster = clust[:50]
# pseudo_cluster = clust[50:]
# print(target_cluster,len(target_cluster),"\n",pseudo_cluster, len(pseudo_cluster))

In [27]:
print(pseudowords, len(pseudowords))

['esu', 'enohp', 'eno', 'router', 'ipod', 'camera', 'reyalp', 'teg', 'battery', 'diaper', 'tcudorp', 'work', 'like', 'taerg', 'emit', 'feature', 'melborp', 'good', 'quality', 'zen', 'dluow', 'also', 'sound', 'retupmoc', 'software', 'erutcip', 'llew', 'yllaer', 'micro', 'take', 'ysae', 'gniht', 'neve', 'first', 'used', 'deen', 'creative', 'gab', 'hcum', 'want', 'retteb', 'pmahc', 'mp', 'look', 'og', 'ezis', 'music', 'norton', 'little', 'ecirp'] 50


In [28]:
print(target_words, len(target_words))

['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price'] 50


In [29]:
# cluster_size = 50
# temp_count = 0
# for i in range(cluster_size):
#     if(target_cluster[i] == pseudo_cluster[i]):
#         temp_count += 1
# p = temp_count/cluster_size
# print(p)

In [30]:
# print(type(clust))

In [31]:
# def getProbability(cluster: np.array) -> int:
#     cluster_size = len(cluster) // 2 
#     temp_count = 0
#     for i in range(cluster_size):
#         if(target_cluster[i] == pseudo_cluster[i]):
#             temp_count += 1
#     p = temp_count/cluster_size
#     return p
# getProbability(clust)