In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import random

import numpy as np
# For bag of word model
!pip install gensim
import gensim
from gensim.models import Word2Vec

# For Cluster
from sklearn.cluster import KMeans


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




# Task 1 - Step 1 - Pre-processing words and get top 50 frequent words

In [2]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

In [3]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
res = read_data(path)
doc = ""
for a in res:
    for b in a:
        doc += b
print(res[0][:5])
# print(doc)

['[t]\n', 'Works[+3]##Works great, no odor, and uses regular bags.\n', "##Can't complain at all!\n", '[t]\n', "Diaper Champ[+2]##So far (3 weeks), we've had no problems with the Diaper Champ at all.\n"]


In [4]:
from nltk import word_tokenize
def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList

def process_reviews_str(res: list) -> list:
    # merge all reviews
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    return producedDoc

In [5]:
def get_top50(res: list) -> list:
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    # Find the 50 most frequently occurred words
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words

# Task 1 - Step 2 - Sample and Pseudowords

In [6]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
# Reading documents
res = read_data(path)
processed_str = process_reviews_str(res)
# print(processed_str[:10])
target_words = get_top50(res)

In [7]:
import random

def pseudowords(target_words: list, processed_corpus: list) -> list:
    # Construct a dic to store all the words' position
    pos_dic = {}
    for i, word in enumerate(processed_corpus):
        pos_dic[i] = word
    # print("\n BEFORE",pos_dic)
    
    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    
    # Create pseudowords for the sampled target words
    madeups = [word[::-1] for word in sample]
    
    # Replace the sampled reversed words in the target words
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
    
    # replace half of its occurence in corpus with their reversed words
    # Get the position of all the sample words
    positions = []
    for s in sample:
        
        pos_list = []
        for index, word in pos_dic.items():
            if s == word:
                pos_list.append(index) 
        # print("\n",s, pos_list)
        positions.append(pos_list)

    for i, pos_list in enumerate(positions):
        # Randomly select to be replaced words' index
        samp_size = len(pos_list) // 2
        sample_pos = random.sample(pos_list, samp_size)
        # Replace all the random occurences to reversed words
        for index in sample_pos:
            pos_dic[index] = madeups[i]
    
    pseudo_corpus = []
    for word in pos_dic.values():
        pseudo_corpus.append(word)
        
    return pseudowords, pseudo_corpus

In [8]:
pseudo_words, new_corpus = pseudowords(target_words, processed_str)

In [9]:
print("Target Words: \n",target_words)
print("Pseudo Words: \n",pseudo_words)

Target Words: 
 ['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price']
Pseudo Words: 
 ['esu', 'enohp', 'one', 'retuor', 'dopi', 'camera', 'reyalp', 'get', 'yrettab', 'diaper', 'tcudorp', 'work', 'ekil', 'taerg', 'time', 'feature', 'problem', 'good', 'ytilauq', 'zen', 'would', 'osla', 'dnuos', 'computer', 'erawtfos', 'picture', 'llew', 'yllaer', 'orcim', 'ekat', 'easy', 'gniht', 'even', 'first', 'desu', 'need', 'creative', 'gab', 'hcum', 'want', 'better', 'champ', 'pm', 'look', 'go', 'size', 'cisum', 'norton', 'elttil', 'ecirp']


In [10]:
print(processed_str[:100])

['work', 'work', 'great', 'odor', 'us', 'regular', 'bag', 'complain', 'diaper', 'champ', 'far', 'week', 'problem', 'diaper', 'champ', 'diaper', 'contains', 'smell', 'baby', 'diaper', 'use', 'kind', 'bag', 'inside', 'also', 'sprinkled', 'baking', 'soda', 'bottom', 'diaper', 'champ', 'help', 'absorb', 'odor', 'every', 'awhile', 'empty', 'old', 'baking', 'soda', 'replace', 'odor', 'refill', 'know', 'run', 'trouble', 'road', 'odor', 'far', 'complaint', 'happy', 'buy', 'refill', 'started', 'diaper', 'genie', 'new', 'parent', 'hated', 'thing', 'new', 'design', 'still', 'need', 'lot', 'improvement', 'bag', 'never', 'fit', 'right', 'never', 'spun', 'right', 'two', 'month', 'sat', 'corner', 'room', 'never', 'used', 'entered', 'toddlerhood', 'started', 'bad', 'diaper', 'becasue', 'cutting', 'teeth', 'diet', 'change', 'decided', 'something', 'started', 'researching', 'diaper', 'champ', 'diaper', 'champ', 'best', 'found']


In [11]:
print(new_corpus[:100])

['work', 'work', 'great', 'odor', 'us', 'regular', 'bag', 'complain', 'diaper', 'champ', 'far', 'week', 'problem', 'diaper', 'champ', 'diaper', 'contains', 'smell', 'baby', 'diaper', 'use', 'kind', 'bag', 'inside', 'also', 'sprinkled', 'baking', 'soda', 'bottom', 'diaper', 'champ', 'help', 'absorb', 'odor', 'every', 'awhile', 'empty', 'old', 'baking', 'soda', 'replace', 'odor', 'refill', 'know', 'run', 'trouble', 'road', 'odor', 'far', 'complaint', 'happy', 'buy', 'refill', 'started', 'diaper', 'genie', 'new', 'parent', 'hated', 'thing', 'new', 'design', 'still', 'need', 'lot', 'improvement', 'gab', 'never', 'fit', 'right', 'never', 'spun', 'right', 'two', 'month', 'sat', 'corner', 'room', 'never', 'desu', 'entered', 'toddlerhood', 'started', 'bad', 'diaper', 'becasue', 'cutting', 'teeth', 'diet', 'change', 'decided', 'something', 'started', 'researching', 'diaper', 'champ', 'diaper', 'champ', 'best', 'found']


# Task 1 - Step 3 - Feature Vector

In [12]:
def featureVector(processed_list: list, sample: str) -> np.array:
    
    sg_model = Word2Vec(processed_list, min_count = 1, vector_size = 100, window = 5, sg = 1) 
    # cbow_model = Word2Vec(processed_list, min_count = 1, vector_size = 100, window = 5)
    
    M = []
    for s in sample:
        vec = sg_model.wv[s]
        M.append(vec)
    
    return M


In [13]:
sample = target_words + pseudo_words
print(new_corpus.index("use"))
M = featureVector(new_corpus, sample)
print(M)

20


KeyError: "Key 'use' not present"

# Task 1 - Step 4 - input matrix X put them into 50 cluster

In [None]:
# import nltk
# from nltk.cluster import KMeansClusterer

# X = M
# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeansClusterer instance with the specified number of clusters
# clusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, avoid_empty_clusters=True)

# # Cluster the words using the feature matrix
# clusters = clusterer.cluster(X, assign_clusters=True)

# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
#     print(f"Word {i+1} is in cluster {cluster}.")

In [None]:
from sklearn.cluster import KMeans

# X = M

# # Set the number of clusters to 50
# num_clusters = 50

# # Create a KMeans instance with the specified number of clusters
# kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

# # Cluster the words using the feature matrix
# clusters = kmeans.fit_predict(X)

# visualClusters = {}
# # Print the cluster assignments for each word
# for i, cluster in enumerate(clusters):
# #    print(f"Word {i+1} is in cluster {cluster}.")
#     visualClusters[corpus[i]] = cluster

In [None]:
# def getCluster(X: np.array) -> dict:
#     # Set the number of clusters to 50
#     num_clusters = 50

#     # Create a KMeans instance with the specified number of clusters
#     kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)

#     # Cluster the words using the feature matrix
#     clusters = kmeans.fit_predict(X)

#     visualClusters = {}
#     # Print the cluster assignments for each word
#     for i, cluster in enumerate(clusters):
#     #    print(f"Word {i+1} is in cluster {cluster}.")
#         visualClusters[corpus[i]] = cluster
#     return visualClusters

In [None]:
def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    # kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)
    km = KMeans(n_clusters=50).fit(X)
    
    # Cluster the words using the feature matrix
    # clusters = km.fit_predict(X)
    performance = []
    
    labels = km.labels_
    labels.tolist()
    performance.append((np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50]))
    print("target words: " + str(labels[50:100]))
    print("pesudo words: " + str(labels[0:50]))
    print("Performance: " + str(performance[0]))
        
    return performance

In [None]:
getCluster(M)

In [None]:
print(M.shape)

In [None]:
clust = getCluster(M)
print(clust)

# Task 1 - Step 5

In [None]:
# target_cluster = clust[:50]
# pseudo_cluster = clust[50:]
# print(target_cluster,len(target_cluster),"\n",pseudo_cluster, len(pseudo_cluster))

In [None]:
print(pseudowords, len(pseudowords))

In [None]:
print(target_words, len(target_words))

In [None]:
# cluster_size = 50
# temp_count = 0
# for i in range(cluster_size):
#     if(target_cluster[i] == pseudo_cluster[i]):
#         temp_count += 1
# p = temp_count/cluster_size
# print(p)

In [None]:
# print(type(clust))

In [None]:
# def getProbability(cluster: np.array) -> int:
#     cluster_size = len(cluster) // 2 
#     temp_count = 0
#     for i in range(cluster_size):
#         if(target_cluster[i] == pseudo_cluster[i]):
#             temp_count += 1
#     p = temp_count/cluster_size
#     return p
# getProbability(clust)