In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import random

import numpy as np
# For skip-gram model
!pip install gensim
import gensim
from gensim.models import Word2Vec
# import tensorflow as tf

# For Cluster
from sklearn.cluster import KMeans


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [43]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

def process_document(document: str) -> list:
        """ pre-process a document and return a list of its terms: str->list"""
        
        # Remove number
        text_nonum = re.sub(r'\d+', ' ', document)
        
        pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\[
                    |[^\w\s]
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
        #Tokenization
        tokenList = nltk.regexp_tokenize(text_nonum, pattern)
        #To lower case
        tokenList = [word.lower() for word in tokenList]
        #Remove Punctuation
        tokenList = list(filter(lambda word: punkt.PunktToken(word).is_non_punct,tokenList))
        #Remove stopwords
        stopW = stopwords.words("english")
        stopW.append("u")
        stopW.append("p")
        # stopW.append("mp")
        tokenList = list(filter(lambda word: word not in stopW,tokenList))
        # Lemmatisation 
        lemma = WordNetLemmatizer()
        tokenList = [lemma.lemmatize(word) for word in tokenList]

        return tokenList 

def process_reviews_str(res: list) -> str:
    # merge all reviews
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    return producedDoc

def process_reviews_list(res: list) -> list:
    # merge all reviews
    producedDoc = []
    doc = ""
    for a in res:
        for b in a:
            doc += b
        producedDoc.append(process_document(doc))
    return producedDoc

def get_top50(producedDoc: str) -> list:
    # Find the 50 most frequently occurred words
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words

def pseudowords(target_words: list) -> list:
    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    # Create pseudowords for the sampled target words
    madeups = [word[::-1] for word in sample]
    # Replace the sampled occurrences of the target words with their pseudowords
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
    return pseudowords

def featureVector(processed_list: list, sample: str) -> np.array:
    
    # result = process_reviews_list(res)
    # # Create a corpus containing the target words and pseudowords
    # sample = target_words + pseudowords
    
    # # Skip-Gram Model
    # sg_model = Word2Vec(processed_list, min_count = 1, vector_size = 100, window = 5, sg = 1) 
    # word_vectors = sg_model.wv
    # print(word_vectors.most_similar('use'))
    
    cbow_model = Word2Vec(processed_list, min_count = 1, vector_size = 100, window = 5)
    word_vectors = cbow_model.wv
    # print(cbow_model.wv.most_similar('use'))
    
    pre_M = []
    for s in sample:
        

#     # svd
#     # u, s, v = np.linalg.svd(positive_skip_grams)
#     u, s, v = np.linalg.svd(word_vectors)
#     # u, s, v = np.linalg.svd(M)
#     M = np.dot(u, np.diag(s))
    
#     # norm
#     M = M / np.linalg.norm(M, axis=1)[:, None]
#     return M

def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    km = KMeans(n_clusters=50).fit(X)
    
    performance = []
    
    labels = km.labels_
    labels.tolist()
    performance.append((np.sum(labels[0:50] == labels[50:100])) / len(labels[0:50]))
    print("target words: " + str(labels[50:100]))
    print("pesudo words: " + str(labels[0:50]))
    print("Performance: " + str(performance[0]))
        
    return performance

# def getProbability(cluster: np.array) -> int:
#     cluster_size = len(cluster) // 2 
#     temp_count = 0
#     for i in range(cluster_size):
#         if(target_cluster[i] == pseudo_cluster[i]):
#             temp_count += 1
#     p = temp_count/cluster_size
#     return p

# def main():
#     path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
#     # Reading documents
#     res = read_data(path)
#     target_words = get_top50(res)
#     pseudo_words = pseudowords(target_words)
    
#     M = featureVector(target_words, pseudo_words, res)
#     clust = getCluster(M)
#     print(clust)

In [44]:
# main()

In [45]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
# Reading documents
res = read_data(path)
processed_str = process_reviews_str(res)
processed_list = process_reviews_list(res)
target_words = get_top50(processed_str)
pseudo_words = pseudowords(target_words)

In [27]:
print(target_words)

['use', 'phone', 'one', 'router', 'ipod', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'feature', 'problem', 'good', 'quality', 'zen', 'would', 'also', 'sound', 'computer', 'software', 'picture', 'well', 'really', 'micro', 'take', 'easy', 'thing', 'even', 'first', 'used', 'need', 'creative', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'size', 'music', 'norton', 'little', 'price']


In [28]:
print(pseudo_words)

['esu', 'enohp', 'eno', 'router', 'ipod', 'camera', 'reyalp', 'teg', 'yrettab', 'repaid', 'tcudorp', 'krow', 'ekil', 'great', 'time', 'erutaef', 'melborp', 'good', 'quality', 'zen', 'dluow', 'also', 'dnuos', 'computer', 'erawtfos', 'picture', 'llew', 'yllaer', 'orcim', 'ekat', 'easy', 'thing', 'neve', 'first', 'used', 'need', 'evitaerc', 'bag', 'much', 'want', 'better', 'champ', 'mp', 'look', 'go', 'ezis', 'cisum', 'norton', 'elttil', 'ecirp']


In [46]:
# M = featureVector(target_words, pseudo_words, res)
# clust = getCluster(M)
featureVector(processed_list)

AttributeError: 'KeyedVectors' object has no attribute 'shape'

In [None]:
print(M.shape)

NameError: name 'M' is not defined

In [None]:
print(clust)

In [None]:
# p = getProbability(clust)
# print(p)