In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import string
import random

# from collections import Counter
import numpy as np
from sklearn.cluster import KMeans


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def read_data(path: str) -> list:
    """ Read files from a directory and then append the data of each file into a list. """
    folder = listdir(path)
    res = []
    for files in folder:
        # check if current path is a file
        if files != "README.txt":
            filePath = os.path.join(path, files)
            if os.path.isfile(filePath):
                with open(filePath, "r") as file:
                    lines = file.readlines()
                res.append(lines)
    return res

def process_document(document: str) -> list:
    """ pre-process a document and return a list of its terms: str->list"""
    text = document.lower()
    # remove numbers
    text_nonum = re.sub(r'\d+', ' ', text)
    
    text_p = "".join([char for char in text_nonum if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    
    stop_words = stopwords.words('english')
    stop_words.append("im")
    stop_words.append("ive")
    
    filtered_words = [word for word in words if word not in stop_words]
    
    lemma = WordNetLemmatizer()
    tokenList = [lemma.lemmatize(word) for word in filtered_words]
    
    return tokenList
    

def get_top50(res: list) -> list:
    doc = ""
    for a in res:
        for b in a:
            doc += b
    # Pre-process documents        
    producedDoc = process_document(doc)
    # Find the 50 most frequently occurred words
    # Get the frequency of each word
    word_frequencies = FreqDist(producedDoc)
    # Sort the dictionary by frequency
    sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    # Select the top 50 words
    target_words = [item[0] for item in sorted_frequencies[:50]]
    
    return target_words

def pseudowords(target_words: list) -> list:
    # Sample half of the target words
    sample_size = len(target_words) // 2
    sample = random.sample(target_words, sample_size)
    # Create pseudowords for the sampled target words
    madeups = [word[::-1] for word in sample]
    # Replace the sampled occurrences of the target words with their pseudowords
    pseudowords = target_words[:]
    for i, word in enumerate(target_words):
        if word in sample:
            # Find the index of the word to be replace
            replacement_index = sample.index(word)
            # Replace the word with the corresponding word from the replacement list
            pseudowords[i] = madeups[replacement_index]
    return pseudowords

def featureVector(target_words: list, pseudowords: list, res: list) -> np.array:
    
    # Create a corpus containing the target words and pseudowords
    corpus = target_words + pseudowords
    
    # Get all the sentences in all documents
    sentences = []
    for doc in res:
        for sen in doc:
            sentences.append(sen)
    
    # Loop through each sentence and count the frequency of each word
    sen_count = []
    for i, sentence in enumerate(sentences):
        counts = dict()
        words = process_document(sentence)
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        sen_count.append(counts)
        
    # Construct Nxd array based on the word-sentence frequency
    pre_M = []
    for word in corpus:
        word_freq = []
        for sentence in sen_count:
            tempfreq = sentence.get(word)
            if tempfreq != None:
                word_freq.append(tempfreq)
            else:
                word_freq.append(0)
        pre_M.append(word_freq)
    M = np.array(pre_M)
    
    # svd
    U, s, V = np.linalg.svd(M)
    M = np.dot(U, np.diag(s))
    
    # norm
    M = M / np.linalg.norm(M, axis=1)[:, None]
    return M

def getCluster(X: np.array) -> np.array:
    # Set the number of clusters to 50
    num_clusters = 50
    
    # Create a KMeans instance with the specified number of clusters
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, max_iter=300, tol=1e-04, random_state=0)
    
    # Cluster the words using the feature matrix
    clusters = kmeans.fit_predict(X)
    
    return clusters

def getProbability(cluster: np.array) -> int:
    cluster_size = len(cluster) // 2 
    temp_count = 0
    for i in range(cluster_size):
        if(target_cluster[i] == pseudo_cluster[i]):
            temp_count += 1
    p = temp_count/cluster_size
    return p

# def main():
#     path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
#     # Reading documents
#     res = read_data(path)
#     target_words = get_top50(res)
#     pseudo_words = pseudowords(target_words)
    
#     M = featureVector(target_words, pseudo_words, res)
#     clust = getCluster(M)
#     print(clust)

In [12]:
# main()

In [13]:
path = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews"
# Reading documents
res = read_data(path)
target_words = get_top50(res)
pseudo_words = pseudowords(target_words)

In [14]:
print(target_words)

['use', 'phone', 'ipod', 'router', 'one', 'camera', 'player', 'get', 'battery', 'diaper', 'product', 'work', 'like', 'great', 'time', 'problem', 'feature', 'good', 'zen', 'quality', 'dont', 'u', 'would', 'also', 'sound', 'software', 'computer', 'picture', 'really', 'micro', 'well', 'take', 'easy', 'thing', 'used', 'even', 'need', 'first', 'much', 'want', 'bag', 'champ', 'better', 'mp', 'look', 'creative', 'size', 'go', 'music', 'little']


In [15]:
print(pseudo_words)

['esu', 'enohp', 'ipod', 'retuor', 'one', 'camera', 'player', 'teg', 'yrettab', 'diaper', 'product', 'work', 'ekil', 'taerg', 'emit', 'problem', 'erutaef', 'good', 'nez', 'ytilauq', 'dont', 'u', 'dluow', 'also', 'dnuos', 'software', 'computer', 'picture', 'yllaer', 'orcim', 'well', 'take', 'ysae', 'thing', 'desu', 'even', 'deen', 'tsrif', 'much', 'tnaw', 'bag', 'pmahc', 'retteb', 'pm', 'look', 'evitaerc', 'size', 'go', 'cisum', 'little']


In [None]:
M = featureVector(target_words, pseudo_words, res)
clust = getCluster(M)

In [None]:
print(M.shape)

(100, 100)


In [8]:
print(clust)
print(len(clust))

[ 7 46  2 16 16  5 35 49 46  8 28  9 21 20  9  6 46  4 29 20 21  1 49 12
 20 27 10 18 19 29 38 39  7 23 39 14 19  0 36 49  3  8 13 15 33 26 34 22
  2 11 24 44  2 43 16  5 35 42  3  8 28  9 17  0  3  6  3  4 30 31 21  1
 40 12 25 27 10 18 37  0 38 39  0 23  0 14 47 42 36  3  3 45 48 32 33 41
 34 22  3 11]
100


In [9]:
p = getProbability(clust)
print(p)

NameError: name 'target_cluster' is not defined