In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk
from nltk import FreqDist
nltk.download('stopwords')

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
import random

from collections import Counter
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def read_data(path: str) -> list:
    """Read every regular file in a directory into a list of line lists.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        One element per file, each being the list returned by
        ``readlines()`` for that file (line endings kept). Entries named
        ``README.txt`` and anything that is not a regular file are skipped.
        Files are visited in sorted name order.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and everything derived from it) reproducible.
    for name in sorted(listdir(path)):
        if name == "README.txt":
            continue
        file_path = os.path.join(path, name)
        # Skip sub-directories and other non-file entries.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as handle:
            documents.append(handle.readlines())
    return documents

def process_document(document: str) -> list:
    """Pre-process a piece of text and return its list of terms.

    The text is tokenised with a regular expression, lower-cased, stripped of
    tokens that ``PunktToken`` does not report as non-punctuation, stripped of
    English stopwords, and finally lemmatised with ``WordNetLemmatizer``.

    Args:
        document: Raw text to process.

    Returns:
        The remaining tokens, in their original order, after lemmatisation.
    """
    # NOTE(review): alternatives are tried left to right, so the final
    # currency/percentage alternative can never win: anything it could match
    # starts with a digit (taken by ``\w+...``) or ``$`` (taken by ``\S\w*``).
    # The pattern is left unchanged here to keep tokenisation identical.
    pattern = r'''(?x)        # set flag to allow verbose regexps
                    (?:[A-Z]\.)+     #abbreviations
                    |\w+(?:[-']\w+)*   #word-internal hyphens
                    |'
                    |\[
                    |\#
                    |[-.(]+           #double hyphen, ellipsis, open parenthesis
                    |\S\w*
                    |\$?\d+(?:\.\d+)?%? #currency and percentages
        '''
    # Tokenisation followed by lower-casing.
    tokens = [token.lower() for token in nltk.regexp_tokenize(document, pattern)]
    # Remove punctuation-only tokens.
    tokens = [token for token in tokens if punkt.PunktToken(token).is_non_punct]
    # Remove stopwords. The stopword list is fetched once per call and turned
    # into a set instead of being re-evaluated and scanned for every token.
    english_stopwords = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in english_stopwords]
    # Lemmatisation.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def get_top50(res: list, n: int = 50) -> list:
    """Return the most frequent pre-processed terms across all documents.

    Args:
        res: List of documents, each a list of text lines (as produced by
            ``read_data``).
        n: How many terms to return; defaults to 50.

    Returns:
        Up to ``n`` terms ordered by descending frequency. ``sorted`` is
        stable, so terms with equal counts keep the iteration order of the
        frequency table.
    """
    # Join once instead of growing a string line by line.
    corpus_text = "".join(line for document in res for line in document)
    terms = process_document(corpus_text)
    # Count each term, then rank by count (highest first).
    frequencies = FreqDist(terms)
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    return [term for term, _count in ranked[:n]]

def pseudowords(target_words: list) -> list:
    """Build a pseudoword list from the target words.

    Half of the target words (``len(target_words) // 2``, chosen with
    ``random.sample``) are replaced by their reversed spelling; the rest are
    kept unchanged. The result has the same length and order as the input.

    Args:
        target_words: Words to derive pseudowords from. Not modified.

    Returns:
        A new list in which every sampled word appears reversed.
    """
    sample_size = len(target_words) // 2
    sampled = set(random.sample(target_words, sample_size))
    # Build a fresh list: the previous implementation aliased the input
    # (``pseudowords = target_words``) and therefore overwrote the caller's
    # target words with their pseudowords.
    return [word[::-1] if word in sampled else word for word in target_words]

def featureVector(target_words: list, pseudowords: list, res: list, tokenize=None) -> np.ndarray:
    """Build a word-by-sentence frequency matrix.

    Args:
        target_words: Words forming the first rows of the matrix.
        pseudowords: Words forming the remaining rows.
        res: List of documents, each a list of text lines; every line is
            treated as one sentence.
        tokenize: Optional callable mapping a sentence string to a list of
            terms. Defaults to ``process_document``.

    Returns:
        Array with one row per word in ``target_words + pseudowords`` and one
        column per sentence; each cell is the number of times the word occurs
        among the sentence's terms.
    """
    if tokenize is None:
        tokenize = process_document

    # Rows follow the target words first, then the pseudowords.
    vocabulary = target_words + pseudowords

    # One term-frequency table per sentence, in document order.
    sentence_counts = [
        Counter(tokenize(sentence))
        for document in res
        for sentence in document
    ]

    # A Counter yields 0 for terms it has not seen, so no explicit
    # missing-key handling is needed.
    rows = [[counts[word] for counts in sentence_counts] for word in vocabulary]
    return np.array(rows)

def main(
    path: str = "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews",
) -> None:
    """Run the pipeline and print a slice of the resulting matrix.

    Args:
        path: Directory containing the review files. The default is the
            original author's local checkout; pass another directory to run
            the notebook elsewhere.
    """
    # Reading documents
    res = read_data(path)
    target_words = get_top50(res)
    # Hand over a copy so the target word list is guaranteed to stay intact
    # and distinct from the pseudoword list.
    pseudo_words = pseudowords(list(target_words))
    M = featureVector(target_words, pseudo_words, res)
    # Show columns 30-49 of the first word's row as a quick sanity check.
    print(M[0][30:50])

In [4]:
# Run the pipeline; main() prints M[0][30:50] from the feature matrix.
main()

[1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
