In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so that the files referenced by the
# paths configured further down are readable from this Colab runtime.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# User configuration: edit the path below to point at your own copy of the data.

# Folder containing the review text files; every entry in it except
# "README.txt" is read by the preprocessing cell.
texts_folder_path = "/content/drive/MyDrive/product_reviews"

In [3]:
# Shell escapes: install nltk and pandas into the runtime with pip.
# NOTE(review): versions are unpinned, and pandas is not imported by any of the
# visible cells — confirm whether it is still needed.
!pip install nltk
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from nltk import download

# Fetch the NLTK resources that the preprocessing cell relies on.
download("stopwords")  # corpus read through stopwords.words()
download("wordnet")  # presumably the database behind WordNetLemmatizer
download("punkt")  # presumably the tokenizer model behind word_tokenize
download("omw-1.4")  # NOTE(review): likely a WordNet companion corpus — confirm it is required
download("averaged_perceptron_tagger")  # presumably the model behind pos_tag

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [23]:
import copy
import multiprocessing
import os
import random
import string
from collections import Counter

import numpy as np
from gensim.models import Word2Vec
from nltk import pos_tag
from nltk.cluster import GAAClusterer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Review files to process: every regular file in the folder except the README.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the corpus order (and therefore the occurrence numbering used later) stable
# between runs. Sub-directories are skipped because they cannot be open()-ed.
files = sorted(
    file
    for file in os.listdir(texts_folder_path)
    if file != "README.txt"
    and os.path.isfile(os.path.join(texts_folder_path, file))
)

# Deletion table for str.translate: removes ASCII punctuation plus a few extra
# symbols and the tab character from a token.
translational_table = str.maketrans(
    "",
    "",
    (string.punctuation) + "§―•\t←→",
)

# Stopwords of every language shipped in the NLTK stopwords corpus
# (stopwords.words() is called without a language argument).
stop_words = set(stopwords.words())

# Shared lemmatizer instance, reused for every token.
wordnet_lemmatizer = WordNetLemmatizer()

# CPU count of the runtime; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()


def pos_tagger(nltk_tag):
    """
    Translate a POS tag (as produced by ``nltk.pos_tag``) into the WordNet
    part-of-speech constant expected by ``WordNetLemmatizer.lemmatize``.

    Only the first letter of the tag is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Parameters
    ----------
    nltk_tag : str or None
        The tag to translate.

    Returns
    -------
    The matching ``wordnet`` POS constant, or ``None`` when ``nltk_tag`` is
    ``None`` or does not start with one of the four letters above (callers
    then leave the token un-lemmatized).
    """
    # Identity test: `== None` can be fooled by objects overriding __eq__.
    if nltk_tag is None:
        return None

    if nltk_tag.startswith("J"):
        return wordnet.ADJ
    if nltk_tag.startswith("V"):
        return wordnet.VERB
    if nltk_tag.startswith("N"):
        return wordnet.NOUN
    if nltk_tag.startswith("R"):
        return wordnet.ADV
    return None


# Tokens that are dropped outright before punctuation stripping (clitic-like
# fragments such as "'s" and "n't").
contraction_tokens = {"'s", "'m", "'re", "'ve", "n't"}

# One token list per review; these lists are the "sentences" fed to Word2Vec.
final_processed_reviews = []
# Every kept token of the whole corpus, in reading order (used for counting).
processed_words = []
for filename in files:
    with open(
        os.path.join(texts_folder_path, filename), "r", encoding="utf-8-sig"
    ) as file:
        raw_text = file.read()

    # Lower-case, replace the forward slash with a space, and drop empty lines
    raw_lines = [
        line for line in raw_text.lower().replace("/", " ").split("\n") if line != ""
    ]

    ## Grouping sentence lines into reviews
    # A line containing "##" is a sentence line: the text after the first "##"
    # is the sentence (partition keeps any later "##" inside the sentence
    # instead of discarding what follows it). A line without "##" marks the
    # boundary between two reviews.
    reviews = []
    current_review = []
    for raw_line in raw_lines:
        _prefix, separator, sentence_text = raw_line.partition("##")
        if separator:
            current_review.append(sentence_text)
        else:
            reviews.append(current_review)
            current_review = []
    # Flush the review that was still open when the file ended
    reviews.append(current_review)

    # Boundary lines that follow each other produce empty reviews; drop them
    reviews = [review for review in reviews if review]

    # Processing each review
    for review in reviews:
        processed_lines = []
        # Processing each sentence/line in the review
        for line in review:
            ############ POS TAGGING AND LEMMATIZATION ############
            # Punctuation is still present here; it is stripped further down.
            tokens_tags = pos_tag(word_tokenize(line))

            # Lemmatize with the WordNet POS where one exists; tokens whose tag
            # has no WordNet equivalent are kept unchanged.
            lemmatized_tokens = []
            for token, tag in tokens_tags:
                wordnet_tag = pos_tagger(tag)
                if wordnet_tag is None:
                    lemmatized_tokens.append(token)
                else:
                    lemmatized_tokens.append(
                        wordnet_lemmatizer.lemmatize(token, wordnet_tag)
                    )
            ####################################################################

            # Removing stopwords
            tokens_without_sw = [
                token for token in lemmatized_tokens if token not in stop_words
            ]

            # Dropping contraction fragments, then stripping punctuation
            # characters from the remaining tokens
            tokens_without_punct = [
                token.translate(translational_table)
                for token in tokens_without_sw
                if token != "" and token not in contraction_tokens
            ]

            # Removing tokens that became empty or are purely numeric
            processed_lines += [
                token
                for token in tokens_without_punct
                if token != "" and not token.isnumeric()
            ]

        processed_words.extend(processed_lines)
        final_processed_reviews.append(processed_lines)

# Occurrence count of every processed token across the corpus
word_counts = Counter(processed_words)

# [word, count] pairs, most frequent first; words with equal counts stay in
# first-seen order.
sorted_words_counts = [[word, count] for word, count in word_counts.most_common()]

# Acquiring the 50 most frequent words and their counts.
# Palindromes are skipped: their reversed "pseudo-word" would be identical to
# the word itself.
top_50_words_counts = []
for word, count in sorted_words_counts:
    if len(top_50_words_counts) == 50:
        break
    if word != word[::-1]:
        top_50_words_counts.append((word, count))

# Fail with a clear message instead of running past the end of the word list
if len(top_50_words_counts) < 50:
    raise ValueError(
        "The corpus contains only %d non-palindromic distinct words; 50 are required."
        % len(top_50_words_counts)
    )

# Top-50 words
top_50_words = [word for word, count in top_50_words_counts]
# Pseudo made-up words: each top word spelled backwards.
# NOTE(review): a reversed word can coincide with a real token of the corpus;
# such collisions are not filtered here.
reverse_top_50_words = [word[::-1] for word in top_50_words]

# Top 50 words followed by their pseudo-words (index k pairs with index k + 50)
one_hundred_words = top_50_words + reverse_top_50_words

# Dictionary for top 50 words
# Tracking counts with ->     word:count
top_50_words_counts = dict(top_50_words_counts)

# Number of real words under test; their pseudo-words occupy the second half of
# one_hundred_words, so index k pairs with index k + n_words.
n_words = len(top_50_words)
# Position of each real word in top_50_words, for constant-time lookup
top_word_index = {word: idx for idx, word in enumerate(top_50_words)}

percentages = []
# Performing the experiment 10 times (sample -> replace -> train -> cluster -> score)
for run in range(10):
    # For every top word, randomly pick half of its occurrence numbers
    # (0-based, in corpus reading order) to be replaced by the pseudo-word.
    # Sets make the membership test below O(1).
    replace_list = []
    for word in top_50_words:
        occurrence_ids = list(range(top_50_words_counts[word]))
        random.shuffle(occurrence_ids)
        replace_list.append(set(occurrence_ids[: top_50_words_counts[word] // 2]))

    # Building a modified copy of the corpus in a single pass, leaving
    # final_processed_reviews untouched. The decision is taken on the ORIGINAL
    # token, so a token is never rewritten twice even when one top word is the
    # reverse of another.
    occurrences_seen = [0] * n_words
    replaced_processed_reviews = []
    for review in final_processed_reviews:
        replaced_review = []
        for token in review:
            target = top_word_index.get(token)
            if target is not None:
                if occurrences_seen[target] in replace_list[target]:
                    token = reverse_top_50_words[target]
                occurrences_seen[target] += 1
            replaced_review.append(token)
        replaced_processed_reviews.append(replaced_review)

    # Setting up the parameters of the model one-by-one.
    # max(1, ...) keeps at least one worker on a single-core machine.
    # NOTE(review): `size=`, `init_sims` and `wv.vocab` look like the gensim 3.x
    # spellings — confirm against the installed gensim version.
    w2v = Word2Vec(min_count=5, window=6, sg=1, size=160, workers=max(1, cores - 1))
    # Building the vocabulary from a sequence of sentences
    w2v.build_vocab(replaced_processed_reviews, progress_per=10000)
    # Training the model
    w2v.train(
        replaced_processed_reviews,
        total_examples=w2v.corpus_count,
        epochs=20,
        report_delay=1,
    )

    w2v.init_sims(replace=True)
    w2v_vectors = w2v.wv.vectors
    w2v_indices = {word: w2v.wv.vocab[word].index for word in w2v.wv.vocab}

    # Words occurring fewer than min_count times are absent from the vocabulary;
    # name them explicitly rather than failing on the first lookup.
    missing_words = [word for word in one_hundred_words if word not in w2v_indices]
    if missing_words:
        raise KeyError(
            "Words missing from the Word2Vec vocabulary: %s" % ", ".join(missing_words)
        )

    # One vector per real word, followed by one per pseudo-word
    feature_vecs = [w2v_vectors[w2v_indices[word]] for word in one_hundred_words]

    # Clustering with Group Average Agglomerative clustering into n_words
    # clusters; the second argument requests the per-vector cluster labels.
    clusterer = GAAClusterer(n_words)
    labels = clusterer.cluster(feature_vecs, True)

    # Percentage of words whose real and pseudo forms share a cluster
    matched_pairs = sum(
        1 for pair in range(n_words) if labels[pair] == labels[pair + n_words]
    )
    percentages.append((matched_pairs / n_words) * 100)

percentages = np.array(percentages)
mean = np.mean(percentages)
print("Average Accuracy Percentage: ", mean)
std = np.std(percentages)
print("Standard Deviation: ", std)

Average Accuracy Percentage:  79.6
Standard Deviation:  2.4979991993593593
