In [1]:
# Importing metadata zip file and converting it to dataframe 
import gzip
import pandas as pd

def getDF(path):
  # Unzip the file, load in each line as an object
  g = gzip.open(path, 'rb')
  g = [json.loads(l) for l in g]

  # Map to a dictionary, then load in as a dataframe
  dict_df = {i: d for (i, d) in enumerate(g)}
  return pd.DataFrame.from_dict(dict_df, orient='index')

df = getDF('All_Beauty.json.gz')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Basic cleaning steps

# Remove empty or null descriptions
df = df.drop(df[ ( (df["reviewText"] == "" ) | ( df["reviewText"].isnull()) ) ].index)

# Selecting only the columns that are required for analysis
colums_reviewtext_asin = ["reviewText", "asin"]
df = df[colums_reviewtext_asin]

# Removing \n
df["newReviewText"] = df.reviewText.map(lambda x: x.replace("\n",""))


# Remove all duplicate of reviews so can have only unique reviews for classifier
df[df.duplicated("newReviewText")]
df = df.drop_duplicates("newReviewText", keep="last")
# Matches number of unique reviews now
df.describe() 

Unnamed: 0,reviewText,asin,newReviewText
count,319639,319639,319639
unique,319639,31438,319639
top,My husband wanted to reading about the Negro ...,B000FOI48G,My husband wanted to reading about the Negro ...
freq,1,8268,1


In [4]:
# Remove reviews within set limit of words

# Split reviews at white spaces
df["num_words_reviews"] = df["newReviewText"].apply(lambda x: len(x.split()))

# Remove reviews that don't meet this criteria
df = df[(df["num_words_reviews"] <= 181) & (df["num_words_reviews"] >= 14)]

df["newReviewText"].describe() #210657

count                                                210657
unique                                               210657
top       My  husband wanted to reading about the Negro ...
freq                                                      1
Name: newReviewText, dtype: object

In [7]:
# Regrex for removing characters
import re

# Spacy for spell check
import spacy
import contextualSpellCheck
nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# Spacy stop word creation
stopping_words = spacy.lang.en.stop_words.STOP_WORDS
stopping_words_new = stopping_words
# -----------------------------
print(stopping_words)
print(len(stopping_words))

# NLTK for tokenization and lemmatization
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')


{'throughout', 'me', 'beside', 'namely', 'go', 'less', 'are', 'everything', 'so', 'sometimes', 'seemed', 'but', 'keep', 'become', 'should', 'cannot', 'is', 'than', 'seems', 'get', 'more', 'can', 'who', 'quite', 'around', 'even', 'might', 'not', 'top', 'neither', 'itself', 'whom', 'any', 'at', 'yourself', 'other', 'hereupon', 'nevertheless', 'wherever', 'via', 'only', 'somewhere', 'perhaps', 'made', 'yet', 'see', '‘ll', '‘m', 'thereafter', 'an', 're', 'where', '’ll', 'here', 'most', 'such', 'bottom', 'there', 'well', '‘ve', 'just', 'about', 'twelve', 'does', 'whereafter', 'eleven', 'else', 'using', 'ourselves', 'on', 'latterly', 'besides', 'that', 'whoever', 'into', 'or', 'sometime', 'becomes', 'had', 'us', 'almost', 'ca', 'someone', 'least', "'ll", 'somehow', 'due', 'n’t', 'whereas', 'move', 'hundred', 'again', 'sixty', 'also', 'against', 'thereby', 'serious', 'our', 'afterwards', '’ve', 'anyone', 'rather', 'everyone', 'themselves', 'nowhere', 'anyway', 'above', "'s", 'something', 'tak

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Removing certain stop words from the list of stop words because they are part of our seed terms
stopping_words_to_keep = ['no']
for word in list(stopping_words_new):
    if word in stopping_words_to_keep:
        stopping_words_new.remove(word)
print(stopping_words_new)
print(len(stopping_words_new))


{'throughout', 'me', 'beside', 'namely', 'go', 'less', 'are', 'everything', 'so', 'sometimes', 'seemed', 'but', 'keep', 'become', 'should', 'cannot', 'is', 'than', 'seems', 'get', 'more', 'can', 'who', 'quite', 'around', 'even', 'might', 'not', 'top', 'neither', 'itself', 'whom', 'any', 'at', 'yourself', 'other', 'hereupon', 'nevertheless', 'wherever', 'via', 'only', 'somewhere', 'perhaps', 'made', 'yet', 'see', '‘ll', '‘m', 'thereafter', 'an', 're', 'where', '’ll', 'here', 'most', 'such', 'bottom', 'there', 'well', '‘ve', 'just', 'about', 'twelve', 'does', 'whereafter', 'eleven', 'else', 'using', 'ourselves', 'on', 'latterly', 'besides', 'that', 'whoever', 'into', 'or', 'sometime', 'becomes', 'had', 'us', 'almost', 'ca', 'someone', 'least', "'ll", 'somehow', 'due', 'n’t', 'whereas', 'move', 'hundred', 'again', 'sixty', 'also', 'against', 'thereby', 'serious', 'our', 'afterwards', '’ve', 'anyone', 'rather', 'everyone', 'themselves', 'nowhere', 'anyway', 'above', "'s", 'something', 'tak

In [9]:
# Preprocessing of keywords

def preprocessing_keywords(raw_string):
    
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)

    # Make everything lowercase
    lowercase_column = no_html.lower()

    # TODO have a look at apostrophe again
    # Remove apostrophe 
    without_apostrophe = re.sub(r'[\']', '', lowercase_column)
    
    # Remove all non alphabetic instances that aren't a space and replace them with a space 
    alphabetic_keywords = re.sub(r'[^a-z\s]', ' ', without_apostrophe)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_keywords) 
    
    # Remove stopping words
    tokens_without_stopping_words = [token for token in tokens if token not in stopping_words_new]

    # Lemmatize tokens using nltk and join them into phrases
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words


In [10]:
###############################################################
#                      Cleaning Keywords                                

#ngram (1,2)
enviromental = ['recyclable', 'recycled', ' Environmentally friendly ','biodegradable', ' no packaging', 'sustainable', ' ecological ', 'plastic-free', 'compostable', 'renewable', 'reusable', 'biodegradable', 'organic', 'refillable', 'refills', 'recycled', 'reef safe',  'no oxybenzone ',  'triclosan-free', 'low-impact', 'soil association', 'conservation', 'COSMOS ', 'NATRUE ', 'FSC', 'eco']   


social = ['cruelty-free', 'equality', 'PETA', 'leaping bunny', 'fair trade', 'no animal', 'ethical ', ' Non-profit ', ' Donated ']  


economic = ['Fair trade', 'renewable', 'locally sourced', 'small business', 'recycled', ' Fair wage', 'community', ' Economic prosperity ', ' local ingredients ', 'local farmers ']   


health = ['non-toxic', 'no toxic', 'organic', 'paraben-free', 'triclosan-free', 'phthalates-free', 'non-nano', 'formaldehyde-free', 'non GMO', 'soil association', 'COSMOS ', 'NATRUE', 'USDA', ' Fragrance-free ', ' no fragrance ', ' sulfate free ']   



# Cleaning keywords/phrases 
enviro_p = list(set(preprocessing_keywords(phrase) for phrase in enviromental))
social_p = list(set(preprocessing_keywords(phrase) for phrase in social))
economic_p = list(set(preprocessing_keywords(phrase) for phrase in economic))
health_p = list(set(preprocessing_keywords(phrase) for phrase in health))

print(enviro_p)
print(social_p)
print(economic_p)
print(health_p)


['reusable', 'recycled', 'low impact', 'environmentally friendly', 'cosmos', 'eco', 'recyclable', 'natrue', 'reef safe', 'triclosan free', 'sustainable', 'refill', 'plastic free', 'ecological', 'conservation', 'refillable', 'compostable', 'soil association', 'biodegradable', 'no packaging', 'renewable', 'no oxybenzone', 'fsc', 'organic']
['cruelty free', 'ethical', 'donated', 'equality', 'peta', 'no animal', 'leaping bunny', 'fair trade', 'non profit']
['recycled', 'economic prosperity', 'local farmer', 'renewable', 'local ingredient', 'small business', 'locally sourced', 'community', 'fair trade', 'fair wage']
['natrue', 'phthalates free', 'fragrance free', 'no toxic', 'usda', 'paraben free', 'triclosan free', 'no fragrance', 'non nano', 'cosmos', 'sulfate free', 'formaldehyde free', 'soil association', 'non gmo', 'organic', 'non toxic']


In [11]:

# Spell check (creating dictionary)
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# term_index is the column of the term and count_index is the column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Combining all keywords as individual words
anchor_words_combined = list(set([word for sublist in [enviro_p, social_p, economic_p, health_p] for phrase in sublist for word in phrase.split()]))

# Add all words to the dictionary
for word in anchor_words_combined:
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if len(suggestions) > 0 and suggestions[0].term == word:
        # the best suggestion for this word is itself - it must exist in the dictionary
        continue

    # add to the dictionary with frequency 1 
    sym_spell.create_dictionary_entry(word, 1)
    

In [12]:
# Preprocessing of reviews

def preprocessing(raw_string):
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)

    # Make everything lowercase
    lowercase_column = no_html.lower()
    
    # Remove all non alphabetic instances that aren't a space and replace them with a space 
    alphabetic_column = re.sub(r'[^a-z\s]', ' ', lowercase_column)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_column) 
    
    # For each word we will have a list of suggestions
    spelling_suggestions = [sym_spell.lookup(x, Verbosity.CLOSEST, max_edit_distance=2) for x in tokens]
    # Drop those that have no suggestions and top[0] suggestions for those who do
    spelling_suggestions = [x[0].term for x in spelling_suggestions if len(x) > 0]
    
    # Remove stopping words 
    tokens_without_stopping_words = [token for token in spelling_suggestions if token not in stopping_words_new]

    # Lemmatize tokens (nltk)
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words


In [13]:
# Clean reviews
df["clean_reviews"] = df["newReviewText"].apply(preprocessing)
# Export to CSV for sense checking
df["clean_reviews"]

df["clean_reviews"].to_csv("Clean Reviews.csv")

In [14]:
# Remove any empty reviews or nulls  
df = df.drop(df[ ( (df["clean_reviews"] == "" ) | ( df["clean_reviews"].isnull()) ) ].index)

In [16]:
# Reset index of the data frame for easy use
df = df.reset_index()
df

Unnamed: 0,index,reviewText,asin,newReviewText,num_words_reviews,clean_reviews
0,1,My husband wanted to reading about the Negro ...,0143026860,My husband wanted to reading about the Negro ...,29,husband wanted reading negro baseball great ad...
1,3,I am already a baseball fan and knew a bit abo...,0143026860,I am already a baseball fan and knew a bit abo...,23,baseball fan knew bit negro league learned lot...
2,4,This was a good story of the Black leagues. I ...,0143026860,This was a good story of the Black leagues. I ...,67,good story black league bought book teach high...
3,7,I didn't like this product it smudged all unde...,014789302X,I didn't like this product it smudged all unde...,14,like product smudged eye thoroughly day
4,9,it burns your eyes when u put it on and very ...,014789302X,it burns your eyes when u put it on and very ...,35,burn eye light going forth lot dark eyeliner c...
...,...,...,...,...,...,...
210647,371338,love! love! love! these what a Time Saver for ...,B01HJEGTYK,love! love! love! these what a Time Saver for ...,38,love love love time saver people like hair gre...
210648,371340,It was awful. It was super frizzy and I tried ...,B01HJEGTYK,It was awful. It was super frizzy and I tried ...,28,awful super frizzy tried comb fell completely ...
210649,371341,I was skeptical about buying this. Worried it...,B01HJEGTYK,I was skeptical about buying this. Worried it...,43,sceptical buying worried look obviously fake s...
210650,371343,Way lighter than photo\nNot mix blend of color...,B01HJEGTYK,Way lighter than photoNot mix blend of colorsN...,25,way lighter photon mix blend quality shown vol...


In [18]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Setting vectorizer to take phrases in range 1 to 2
vectorizer = CountVectorizer(binary=True, ngram_range= (1,2)) 

# Fit the vectorizer and transform it on the reviews corpus
doc_word = vectorizer.fit_transform(df["clean_reviews"])

words = vectorizer.get_feature_names_out()
print(len(words))

1390211


In [19]:
# Code based on https://github.com/gregversteeg/corex_topic/blob/mastera/corextopic/example/corex_topic_example.ipynb
import corextopic.corextopic as ct
import corextopic.vis_topic as vt

anchor_words = [enviro_p, social_p, economic_p, health_p]

anchored_topic_model = ct.Corex(n_hidden=4, seed = 11)
anchored_topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=11)

# Same results of corex model for evaluation
vt.vis_rep(anchored_topic_model, column_label=words, prefix='CX Reviews')#FINAL 17-4 Reviews CorEx Model

topics_list = anchored_topic_model.get_topics()

# Get top 10 phrases for each topic
top_words = []
for sublist in topics_list:
    phrase = [item[0] for item in sublist]
    top_words.append(phrase)

for index, phrase in enumerate(top_words):
    print(f"Theme {index+1}:", phrase)
    


Print topics in text file
Theme 1: ['organic', 'refill', 'teeth', 'review', 'water', 'use', 'gum', 'don', 'no', 'time']
Theme 2: ['cruelty free', 'no animal', 'look', 'looking', 'donated', 'update', 'cruelty', 'fair trade', 'slightly', 'add']
Theme 3: ['razor', 'shave', 'blade', 'shaver', 'morello', 'shaving', 'battery', 'close', 'trimmer', 'electric']
Theme 4: ['skin', 'organic', 'smell', 'scent', 'dry', 'face', 'feel', 'product', 'natural', 'cream']


In [None]:
# GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA # GLDA 

In [21]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>     GLDA VECTORIZATION         <<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from sklearn.feature_extraction.text import CountVectorizer

# Setting vectorizer to take phrases in range 1 to 2 (non binary for GLDA)
vectorizer_GLDA = CountVectorizer(binary=False, ngram_range= (1,2))

# Fit the vectorizer and transform & fit it on the description corpus
vectorised_descriptions_corpus = vectorizer_GLDA.fit_transform(df["clean_reviews"])
# TODO COMMENT
word2id = vectorizer_GLDA.vocabulary_ #columns dictionary

vocab_GLDA = vectorizer_GLDA.get_feature_names_out() 
print(len(vocab_GLDA))


1390211


In [29]:
anchor_words = [enviro_p, social_p, economic_p, health_p]
print(anchor_words)

achors_to_remove =  ["cosmos", "natrue", "triclosan free", "soil association", "no oxybenzone", "fsc", "leaping bunny","economic prosperity", "local ingredient"]

#enviro_to_remove = ["cosmos", "natrue", "triclosan free", "soil association", "no oxybenzone", "fsc"]
#social_to_remove = ["leaping bunny"]
#economic_to_remove = ["economic prosperity", "local ingredient"]
#health_to_remove = ["natrue", "triclosan free", "cosmos", "soil association"]

for word in achors_to_remove:
    for i in range(4):
        if word in anchor_words[i]:
            anchor_words[i].remove(word)
        
print("------------------")
print(anchor_words)

[['reusable', 'recycled', 'low impact', 'environmentally friendly', 'eco', 'recyclable', 'reef safe', 'sustainable', 'refill', 'plastic free', 'ecological', 'conservation', 'refillable', 'compostable', 'biodegradable', 'no packaging', 'renewable', 'organic'], ['cruelty free', 'ethical', 'donated', 'equality', 'peta', 'no animal', 'fair trade', 'non profit'], ['recycled', 'local farmer', 'renewable', 'small business', 'locally sourced', 'community', 'fair trade', 'fair wage'], ['phthalates free', 'fragrance free', 'no toxic', 'usda', 'paraben free', 'no fragrance', 'non nano', 'sulfate free', 'formaldehyde free', 'non gmo', 'organic', 'non toxic']]
------------------
[['reusable', 'recycled', 'low impact', 'environmentally friendly', 'eco', 'recyclable', 'reef safe', 'sustainable', 'refill', 'plastic free', 'ecological', 'conservation', 'refillable', 'compostable', 'biodegradable', 'no packaging', 'renewable', 'organic'], ['cruelty free', 'ethical', 'donated', 'equality', 'peta', 'no an

In [31]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>      Guided - LDA          <<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Based on https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda who used workaround GLDA as well
from lda import guidedlda as glda
import numpy as np

# Defining model
model = glda.GuidedLDA(n_topics=4, alpha=12.5, eta=0.1, n_iter=2000, random_state=99, refresh=50)

# Topics for the model: create a mapping from feature column index to topic ID
anchor_topics = {}
for topic_id in range(len(anchor_words)):
    key_word_list = anchor_words[topic_id]
    for word in key_word_list:
        col_index = word2id[word]
        anchor_topics[col_index] = topic_id

# Train model
model.fit(vectorised_descriptions_corpus, seed_topics=anchor_topics, seed_confidence=1.1)

# Extract the importance of each feature towards each topic, and display the top ones
NUM_TOP_FEATURES = 10
words_in_topics = model.topic_word_
for topic_id in range(len(words_in_topics)):
    # Get the feature importance for this one topic
    feature_importance = words_in_topics[topic_id]
    # Get the indices of the list when sorted
    sorted_indices = np.argsort(feature_importance)
    # Reverse to get most important first, and take the most important ones
    important_features_indices = sorted_indices[::-1][:NUM_TOP_FEATURES]
    # Use our vocab list to map to the feature names
    top_words = np.array(vocab_GLDA)[important_features_indices]
    
    print(f'Topic {topic_id}: {", ".join(top_words)}')

INFO:lda:n_documents: 210652
INFO:lda:vocab_size: 1390211
INFO:lda:n_words: 8054004
INFO:lda:n_topics: 4
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -100424456
INFO:lda:<50> log likelihood: -92426867
INFO:lda:<100> log likelihood: -91946922
INFO:lda:<150> log likelihood: -91842709
INFO:lda:<200> log likelihood: -91819648
INFO:lda:<250> log likelihood: -91799651
INFO:lda:<300> log likelihood: -91794569
INFO:lda:<350> log likelihood: -91773638
INFO:lda:<400> log likelihood: -91756146
INFO:lda:<450> log likelihood: -91748331
INFO:lda:<500> log likelihood: -91741842
INFO:lda:<550> log likelihood: -91744843
INFO:lda:<600> log likelihood: -91744097
INFO:lda:<650> log likelihood: -91747044
INFO:lda:<700> log likelihood: -91740703
INFO:lda:<750> log likelihood: -91748869
INFO:lda:<800> log likelihood: -91741702
INFO:lda:<850> log likelihood: -91750023
INFO:lda:<900> log likelihood: -91751218
INFO:lda:<950> log likelihood: -91742182
INFO:lda:<1000> log likelihood: -91747855
INFO:lda:<105

Topic 0: color, great, look, like, brush, little, nice, love, price, quality
Topic 1: product, hair, work, use, don, time, day, week, no, review
Topic 2: good, year, great, razor, bought, shave, time, easy, better, no
Topic 3: skin, love, product, smell, like, feel, use, face, dry, great


In [32]:
# Transform the model to be able to set a threshold and get document topic lables
doc_topic = model.transform(vectorised_descriptions_corpus)
print(doc_topic)

# Set threshold
threshold = 0.45

doc_topic_thresholded = (doc_topic >= threshold).astype(int)
print(doc_topic_thresholded)


# Output it
df_glda_labels = pd.DataFrame(doc_topic_thresholded)
df_glda_labels.to_csv('FINAL 17-4 Reviews GLDA Model.txt', sep='\t', header=False)

[[0.34136359 0.32246684 0.2077321  0.12843747]
 [0.47343705 0.26878913 0.03333832 0.2244355 ]
 [0.34026852 0.28293264 0.15424667 0.22255218]
 ...
 [0.51445361 0.37561892 0.03683785 0.07308961]
 [0.51663265 0.21402148 0.0302978  0.23904806]
 [0.45747977 0.18157116 0.26995996 0.09098911]]
[[0 0 0 0]
 [1 0 0 0]
 [0 0 0 0]
 ...
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]]
