In [3]:
# Importing metadata zip file and converting it to dataframe

import json
import gzip
import pandas as pd

def getDF(path):
  # Unzip the file, load in each line as an object
  g = gzip.open(path, 'rb')
  g = [json.loads(l) for l in g]

  # Map to a dictionary, then load in as a dataframe
  dict_df = {i: d for (i, d) in enumerate(g)}
  return pd.DataFrame.from_dict(dict_df, orient='index')

df = getDF('meta_ALL_Beauty.json.gz')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:

# Selecting only the colums that are required for analysis

colums_description_asin = ["description","asin"]
df = df[colums_description_asin]

df.describe()

Unnamed: 0,description,asin
count,32892,32892
unique,13751,32488
top,[],B00027CDOW
freq,17773,2


In [5]:
# Descriptions in the json file are a list of strings, converting to sting for easy cleaning
df["newdescription"] = df.description.map(lambda x: ".".join(x).replace("\n",""))


In [6]:
# Remove empty descriptions
#df = df.drop(df[df["newdescription"] == ""].index)

#print(len(df)) 



In [7]:
df = df.drop(df[ ( (df["newdescription"] == "" ) | ( df["newdescription"].isnull()) ) ].index)

df

Unnamed: 0,description,asin,newdescription
0,[Loud 'N Clear Personal Sound Amplifier allows...,6546546450,Loud 'N Clear Personal Sound Amplifier allows ...
1,[No7 Lift & Luminate Triple Action Serum 50ml ...,7178680776,No7 Lift & Luminate Triple Action Serum 50ml b...
2,[No7 Stay Perfect Foundation now stays perfect...,7250468162,No7 Stay Perfect Foundation now stays perfect ...
4,[Lacto Calamine Skin Balance Daily Nourishing ...,7414204790,Lacto Calamine Skin Balance Daily Nourishing L...
5,[Mary Kay Satin Hands Peach Hand Cream Travel ...,7535842801,Mary Kay Satin Hands Peach Hand Cream Travel S...
...,...,...,...
32880,[Move over soap on a rope! This heavy-duty Bri...,B01HIHLFOC,Move over soap on a rope! This heavy-duty Bric...
32884,[Eau de parfum spray vial mini design house: y...,B01HIPOQ2M,Eau de parfum spray vial mini design house: yv...
32885,[Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stu...,B01HIUEEHO,Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stuf...
32886,[New and unused product. 100% authentic Benefi...,B01HIWKGOM,New and unused product. 100% authentic Benefit...


In [8]:
# Remove duplicate asin so can have only unique products
df[df.duplicated("asin")]
df = df.drop_duplicates("asin", keep="last")
df.describe()

Unnamed: 0,description,asin,newdescription
count,14821,14821,14821
unique,13749,14821,13743
top,"[For over 60 years, Betty Dain Creations, Inc....",6546546450,"For over 60 years, Betty Dain Creations, Inc. ..."
freq,59,1,59


In [9]:
# Remove duplicate description
df[df.duplicated("newdescription")]
df = df.drop_duplicates("newdescription", keep="last")
df.describe() 

Unnamed: 0,description,asin,newdescription
count,13743,13743,13743
unique,13743,13743,13743
top,[Loud 'N Clear Personal Sound Amplifier allows...,6546546450,Loud 'N Clear Personal Sound Amplifier allows ...
freq,1,1,1


In [10]:
# Removing redundant old description column
colums_description_asin = ["newdescription","asin"]
df = df[colums_description_asin]

df.describe()

Unnamed: 0,newdescription,asin
count,13743,13743
unique,13743,13743
top,Loud 'N Clear Personal Sound Amplifier allows ...,6546546450
freq,1,1


In [11]:
# Identify range of descriptions in after duplicates removal

df_descriptions_without_empty = df["newdescription"].apply(lambda x: len(x.split()))

df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])

# 97% upper limit of 259 (same % milit as reviews)
# 11 words (same numbe of words as reviews) 20% because no assuption is made on how much more each one is informative as well as descriptions 
# being a smaller sample


count    13743.000000
mean        63.643600
std         91.940065
min          1.000000
10%          7.000000
15%          9.000000
20%         11.000000
25%         14.000000
30%         17.000000
50%         39.000000
75%         79.000000
85%        114.000000
90%        148.000000
92%        169.000000
95%        217.000000
97%        259.000000
98%        288.000000
99%        324.000000
max       3224.000000
Name: newdescription, dtype: float64

In [12]:
# Remove descriptions with more or less than xx pre-cleaned words

# Split at any white space 
df["num_words_description"] = df["newdescription"].apply(lambda x: len(x.split()))

# Check if under or equal to 80% upper limmit words fulfils withs condition and set it
df = df[(df["num_words_description"] <= 259) & (df["num_words_description"] >= 11)]

df["newdescription"].describe()

count                                                 10710
unique                                                10710
top       Loud 'N Clear Personal Sound Amplifier allows ...
freq                                                      1
Name: newdescription, dtype: object

In [13]:
# Check shortening worked
df_descriptions_without_empty = df["newdescription"].apply(lambda x: len(x.split()))

df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])


count    10710.000000
mean        65.241923
std         53.053433
min         11.000000
10%         15.000000
15%         18.000000
20%         23.000000
25%         27.000000
30%         31.000000
50%         49.000000
75%         86.000000
85%        115.000000
90%        142.000000
92%        157.000000
95%        185.000000
97%        211.000000
98%        225.000000
99%        243.000000
max        259.000000
Name: newdescription, dtype: float64

In [14]:
# Regrex for character removal
import re

# Spacy for spell check
import spacy
import contextualSpellCheck
nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# NLTK for tokenisation and lemmatization
import nltk

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

nltk.download('wordnet')

# Spacy stop word creation
stopping_words = spacy.lang.en.stop_words.STOP_WORDS
stopping_words_new = stopping_words


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
stopping_words_to_remove = ['without' , 'not', 'less', 'noting', 'none','no']
for word in list(stopping_words_new):
    if word in stopping_words_to_remove:
        stopping_words_new.remove(word)
print(stopping_words_new)

{'among', 'seeming', 'any', 'if', 'whenever', 'they', 'anyone', 'together', 'twenty', 'somewhere', 'whether', 'full', 'keep', 'been', 'wherein', 'go', 'anyway', '’ve', 'front', 'a', 'am', 'herein', 'has', 'behind', 'her', 'can', 'much', 'herself', 'were', 'afterwards', 'hundred', 'these', 'enough', 'most', 'throughout', 'here', 'fifty', 'well', 'even', 'thereby', 'somehow', 'whereafter', 'noone', 'forty', 'never', 'ever', 'hereby', 'four', 'hers', 'ten', 'seems', 'side', 'within', 'many', 'themselves', "'d", '‘d', 'whereas', 'just', 'nothing', 'already', 'see', 'is', 'several', 'on', 'across', 'very', 'almost', 're', 'another', 'more', 'whatever', 'through', 'still', '‘s', 'mostly', 'himself', 'this', 'nor', 'must', 'doing', 'per', 'our', 'own', 'move', 'thereupon', 'once', 'onto', 'his', 'she', 'everyone', 'n‘t', 'your', 'there', 'nevertheless', 'due', 'again', 'he', 'therefore', 'whereupon', "'re", 'latter', 'with', 'twelve', 'yours', '’d', 'else', 'back', 'might', 'whither', 'be', '

In [16]:
# Preprocessing of descriptions

def preprocessing(raw_string):
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)
    #print("after removing html", no_html)


    # Make everything lowercase
    lowercase_column = no_html.lower()
    #print("lowercase", lowercase_column)

    # Remove apostrophe to enable spell check to correct words with apostrophe
    without_apostrophe = re.sub(r'[\']', '', lowercase_column)

    # ! Need to double check again where best to use this spell check
    # 
    # .pipe for batches of text
    #doc = list(nlp.pipe(without_apostrophe))
    #doc = nlp(without_apostrophe)

    #spell_checked = doc._.outcome_spellCheck

    # Remove all non alphabetic instances that aren't a space and replace them with a space using Regrex
    alphabetic_column = re.sub(r'[^a-z\s]', ' ', without_apostrophe)
    #print("removed numerical and punctuation", alphabetic_column)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_column)

    # Remove stopping words using Spacy library
    tokens_without_stopping_words = [token for token in tokens if token not in stopping_words_new]

    # Lemmatize tokens using nltk and join them into sentances
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words


In [17]:
df["clean_description"] = df["newdescription"].head(1000).apply(preprocessing)

df["clean_description"]

0        loud n clear personal sound amplifier allows t...
2        no stay perfect foundation stay perfect longer...
4        lacto calamine skin balance daily nourishing l...
5        mary kay satin hand peach hand cream travel si...
7        according legend brother native origin black b...
                               ...                        
32879                                                  NaN
32880                                                  NaN
32884                                                  NaN
32885                                                  NaN
32890                                                  NaN
Name: clean_description, Length: 10710, dtype: object

In [18]:
# Remove any empty descriptions that appear because of head()
df = df.drop(df[ ( (df["clean_description"] == "" ) | ( df["clean_description"].isnull()) ) ].index)

In [19]:
# Identifying range of descriptions in after cleaning
 
df_description_without_empty_clean = df["clean_description"].apply(lambda x: len(x.split()))
#df_descriptions_without_empty.describe()
df_description_without_empty_clean.describe([0.03,0.1,0.2,0.3,0.75,0.85,0.90,0.95])

count    1000.00000
mean       44.84100
std        32.72263
min         4.00000
3%          9.00000
10%        13.00000
20%        19.00000
30%        24.70000
50%        35.00000
75%        59.00000
85%        76.15000
90%        92.00000
95%       114.00000
max       221.00000
Name: clean_description, dtype: float64

In [20]:
from collections import Counter
Counter(" ".join(df["clean_description"]).split()).most_common(100)

[('skin', 762),
 ('hair', 372),
 ('oil', 288),
 ('use', 270),
 ('natural', 268),
 ('product', 266),
 ('body', 229),
 ('fragrance', 214),
 ('help', 207),
 ('not', 178),
 ('oz', 174),
 ('clean', 171),
 ('free', 170),
 ('formula', 162),
 ('shave', 156),
 ('dry', 151),
 ('no', 146),
 ('color', 141),
 ('smooth', 135),
 ('soft', 134),
 ('x', 130),
 ('day', 129),
 ('ingredient', 128),
 ('vitamin', 126),
 ('time', 117),
 ('blend', 115),
 ('soap', 113),
 ('long', 111),
 ('scent', 110),
 ('system', 109),
 ('size', 107),
 ('water', 107),
 ('shaving', 106),
 ('head', 105),
 ('blade', 103),
 ('extract', 102),
 ('line', 99),
 ('face', 97),
 ('easy', 97),
 ('contains', 96),
 ('nbsp', 96),
 ('organic', 94),
 ('bath', 94),
 ('provides', 93),
 ('hand', 92),
 ('designed', 92),
 ('razor', 91),
 ('cream', 89),
 ('note', 87),
 ('lip', 86),
 ('e', 84),
 ('essential', 84),
 ('woman', 82),
 ('work', 82),
 ('perfect', 81),
 ('area', 81),
 ('amp', 81),
 ('without', 80),
 ('new', 79),
 ('trimmer', 79),
 ('gel', 7

In [22]:
###############################################################
#                      Cleaning Keywords                                

# TODO use fanilized list of keywords

enviromental = ['biodegradable', 'reduced packaging', 'reduced', 'sustainable', 'plastic-free', 'sustainably sourced', 'compostable', 'renewable', 'renewable energy', 'reusable', 'biodegradable', 'organic', 'refillable', 'refills', 'solid bar', 'recycled', 'cardboard', 'reef safe','reef-friendly', 'oxybenzone free', 'triclosan-free', 'microplastics free', 'microbeads free', 'palm oil free', 'HDPE', 'post-consumer recycled plastic', 'renewable energy', 'recycling scheme', 'sustainably sourced', 'low-impact', 'carbon neutral', 'carbon offsetting', 'eco', 'soil association', 'conservation', 'COSMOS', 'NATRUE', 'RSPO', 'FSC']
social = ['No animal testing', 'cruelty-free', 'vegan', 'plant-based', 'palm oil-free', 'ethical', 'vegan society', 'PETA', 'leaping bunny', 'fair trade', 'local', 'hand-made', 'small business'] 
economic = ['Fair trade', 'renewable energy', 'circular economy', 'locally sourced', 'local', 'small business', 'job creation']
health = ['non-toxic', 'bio', 'organic', 'plant-based', 'paraben free', 'triclosan-free', 'fragrance-free', 'synthetic fragrance-free', 'SLS free', 'phthalates free','nanoparticles free', 'non-nano', 'formaldehyde free', 'phthalates free', 'no GMO', 'soil association', 'COSMOS', 'NATRUE', 'USDA']

# Cleaning keywords in the same way as the corpus
clean_enviro = list(map(preprocessing, enviromental))
clean_social = list(map(preprocessing, social))
clean_economic = list(map(preprocessing, economic))
clean_health = list(map(preprocessing, health))

print(clean_enviro)
print(clean_social)
print(clean_economic)
print(clean_health)


['biodegradable', 'reduced packaging', 'reduced', 'sustainable', 'plastic free', 'sustainably sourced', 'compostable', 'renewable', 'renewable energy', 'reusable', 'biodegradable', 'organic', 'refillable', 'refill', 'solid bar', 'recycled', 'cardboard', 'reef safe', 'reef friendly', 'oxybenzone free', 'triclosan free', 'microplastics free', 'microbeads free', 'palm oil free', 'hdpe', 'post consumer recycled plastic', 'renewable energy', 'recycling scheme', 'sustainably sourced', 'low impact', 'carbon neutral', 'carbon offsetting', 'eco', 'soil association', 'conservation', 'cosmos', 'natrue', 'rspo', 'fsc']
['no animal testing', 'cruelty free', 'vegan', 'plant based', 'palm oil free', 'ethical', 'vegan society', 'peta', 'leaping bunny', 'fair trade', 'local', 'hand', 'small business']
['fair trade', 'renewable energy', 'circular economy', 'locally sourced', 'local', 'small business', 'job creation']
['non toxic', 'bio', 'organic', 'plant based', 'paraben free', 'triclosan free', 'fragr

In [37]:
###############################################################
#                      CorEx                                  #
# Code based on
# https://github.com/gregversteeg/corex_topic/blob/master/corextopic/example/corex_topic_example.ipynb

# Setting anchor words for corEx
# Anchor with group of words
anchor_words = [clean_enviro, clean_social, clean_economic, clean_health]

#------------------------------------------------------
#                   Vectorisation
# fow working with sparse arrays
import scipy.sparse as ss

# Vectorisation
from sklearn.feature_extraction.text import CountVectorizer

# Vectorise the dataset

# TODO double check which stopwords removed no need for stop_words='englsih' because those are removed already 
# Give better results with them 
# TODO but need to check about NEGATIVES with it and wihtouth it
# TODO check if max_features=20000 is needed, if there is a point to use it as it will select top words from everything, number might need adjusting
# TODO invest9gate the chance of using Max_df to remove some words with high frequency
# binary set to True because this takes frequesncy of each word into account (for GLDA this need to be set to default false as it only accepts 0 or 1)
vectorizer = CountVectorizer(stop_words='english', binary=True)         

doc_word = vectorizer.fit_transform(df["clean_description"]) # dont need .data cuz we have dataframe
# Transform descriptions into a sparse matrix
doc_word = ss.csr_matrix(doc_word)

doc_word.shape # n_docs x m_words

# TODO should I be adding my seedwords to the vocabulary?
# Getting words for labeling the columns of the matrix
words = list(np.asarray(vectorizer.get_feature_names_out()))
print(words)
#-----------------------------------------------------
#           COREX
import corextopic.corextopic as ct

# TODO experiment with anchor strength
anchored_topic_model = ct.Corex(n_hidden=4, seed=2)
anchored_topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=5);





In [42]:
#               Corex output

for n in range(len(anchor_words)):
    topic_words,_,_ = zip(*anchored_topic_model.get_topics(topic=n))
    print('{}: '.format(n) + ', '.join(topic_words))


0: organic, refill, reusable, biodegradable, sustainable, reduced, hot, blood, peppermint, increase
1: hand, vegan, local, oilyskin, whenapplyingany, similarly, consider, instance, alter, affect
2: blade, shave, trimmer, head, norelco, razor, shaving, norelcos, shaver, cordless
3: organic, skin, ingredient, oil, vitamin, product, extract, natural, soothing, help


In [None]:
'''
Withouth spell check

0: organic, fragrance, biodegradable, launched, house, note, scent, recommended, blend, refill
1: hand, vegan, trimmer, cordless, head, shaving, blade, shave, system, norelco
2: local, diet, aging, similarly, consider, instance, whenapplyingany, alter, oilyskin, chemistry
3: organic, vitamin, oil, ingredient, skin, extract, natural, animal, product, healthy
'''

In [46]:
# Show top words for selected topic, number is "highest mutual information with the topic"
anchored_topic_model.get_topics(topic=1, n_words=10)

[('hand', 0.8456658970704904, 1.0),
 ('vegan', 0.1483173044461751, 1.0),
 ('local', 0.04009181540524613, 1.0),
 ('oilyskin', 0.02966346088923502, 1.0),
 ('whenapplyingany', 0.02966346088923502, 1.0),
 ('similarly', 0.02966346088923502, 1.0),
 ('consider', 0.02966346088923502, 1.0),
 ('instance', 0.02966346088923502, 1.0),
 ('alter', 0.02966346088923502, 1.0),
 ('affect', 0.02892167026902224, 1.0)]

In [43]:
# Show all documents for each topic
# TODO what is exactly this number?
anchored_topic_model.get_top_docs(topic=1, n_docs=10000, sort_by='log_prob')[-20:]

NOTE: 'docs' not provided to CorEx. Returning top docs as lists of row indices


[(428, -37.976977830299944),
 (50, -38.454266672802284),
 (547, -38.81289322003444),
 (218, -39.14284035723783),
 (948, -39.346430540696154),
 (778, -39.45691695953759),
 (955, -39.51191568196159),
 (939, -39.618106745152616),
 (359, -39.93467544486075),
 (659, -40.55792350642572),
 (653, -40.84025594459864),
 (96, -41.01205013487814),
 (302, -41.95187989288415),
 (660, -41.997491785255235),
 (822, -42.05209164447969),
 (504, -42.064994052499195),
 (342, -43.035552872126104),
 (720, -45.56926774170902),
 (898, -47.95725311867178),
 (62, -48.4424986918038)]

In [None]:
# from https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda
# used workaround as well

import numpy as np
from lda import guidedlda as glda

model = glda.GuidedLDA(n_topics=4, n_iter=2000, random_state=7, refresh=20,alpha=0.01,eta=0.01)
