In [1]:
# Importing metadata zip file and converting it to dataframe

import json
import gzip
import pandas as pd

def getDF(path):
  # Unzip the file, load in each line as an object
  g = gzip.open(path, 'rb')
  g = [json.loads(l) for l in g]

  # Map to a dictionary, then load in as a dataframe
  dict_df = {i: d for (i, d) in enumerate(g)}
  return pd.DataFrame.from_dict(dict_df, orient='index')

df = getDF('meta_ALL_Beauty.json.gz')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

# Selecting only the colums that are required for analysis

colums_description_asin = ["description","asin"]
df = df[colums_description_asin]

df.describe()

Unnamed: 0,description,asin
count,32892,32892
unique,13751,32488
top,[],B00027CDOW
freq,17773,2


In [3]:
# Descriptions in the json file are a list of strings, converting to sting for easy cleaning
df["newdescription"] = df.description.map(lambda x: ".".join(x).replace("\n",""))


In [4]:
# Remove empty descriptions
#df = df.drop(df[df["newdescription"] == ""].index)

#print(len(df)) 



In [5]:
df = df.drop(df[ ( (df["newdescription"] == "" ) | ( df["newdescription"].isnull()) ) ].index)

df

Unnamed: 0,description,asin,newdescription
0,[Loud 'N Clear Personal Sound Amplifier allows...,6546546450,Loud 'N Clear Personal Sound Amplifier allows ...
1,[No7 Lift & Luminate Triple Action Serum 50ml ...,7178680776,No7 Lift & Luminate Triple Action Serum 50ml b...
2,[No7 Stay Perfect Foundation now stays perfect...,7250468162,No7 Stay Perfect Foundation now stays perfect ...
4,[Lacto Calamine Skin Balance Daily Nourishing ...,7414204790,Lacto Calamine Skin Balance Daily Nourishing L...
5,[Mary Kay Satin Hands Peach Hand Cream Travel ...,7535842801,Mary Kay Satin Hands Peach Hand Cream Travel S...
...,...,...,...
32880,[Move over soap on a rope! This heavy-duty Bri...,B01HIHLFOC,Move over soap on a rope! This heavy-duty Bric...
32884,[Eau de parfum spray vial mini design house: y...,B01HIPOQ2M,Eau de parfum spray vial mini design house: yv...
32885,[Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stu...,B01HIUEEHO,Pokemon Plush 9.2 Inch / 23cm Gengar Doll Stuf...
32886,[New and unused product. 100% authentic Benefi...,B01HIWKGOM,New and unused product. 100% authentic Benefit...


In [6]:
# Remove duplicate asin so can have only unique products
df[df.duplicated("asin")]
df = df.drop_duplicates("asin", keep="last")
df.describe()

Unnamed: 0,description,asin,newdescription
count,14821,14821,14821
unique,13749,14821,13743
top,"[For over 60 years, Betty Dain Creations, Inc....",6546546450,"For over 60 years, Betty Dain Creations, Inc. ..."
freq,59,1,59


In [7]:
# Remove duplicate description
df[df.duplicated("newdescription")]
df = df.drop_duplicates("newdescription", keep="last")
df.describe() 

Unnamed: 0,description,asin,newdescription
count,13743,13743,13743
unique,13743,13743,13743
top,[Loud 'N Clear Personal Sound Amplifier allows...,6546546450,Loud 'N Clear Personal Sound Amplifier allows ...
freq,1,1,1


In [8]:
# Removing redundant old description column
colums_description_asin = ["newdescription","asin"]
df = df[colums_description_asin]

df.describe()

Unnamed: 0,newdescription,asin
count,13743,13743
unique,13743,13743
top,Loud 'N Clear Personal Sound Amplifier allows ...,6546546450
freq,1,1


In [9]:
# Identify range of descriptions in after duplicates removal

df_descriptions_without_empty = df["newdescription"].apply(lambda x: len(x.split()))

df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])

# 97% upper limit of 259 (same % milit as reviews)
# 11 words (same numbe of words as reviews) 20% because no assuption is made on how much more each one is informative as well as descriptions 
# being a smaller sample


count    13743.000000
mean        63.643600
std         91.940065
min          1.000000
10%          7.000000
15%          9.000000
20%         11.000000
25%         14.000000
30%         17.000000
50%         39.000000
75%         79.000000
85%        114.000000
90%        148.000000
92%        169.000000
95%        217.000000
97%        259.000000
98%        288.000000
99%        324.000000
max       3224.000000
Name: newdescription, dtype: float64

In [10]:
# Remove descriptions with more or less than xx pre-cleaned words

# Split at any white space 
df["num_words_description"] = df["newdescription"].apply(lambda x: len(x.split()))

# Check if under or equal to 80% upper limmit words fulfils withs condition and set it
df = df[(df["num_words_description"] <= 259) & (df["num_words_description"] >= 11)]

df["newdescription"].describe()

count                                                 10710
unique                                                10710
top       Loud 'N Clear Personal Sound Amplifier allows ...
freq                                                      1
Name: newdescription, dtype: object

In [11]:
# Check shortening worked
df_descriptions_without_empty = df["newdescription"].apply(lambda x: len(x.split()))

df_descriptions_without_empty.describe([0.1,0.15,0.20,0.25,0.30,0.75,0.85,0.90,0.92,0.95,0.97,0.98,0.99])


count    10710.000000
mean        65.241923
std         53.053433
min         11.000000
10%         15.000000
15%         18.000000
20%         23.000000
25%         27.000000
30%         31.000000
50%         49.000000
75%         86.000000
85%        115.000000
90%        142.000000
92%        157.000000
95%        185.000000
97%        211.000000
98%        225.000000
99%        243.000000
max        259.000000
Name: newdescription, dtype: float64

In [12]:
# Regrex for character removal
import re

# Spacy for spell check
import spacy
import contextualSpellCheck
nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# NLTK for tokenisation and lemmatization
import nltk

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

nltk.download('wordnet')

# Spacy stop word creation
stopping_words = spacy.lang.en.stop_words.STOP_WORDS
stopping_words_new = stopping_words


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
stopping_words_to_remove = ['without' , 'not', 'less', 'noting', 'none','no']
for word in list(stopping_words_new):
    if word in stopping_words_to_remove:
        stopping_words_new.remove(word)
print(stopping_words_new)

{'the', 'before', 'among', 'namely', 'almost', 'thus', 'see', 'while', 'twelve', 'his', 'sometimes', 'whenever', 'might', '‘m', 'few', 'us', 'further', 'herein', 'by', 'from', 'ours', 'since', 'off', 'about', 'full', 'six', 'nor', 'must', 'enough', 'after', 'unless', 'bottom', 'whom', 'call', 'she', 'he', 'several', 'eight', 'per', 'than', 'i', 'due', 'him', 'perhaps', 'for', 'out', 'get', 'besides', 'together', 'already', 'that', 'n‘t', 'beside', 'thence', 'just', 'because', 'three', 'anyone', 'how', 'me', 'other', 'whoever', 'they', 'did', 'again', 'often', 'sometime', 'only', 'serious', 'at', 'will', 'himself', 'least', 'top', 'upon', 'nowhere', 'became', 'too', 'nothing', 'although', 'back', 'what', '‘re', '’ll', 'we', 'keep', 'becoming', 'name', 'who', 'cannot', 'myself', 'above', 'fifteen', 'amount', 'itself', 'own', 'these', 'any', 'my', 'give', 'front', 'can', 'an', 'amongst', 'down', 'go', 'really', 'someone', 'their', 're', 'ourselves', 'some', 'along', 'thereupon', 'thru', '

In [14]:
# Preprocessing of descriptions

def preprocessing(raw_string):
    # Remove html tags and anything inside them 
    no_html = re.sub(r'<[^>]*>','', raw_string)
    #print("after removing html", no_html)


    # Make everything lowercase
    lowercase_column = no_html.lower()
    #print("lowercase", lowercase_column)

    # Remove apostrophe to enable spell check to correct words with apostrophe
    without_apostrophe = re.sub(r'[\']', '', lowercase_column)

    # ! Need to double check again where best to use this spell check
    # 
    # .pipe for batches of text
    #doc = list(nlp.pipe(without_apostrophe))
    #doc = nlp(without_apostrophe)

    #spell_checked = doc._.outcome_spellCheck

    # Remove all non alphabetic instances that aren't a space and replace them with a space using Regrex
    alphabetic_column = re.sub(r'[^a-z\s]', ' ', without_apostrophe)
    #print("removed numerical and punctuation", alphabetic_column)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_column)

    # Remove stopping words using Spacy library
    tokens_without_stopping_words = [token for token in tokens if token not in stopping_words_new]

    # Lemmatize tokens using nltk and join them into sentances
    sentances_without_stop_words = ' '.join([lemmatizer.lemmatize(t) for t in tokens_without_stopping_words])

    return sentances_without_stop_words


In [93]:
df["clean_description"] = df["newdescription"].head(1000).apply(preprocessing)

df["clean_description"].to_csv("out_new.csv")

df["clean_description"]

0       loud n clear personal sound amplifier allows t...
2       no stay perfect foundation stay perfect longer...
4       lacto calamine skin balance daily nourishing l...
5       mary kay satin hand peach hand cream travel si...
7       according legend brother native origin black b...
                              ...                        
2299    venus embrace cartridge count venus embrace hu...
2309    occo red shield rich yummy blend vanilla brown...
2311    tropical tradition moisturizing lotion baby si...
2312    perfume light life generous sparkling bottle r...
2315    weightless mousse formula slip blend effortles...
Name: clean_description, Length: 1000, dtype: object

In [16]:
# Remove any empty descriptions that appear because of head()
df = df.drop(df[ ( (df["clean_description"] == "" ) | ( df["clean_description"].isnull()) ) ].index)

In [17]:
# Identifying range of descriptions in after cleaning
 
df_description_without_empty_clean = df["clean_description"].apply(lambda x: len(x.split()))
#df_descriptions_without_empty.describe()
df_description_without_empty_clean.describe([0.03,0.1,0.2,0.3,0.75,0.85,0.90,0.95])

count    1000.00000
mean       44.84100
std        32.72263
min         4.00000
3%          9.00000
10%        13.00000
20%        19.00000
30%        24.70000
50%        35.00000
75%        59.00000
85%        76.15000
90%        92.00000
95%       114.00000
max       221.00000
Name: clean_description, dtype: float64

In [18]:
from collections import Counter
Counter(" ".join(df["clean_description"]).split()).most_common(100)

[('skin', 762),
 ('hair', 372),
 ('oil', 288),
 ('use', 270),
 ('natural', 268),
 ('product', 266),
 ('body', 229),
 ('fragrance', 214),
 ('help', 207),
 ('not', 178),
 ('oz', 174),
 ('clean', 171),
 ('free', 170),
 ('formula', 162),
 ('shave', 156),
 ('dry', 151),
 ('no', 146),
 ('color', 141),
 ('smooth', 135),
 ('soft', 134),
 ('x', 130),
 ('day', 129),
 ('ingredient', 128),
 ('vitamin', 126),
 ('time', 117),
 ('blend', 115),
 ('soap', 113),
 ('long', 111),
 ('scent', 110),
 ('system', 109),
 ('size', 107),
 ('water', 107),
 ('shaving', 106),
 ('head', 105),
 ('blade', 103),
 ('extract', 102),
 ('line', 99),
 ('face', 97),
 ('easy', 97),
 ('contains', 96),
 ('nbsp', 96),
 ('organic', 94),
 ('bath', 94),
 ('provides', 93),
 ('hand', 92),
 ('designed', 92),
 ('razor', 91),
 ('cream', 89),
 ('note', 87),
 ('lip', 86),
 ('e', 84),
 ('essential', 84),
 ('woman', 82),
 ('work', 82),
 ('perfect', 81),
 ('area', 81),
 ('amp', 81),
 ('without', 80),
 ('new', 79),
 ('trimmer', 79),
 ('gel', 7

In [19]:
###############################################################
#                      Cleaning Keywords                                

# TODO use finalise list of keywords

enviromental = ['biodegradable', 'reduced packaging', 'reduced', 'sustainable', 'plastic-free', 'sustainably sourced', 'compostable', 'renewable', 'renewable energy', 'reusable', 'biodegradable', 'organic', 'refillable', 'refills', 'solid bar', 'recycled', 'cardboard', 'reef safe','reef-friendly', 'oxybenzone free', 'triclosan-free', 'microplastics free', 'microbeads free', 'palm oil free', 'HDPE', 'post-consumer recycled plastic', 'renewable energy', 'recycling scheme', 'sustainably sourced', 'low-impact', 'carbon neutral', 'carbon offsetting', 'eco', 'soil association', 'conservation', 'COSMOS', 'NATRUE', 'RSPO', 'FSC']
social = ['No animal testing', 'cruelty-free', 'vegan', 'plant-based', 'palm oil-free', 'ethical', 'vegan society', 'PETA', 'leaping bunny', 'fair trade', 'local', 'hand-made', 'small business'] 
economic = ['Fair trade', 'renewable energy', 'circular economy', 'locally sourced', 'local', 'small business', 'job creation']
health = ['non toxic', 'bio', 'organic', 'plant based', 'paraben free', 'triclosan free', 'fragrance free', 'synthetic fragrance free', 'SLS free', 'phthalates free','nanoparticles free', 'non nano', 'formaldehyde free', 'phthalates free', 'no GMO', 'soil association', 'COSMOS', 'NATRUE', 'USDA']
idnk = ['palm oil-free','free','no gmo','free from','no plastic','no chemicals']

# Cleaning keywords in the same way as the corpus, 
# splitting them into individual words (for vectorisation) 
# removing duplicates, 
# converts it to a list
enviro_processed = list(set(word for phrase in enviromental for word in preprocessing(phrase).split()))
social_processed = list(set(word for phrase in social for word in preprocessing(phrase).split()))
economic_processed = list(set(word for phrase in economic for word in preprocessing(phrase).split()))
health_processed = list(set(word for phrase in health for word in preprocessing(phrase).split()))

#enviro_processed = set(map(preprocessing, enviromental))
print(enviro_processed)
print(social_processed)
print(economic_processed)
print(health_processed)



['refill', 'natrue', 'solid', 'refillable', 'oxybenzone', 'cosmos', 'sustainable', 'palm', 'recycled', 'biodegradable', 'association', 'soil', 'sourced', 'reef', 'free', 'packaging', 'oil', 'hdpe', 'post', 'energy', 'rspo', 'compostable', 'neutral', 'impact', 'eco', 'organic', 'fsc', 'sustainably', 'safe', 'recycling', 'renewable', 'plastic', 'triclosan', 'cardboard', 'carbon', 'scheme', 'reusable', 'reduced', 'consumer', 'conservation', 'friendly', 'offsetting', 'microplastics', 'microbeads', 'low', 'bar']
['ethical', 'leaping', 'plant', 'business', 'hand', 'palm', 'society', 'trade', 'bunny', 'free', 'oil', 'local', 'testing', 'peta', 'small', 'cruelty', 'fair', 'animal', 'no', 'based', 'vegan']
['job', 'fair', 'locally', 'trade', 'sourced', 'circular', 'economy', 'local', 'renewable', 'creation', 'business', 'energy', 'small']
['gmo', 'natrue', 'toxic', 'plant', 'cosmos', 'nanoparticles', 'association', 'soil', 'free', 'phthalates', 'paraben', 'nano', 'non', 'organic', 'sl', 'usda',

In [20]:
#                      Testing negative keywords

non_enviromental = ['packaging','plastic',  'oxybenzone', 'triclosan', 'microplastics', 'microbeads', 'palm oil']
non_social = ['animal testing', 'palm oil','imported'] 
non_health = ['toxic', 'paraben', 'triclosan','SLS', 'phthalates','nanoparticles', 'nano', 'formaldehyde', 'phthalates', 'GMO']

# Cleaning keywords in the same way as the corpus
non_enviromental_processed = list(set(word for phrase in non_enviromental for word in preprocessing(phrase).split()))
non_social_processed = list(set(word for phrase in non_social for word in preprocessing(phrase).split()))
non_health_processed = list(set(word for phrase in non_health for word in preprocessing(phrase).split()))


print(non_enviromental_processed)
print(non_social_processed)
print(non_health_processed)


['packaging', 'oxybenzone', 'oil', 'microplastics', 'microbeads', 'plastic', 'triclosan', 'palm']
['animal', 'imported', 'oil', 'testing', 'palm']
['sl', 'gmo', 'phthalates', 'toxic', 'paraben', 'formaldehyde', 'triclosan', 'nanoparticles', 'nano']


In [None]:
############################################################### 
#                      CorEx                                  #
# Code based on
# https://github.com/gregversteeg/corex_topic/blob/mastera/corextopic/example/corex_topic_example.ipynb

# Setting anchor words for corEx
# Anchor with group of words
anchor_words = [enviro_processed, social_processed, economic_processed, health_processed, non_enviromental_processed, non_social_processed,non_health_processed]#, non_clean_enviro, non_clean_social, non_clean_health]

#------------------------------------------------------
#                   Vectorisation
# fow working with sparse arrays
import scipy.sparse as ss

# Vectorisation
from sklearn.feature_extraction.text import CountVectorizer

# Vectorise the dataset

# TODO double check which stopwords removed no need for stop_words='englsih' because those are removed already + resulta it gives
# TODO check if max_features=20000 is needed, if there is a point to use it as it will select top words from everything, number might need adjusting
# TODO invest9gate the chance of using Max_df to remove some words with high frequency

vectorizer = CountVectorizer(stop_words='english', binary=True)         
doc_word = vectorizer.fit_transform(df["clean_description"]) # dont need .data cuz we have dataframe
# Transform descriptions into a sparse matrix
doc_word = ss.csr_matrix(doc_word)

print(doc_word.shape) # n_docs x m_words

# Getting words for labeling the columns of the matrix
corpus_words = vectorizer.get_feature_names_out()

combined_words = list(corpus_words) + list(enviro_processed) + list(social_processed) + list(economic_processed) + list(health_processed)

combined_words = set(combined_words)
vectorizer = CountVectorizer(vocabulary=combined_words,stop_words='english', binary=True)

doc_word = vectorizer.transform(df["clean_description"])
doc_word = ss.csr_matrix(doc_word)
print(doc_word.shape)


words = list(np.asarray(vectorizer.get_feature_names_out()))

print(words)
#-----------------------------------------------------
#           COREX
import corextopic.corextopic as ct

# TODO experiment with anchor strength
anchored_topic_model = ct.Corex(n_hidden=7, seed=2)
anchored_topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=10);



In [None]:
#               Corex output

for n in range(len(anchor_words)):
    topic_words,_,_ = zip(*anchored_topic_model.get_topics(topic=n))
    print('{}: '.format(n) + ', '.join(topic_words))


In [None]:
'''
With stop words ( based on 2 it looks like some of the words lile LOCAL, LOSS and CIRCULAR are excluded unfortunately)

0: organic, safe, oil, friendly, free, refill, energy, palm, biodegradable, reusable
1: oil, free, animal, plant, vegan, cruelty, testing, palm, hand, based
2: small, local, fair, trimmer, cordless, contour, norelcos, norelco, worldwide, warranty
3: fragrance, free, organic, non, based, plant, synthetic, paraben, toxic, gmo
'''

'''
After splitting words

0: oil, free, organic, bar, packaging, friendly, biodegradable, palm, sustainable, soap
1: oil, animal, no, hand, based, free, vegan, testing, cruelty, small
2: small, energy, fair, local, circular, loss, economy, creation, trade, aging
3: fragrance, free, organic, non, based, synthetic, paraben, toxic, blend, gmo
'''

'''
Before splitting words

0: organic, refill, biodegradable, reusable, fragrance, recommended, launched, house, note, sustainable
1: hand, vegan, local, trimmer, cordless, contour, norelcos, charging, norelco, warranty
2: organic, ingredient, extract, vitamin, skin, bio, natural, product, oil, rich
3: free, cruelty, non, radical, sensitive, gluten, contact, sunscreen, paraben, apply
'''

'''
Withouth spell check

0: organic, fragrance, biodegradable, launched, house, note, scent, recommended, blend, refill
1: hand, vegan, trimmer, cordless, head, shaving, blade, shave, system, norelco
2: local, diet, aging, similarly, consider, instance, whenapplyingany, alter, oilyskin, chemistry
3: organic, vitamin, oil, ingredient, skin, extract, natural, animal, product, healthy
'''

In [None]:
# Show top words for selected topic, number is "highest mutual information with the topic"
anchored_topic_model.get_topics(topic=2, n_words=50)

In [None]:
# Show all documents for each topic
# TODO what is exactly this number?
anchored_topic_model.get_top_docs(topic=1, n_docs=10000, sort_by='log_prob')#[-20:]

In [None]:
import corextopic.vis_topic as vt
vt.vis_rep(anchored_topic_model, column_label=words, prefix='twenty')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Read labels from labels text file in twenty
data = np.loadtxt('twenty\labels.txt', delimiter=',')

# Extract all columns but not the first one
columns = data[:, 1:]

# Summation of each column to add up how often each theme appears
sum_themes = np.sum(columns, axis=0)

# TODO if non of the rows = 1 then make label 5 (no theme) =+

# Identify and sum any rows without 1s in each column by checking if the sum of the whole row is 0
rows_without_theme = np.sum(columns, axis=1) == 0

# Sum
num_rows_without_theme = np.sum(rows_without_theme)
print(num_rows_without_theme)

# Add new column represting no themes classified with 1 as no theme and 0 having a theme
data = np.column_stack((data, rows_without_theme.astype(int)))

# Create new labels text file with addition of the new "no themes" column as int
np.savetxt( 'new_lables.txt', data, delimiter=',', fmt='%d')


# Display the results
labels = ['Enviromental', 'Social', 'Economic', 'Health']
plt.bar(labels, sum_themes)
plt.xlabel('Themes')
plt.ylabel('N of times')
plt.title('Number of times each theme appears')
plt.show()

In [120]:
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(combined_words)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

IndexError: index 7482 is out of bounds for axis 0 with size 7419

In [118]:
# from https://www.kaggle.com/code/nvpsani/topic-modelling-using-guided-lda 
# used workaround as well

import pandas as pd
import numpy as np
from lda import guidedlda as glda

from sklearn.feature_extraction.text import CountVectorizer

# Anchor words list
anchor_words = [enviro_processed, social_processed, economic_processed, health_processed]

# Create corpus of descriptions
descriptions_corpus = df["clean_description"].tolist() 

# Create vocabulary to work with 
vocab = list(set(word_tokenize(" ".join(df["clean_description"]))))
print(vocab)

vocab.extend(enviro_processed)
vocab.extend(social_processed)
vocab.extend(economic_processed)
vocab.extend(health_processed)
vocab = list(set(vocab))
print(vocab)
print("vocab",len(vocab))


vectorizer = CountVectorizer(ngram_range = (1,1), stop_words='english', vocabulary = vocab)
vectorised_descriptions_corpus = vectorizer.fit_transform(descriptions_corpus)
word2id = vectorizer.vocabulary_

# Defining model
model = glda.GuidedLDA(n_topics=4, n_iter=4000, random_state=7, refresh=20,alpha=0.01,eta=0.01)

# Topics for the model
anchor_topics = {}
for t_id, st in enumerate(anchor_words):
    for word in st:
        anchor_topics[word2id[word]] = t_id
        
#train model
model.fit(vectorised_descriptions_corpus, seed_topics=anchor_topics, seed_confidence=0.15)

INFO:lda:n_documents: 1000
INFO:lda:vocab_size: 7484
INFO:lda:n_words: 43451
INFO:lda:n_topics: 4
INFO:lda:n_iter: 4000
INFO:lda:<0> log likelihood: -462681


vocab 7484


INFO:lda:<20> log likelihood: -376360
INFO:lda:<40> log likelihood: -368666
INFO:lda:<60> log likelihood: -365550
INFO:lda:<80> log likelihood: -363989
INFO:lda:<100> log likelihood: -363206
INFO:lda:<120> log likelihood: -362518
INFO:lda:<140> log likelihood: -361841
INFO:lda:<160> log likelihood: -361828
INFO:lda:<180> log likelihood: -361507
INFO:lda:<200> log likelihood: -361123
INFO:lda:<220> log likelihood: -360992
INFO:lda:<240> log likelihood: -360987
INFO:lda:<260> log likelihood: -360933
INFO:lda:<280> log likelihood: -360961
INFO:lda:<300> log likelihood: -360996
INFO:lda:<320> log likelihood: -360950
INFO:lda:<340> log likelihood: -360842
INFO:lda:<360> log likelihood: -360729
INFO:lda:<380> log likelihood: -360868
INFO:lda:<400> log likelihood: -360754
INFO:lda:<420> log likelihood: -360673
INFO:lda:<440> log likelihood: -360829
INFO:lda:<460> log likelihood: -360534
INFO:lda:<480> log likelihood: -360679
INFO:lda:<500> log likelihood: -360483
INFO:lda:<520> log likelihood

<lda.guidedlda.GuidedLDA at 0x280c596b810>

In [119]:
top_n_glda_words = 14
words_in_topics = model.topic_word_
for i, topic_dist in enumerate(words_in_topics):
    words_in_this_topic = np.array(vocab)[np.argsort(topic_dist)][:-(top_n_glda_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(words_in_this_topic)))

Topic 0: fragrance scent soap note xl blend oz wear body design woman recommended smell house
Topic 1: hair shave blade shaving head razor trimmer shaver easy minute feature comfortable brush norelco
Topic 2: skin oil natural hair help body free use product formula clean vitamin oz ingredient
Topic 3: product use bra size lash cup great relief easy color shape woman available designed


In [None]:
'''
WITH STOP WORDS
Topic 0: fragrance scent soap note xl blend oz wear body design woman recommended smell house
Topic 1: hair shave blade shaving head razor trimmer shaver easy minute feature comfortable brush norelco
Topic 2: skin oil natural hair help body free use product formula clean vitamin oz ingredient
Topic 3: product use bra size lash cup great relief easy color shape woman available designed

Wihtout stop words normal
Topic 0: shave hair blade head shaving razor system trimmer xl easy shaver brush use comfortable
Topic 1: skin oil hair natural help free formula use body vitamin clean oz dry smooth
Topic 2: product not use bath body natural water relief foot child no cold professional tree
Topic 3: fragrance scent soap note woman blend body wear oz bra design recommended house smell

?
Topic 0: no hand safe small non free plastic biodegradable fragrance reusable low toxic palm based
Topic 1: oil no hand organic animal fragrance bar plant small refill testing packaging based friendly
Topic 2: fragrance bar based low energy oil bio packaging carbon creation local business economy sustainably
Topic 3: free oil organic non fragrance no animal vegan safe cruelty hand friendly paraben synthetic

OLD VECTORISATION WITH KEYWORDS

Topic 0: shave hair blade head shaving razor xl trimmer system easy
Topic 1: skin hair oil natural body help free use formula vitamin
Topic 2: product not use bath oil natural relief water child ingredient
Topic 3: fragrance scent soap note blend oz product woman wear design

NEW VECTORISATION WITH KEYWORDS

Topic 0: fragrance scent soap body note blend oz woman bath wear
Topic 1: skin oil hair natural help use formula free product body
Topic 2: hair product size easy color designed day bra brush soft
Topic 3: shave blade shaving head razor xl trimmer shaver hair norelco

OLD VECTORISATION WITHOUT KEYWORDS

pic 0: skin hair help natural free formula oil body vitamin dry
Topic 1: fragrance scent note product woman body blend wear oz design
Topic 2: hair shave shaving blade head razor trimmer easy shaver fit
Topic 3: oil soap product organic use bath natural ingredient xl no


'''