In [3]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import pandas as pd 
import os 

# Shared topic model used by the cells below.
# Topic keywords are refined with the KeyBERT-inspired representation model,
# and both unigrams and bigrams are considered when naming topics.
representation_model = KeyBERTInspired()
topic_model = BERTopic(
    n_gram_range=(1, 2),
    representation_model=representation_model,
    verbose=True,  # log each pipeline stage (embedding, reduction, clustering, ...)
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# For every search-result file of interest, fit the topic model on the titles
# of well-rated, non-paid, revenue-generating products -- one fit per keyword.
searches = os.listdir('../data/search_results')
for file in searches:
    if '021' not in file: continue # only Violin Strings for now
    products = pd.read_csv(f'../data/search_results/{file}')
    # `unique()` keeps first-seen order (deterministic, unlike `set`), and
    # dropping NaN avoids a keyword that can never match any row.
    for keyword in products['keyword'].dropna().unique():
        # Filter into a separate frame: reassigning `products` here would leave
        # only the first keyword's rows for all subsequent iterations.
        keyword_products = products[products.keyword == keyword]
        keyword_products = keyword_products[keyword_products.source != "paid"] # only Organic & Amazon's Choice products
        keyword_products = keyword_products[keyword_products.min_revenue != 0]
        # Keep only products rated strictly above 4 stars.
        highly_rated = keyword_products[keyword_products['rating'].astype(float) > 4]
        titles = list(highly_rated['title'])
        if not titles:
            # Nothing survived the filters; there is nothing to fit for this keyword.
            print(f'No titles left for keyword {keyword!r} in {file}; skipping.')
            continue
        topics, probs = topic_model.fit_transform(titles)
        print(topic_model.get_topic_info())

2024-07-06 17:45:01,198 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 5/5 [00:01<00:00,  3.83it/s]
2024-07-06 17:45:06,258 - BERTopic - Embedding - Completed ✓
2024-07-06 17:45:06,259 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-06 17:45:17,233 - BERTopic - Dimensionality - Completed ✓
2024-07-06 17:45:17,234 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-06 17:45:17,256 - BERTopic - Cluster - Completed ✓
2024-07-06 17:45:17,259 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-06 17:45:17,700 - BERTopic - Representation - Completed ✓


   Topic  Count                                               Name  \
0      0     95   0_violin strings_violin string_violin set_violin   
1      1     52  1_violin strings_string violin_violin string_a...   

                                      Representation  \
0  [violin strings, violin string, violin set, vi...   
1  [violin strings, string violin, violin string,...   

                                 Representative_Docs  
0  [Pirastro Evah Pirazzi 1/8-1/4 Violin String S...  
1  [Violin Strings,Violin Strings 4/4 Full Set, M...  


In [4]:
# Refit the shared topic model on the text of the reviews (column 'content').
# NOTE(review): the file name here is literally '.csv' -- confirm this is the
# intended path and not a file name with its stem lost.
all_reviews = pd.read_csv('../data/reviews/.csv')
review_texts = list(all_reviews['content'])
topics, probs = topic_model.fit_transform(review_texts)

2024-07-06 17:47:04,358 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
2024-07-06 17:47:10,779 - BERTopic - Embedding - Completed ✓
2024-07-06 17:47:10,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-06 17:47:20,693 - BERTopic - Dimensionality - Completed ✓
2024-07-06 17:47:20,694 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-06 17:47:20,699 - BERTopic - Cluster - Completed ✓
2024-07-06 17:47:20,703 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-06 17:47:21,193 - BERTopic - Representation - Completed ✓


In [5]:
# Show the topic summary table for the most recent fit of `topic_model`.
# Left as the cell's last expression so the notebook renders it as the output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,50,-1_strings_violin_the strings_strings are,"[strings, violin, the strings, strings are, th...",[Purchased these strings to replace a broken A...
