In [129]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pickle
from scipy.cluster import hierarchy as sch
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np

In [130]:
import torch

# Report the CUDA environment seen by PyTorch.
# current_device() / get_device_name() raise when no CUDA device is present,
# so they are only queried after is_available() confirms a GPU; this keeps
# the notebook runnable end-to-end on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name())
else:
    print("No CUDA device available; running on CPU")


True
1
0
NVIDIA GeForce RTX 2070 SUPER


In [131]:
# Read the cleaned review dataset and preview its first five rows
reviews_csv = "../../data/processed/clean_reviews.csv"
df = pd.read_csv(reviews_csv)
df.head(5)

Unnamed: 0,Sentiment,Time,Text,Cleaned Text,Label
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,healthy dog food good digestion also good smal...,1
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,pleased natural balance dog food dog issue dog...,1
2,positive,18/6/21,"Before I was educated about feline nutrition, ...",educate feline nutrition allow cat become addi...,1
3,positive,7/7/21,"My holistic vet recommended this, along with a...",holistic vet recommend along brand try cat pre...,1
4,positive,1/7/21,I bought this coffee because its much cheaper ...,buy coffee much cheap ganocafe organic reishi ...,1


In [132]:
# Number of reviews (rows) loaded
df.shape[0]

5444

In [133]:
# Documents for the topic model: the pre-cleaned review text, in row order.
reviews = df['Cleaned Text'].tolist()

# The Time column holds day-first strings (e.g. "18/6/21" = 18 June 2021).
# Without dayfirst=True, ambiguous values such as "1/7/21" can be read
# month-first (7 January), which would distort the topics-over-time bins.
# The name `time` is kept because later cells pass it to topics_over_time.
time = pd.to_datetime(df['Time'], dayfirst=True).tolist()

In [134]:
# Prepare embeddings (kept commented out: the embeddings are loaded from a saved pickle further down)
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# sentence_model = SentenceTransformer("all-mpnet-base-v2")
# embeddings = sentence_model.encode(reviews, show_progress_bar=True)

In [135]:
# save embeddings
# with open('reviews_all-MiniLM-L6-v2_embedding.pickle', 'wb') as pkl:
# with open('reviews_all-mpnet-base-v2_embedding.pickle', 'wb') as pkl:
#     pickle.dump(embeddings, pkl)


In [136]:
# Restore the precomputed sentence embeddings from disk.
# Alternative cache file: 'reviews_all-MiniLM-L6-v2_embedding.pickle'
# NOTE: pickle.load can execute arbitrary code; only open files produced locally.
embedding_cache = 'reviews_all-mpnet-base-v2_embedding.pickle'
with open(embedding_cache, mode='rb') as cache_file:
    embeddings = pickle.load(cache_file)


In [137]:
# Dimensionality reduction settings; random_state is fixed for reproducibility
umap_params = dict(
    n_neighbors=50,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_params)

# Clustering settings for HDBSCAN
hdbscan_params = dict(
    min_cluster_size=50,
    min_samples=20,
    metric='euclidean',
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_params)

In [138]:
# Build BERTopic around the custom UMAP and HDBSCAN components configured above.
# NOTE(review): min_topic_size presumably has no effect when a custom
# hdbscan_model is supplied (min_cluster_size=50 there) -- confirm against the
# BERTopic documentation.
topic_model = BERTopic(
    language="english",
    min_topic_size=40,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics='auto',
)

# Fit on the cleaned reviews, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(documents=reviews, embeddings=embeddings)

In [139]:
# Summary table of the fitted topics (topic id, document count, name)
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name
0,-1,1867,-1_taste_product_great_like
1,0,495,0_coffee_cup_taste_roast
2,1,450,1_sauce_soup_noodle_pasta
3,2,350,2_tea_green_taste_drink
4,3,318,3_dog_food_cat_treat
5,4,290,4_drink_juice_soda_water
6,5,288,5_candy_gift_taste_box
7,6,235,6_price_ship_buy_store
8,7,154,7_gluten_free_pancake_mix
9,8,113,8_sugar_syrup_stevia_use


In [None]:
# Intertopic distance map
intertopic_fig = topic_model.visualize_topics()
intertopic_fig

In [None]:
# Bar charts of the top keywords, for up to 20 topics
keyword_fig = topic_model.visualize_barchart(top_n_topics=20)
keyword_fig

In [None]:
# Topic-to-topic similarity rendered as a heatmap
similarity_fig = topic_model.visualize_heatmap()
similarity_fig

In [None]:
# Alternative (disabled): build the hierarchy explicitly with single linkage.
# linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
# hierarchical_topics = topic_model.hierarchical_topics(reviews, linkage_function=linkage_function)
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Show how topics relate to each other via hierarchical clustering (up to 20 topics)
hierarchy_fig = topic_model.visualize_hierarchy(top_n_topics=20)
hierarchy_fig

In [None]:
# Project the embeddings to 2-D with a dedicated reducer for plotting.
# The clustering `umap_model` has n_components=5, whereas visualize_documents
# expects 2-D reduced embeddings. Using a separate UMAP instance also avoids
# re-fitting the reducer object that `topic_model` holds a reference to.
# random_state is fixed so the scatter plot is reproducible.
reduced_embeddings = UMAP(n_neighbors=50,
                          n_components=2,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=42).fit_transform(embeddings)

# Plot every document in the 2-D space, coloured by its assigned topic
topic_model.visualize_documents(reviews, reduced_embeddings=reduced_embeddings)

In [None]:
# Compute topic frequencies over time, grouping the timestamps into 10 bins
topics_over_time = topic_model.topics_over_time(
    docs=reviews,
    timestamps=time,
    nr_bins=10,
)

In [None]:
# Topic trends over time for the top 10 topics, with normalized frequencies
normalized_trend_fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    normalize_frequency=True,
)
normalized_trend_fig

In [None]:
# Topic trends over time for the top 10 topics, with raw (un-normalized) frequencies
raw_trend_fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    normalize_frequency=False,
)
raw_trend_fig