# BERTopic

## Import libraries and data

In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import pickle

In [None]:
# Read the cleaned reviews; the "Time" column is parsed as datetimes on load.
data = pd.read_csv(
    "../../data/processed/clean_reviews.csv",
    parse_dates=['Time'],
)
# Normalise every header to lower case with underscores instead of spaces
# (e.g. "Time" -> "time"), which is the spelling later cells rely on.
data = data.rename(columns=lambda name: name.lower().replace(" ", "_"))
data.head()

## Prepare model

### Prepare embeddings

In [None]:
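# One-off step, kept for reference: compute the sentence embeddings.
# Uncomment to regenerate them; otherwise the pickled copy is loaded further down.
# sentence_model = SentenceTransformer("all-MiniLM-L12-v2")
# embeddings = sentence_model.encode(data['cleaned_text'], show_progress_bar=True)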

In [None]:
# One-off step, kept for reference: save the freshly computed embeddings to disk.
# with open('BERTopic_embeddings.pickle', 'wb') as pkl:
#     pickle.dump(embeddings, pkl)

In [None]:
# Load the precomputed document embeddings from the local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file you created
# yourself or otherwise trust.
with open('BERTopic_embeddings.pickle', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

### UMAP for dimensionality reduction

In [None]:
# Dimensionality-reduction settings, gathered in one place so they are easy to
# tweak. The fixed random_state seeds UMAP so repeated runs use the same seed.
umap_params = {
    "n_neighbors": 100,
    "n_components": 3,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 4263,
}
umap_model = UMAP(**umap_params)

### HDBSCAN for clustering

In [None]:
# Clustering settings for the reduced embeddings. prediction_data=True asks
# HDBSCAN to keep the extra data it needs for predictions after fitting.
hdbscan_params = {
    "min_cluster_size": 50,
    "min_samples": 20,
    "metric": 'euclidean',
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_params)

### Initialize BERTopic

In [None]:
# Assemble the topic model around the custom UMAP and HDBSCAN components.
topic_model = BERTopic(
    language="english",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics="auto",
    calculate_probabilities=True,
)
# Fit on the cleaned review text, supplying the precomputed embeddings so the
# documents do not have to be embedded again.
topics, probabilities = topic_model.fit_transform(
    documents=data['cleaned_text'],
    embeddings=embeddings,
)

## Evaluate model

In [None]:
# Summarise the topics found by the fitted model. As the last expression in
# the cell, the result is rendered by the notebook.
topic_model.get_topic_info()

In [None]:
# Inspect the top terms of topic 2 (10 terms unless `top_n_words` is changed;
# it is not overridden where the model is built).
topic_model.get_topic(2)

In [None]:
# Bar charts of the top keywords for up to 20 topics
topic_model.visualize_barchart(top_n_topics=20)

In [None]:
# Visualize how term scores decrease with term rank across topics
topic_model.visualize_term_rank()

In [None]:
# Visualize the intertopic distance map
topic_model.visualize_topics()

In [None]:
# Visualize connections between topics using hierarchical clustering
# (limited to 20 topics via top_n_topics)
topic_model.visualize_hierarchy(top_n_topics=20)

In [None]:
# Visualize topic-to-topic similarity as a heatmap
topic_model.visualize_heatmap()

In [None]:
# Project the embeddings to 2-D with a dedicated UMAP instance for plotting.
# `visualize_documents` expects 2-D coordinates, whereas the model's own
# `umap_model` is configured with n_components=3. Calling fit_transform on
# `umap_model` would also refit the very instance that was handed to BERTopic.
# All other settings mirror the model's UMAP configuration.
reduced_embeddings = UMAP(n_neighbors=100,
                          n_components=2,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=4263).fit_transform(embeddings)
topic_model.visualize_documents(data["cleaned_text"], reduced_embeddings=reduced_embeddings)

### Topics over time

In [None]:
# Compute the topic representations per timestamp from the review text and
# the parsed "time" column, then plot them.
topics_over_time = topic_model.topics_over_time(
    docs=data["cleaned_text"],
    timestamps=data["time"],
)
topic_model.visualize_topics_over_time(topics_over_time)

### Topics by sentiment

In [None]:
# Class label of every review, as a plain list in row order.
# list(...) replaces the identity comprehension; the resulting values are the same.
sentiment = list(data["label"])

In [None]:
# Compute the topic representations separately for each class in `sentiment`,
# then plot them.
topics_by_sentiment = topic_model.topics_per_class(
    docs=data["cleaned_text"],
    classes=sentiment,
)
topic_model.visualize_topics_per_class(topics_by_sentiment)

In [None]:
# Calculate the topic distributions; calculate_tokens=True additionally
# returns the token-level distributions for every document.
topic_distr, topic_token_distr = topic_model.approximate_distribution(data["cleaned_text"], calculate_tokens=True)

# Row position of the document to inspect. The text is fetched with .iloc so
# that it is looked up by position, exactly like `topic_token_distr`; a plain
# `series[5000]` is a label lookup and only matches on a default integer index.
doc_position = 5000

# Visualize the token-level distributions for that document
df = topic_model.visualize_approximate_distribution(data["cleaned_text"].iloc[doc_position], topic_token_distr[doc_position])
df

## Append topic to dataset

In [None]:
# Copy the per-document topic assignments out of the fitted model ...
topic_prediction = list(topic_model.topics_)
# ... and store them alongside the reviews as a new column.
data["topic_id"] = topic_prediction

In [None]:
# Build a long-format table with one row per (topic, term):
# columns `word`, `weight`, `topic_id`.
topics_dict = topic_model.get_topics()

# Collect one frame per topic and concatenate once at the end. Concatenating
# inside the loop would re-copy the accumulated frame on every iteration.
topic_frames = []
for topic_id, topic in topics_dict.items():
    topic_df = pd.DataFrame(topic, columns=['word', 'weight'])
    topic_df['topic_id'] = topic_id
    topic_frames.append(topic_df)

if topic_frames:
    topics_df = pd.concat(topic_frames, ignore_index=True)
else:
    # No topics at all: still expose the expected columns so later cells can
    # refer to them.
    topics_df = pd.DataFrame(columns=['word', 'weight', 'topic_id'])
topics_df

In [None]:
# Attach every (word, weight) row of a review's topic to that review. The
# inner join on `topic_id` repeats each review once per term of its topic.
data_df = pd.merge(data, topics_df, how='inner', on='topic_id')

In [None]:
# Write the reviews with their topic's terms spread across
# word_1, weight_1, word_2, weight_2, ... columns.
#
# The terms are pivoted once per topic and then left-joined onto the reviews.
# Grouping on the reviews' own column values instead would merge duplicate
# reviews into a single group (yielding spurious word_11, weight_11, ...
# columns) and needs a reindex afterwards to restore the original row order.
id_columns = ['sentiment', 'time', 'text', 'cleaned_text', 'label', 'topic_id']

# Rank each topic's terms 1..K in the order they appear in topics_df.
ranked_terms = topics_df.assign(
    term_rank=topics_df.groupby('topic_id').cumcount() + 1
)

# One row per topic, one column per rank. Words and weights are pivoted
# separately so each keeps its own dtype.
words_wide = ranked_terms.pivot(index='topic_id', columns='term_rank', values='word')
weights_wide = ranked_terms.pivot(index='topic_id', columns='term_rank', values='weight')

# Interleave the columns: word_1, weight_1, word_2, weight_2, ...
column_order = []
for rank in words_wide.columns:
    column_order += [f'word_{rank}', f'weight_{rank}']

topic_terms = pd.concat(
    [words_wide.add_prefix('word_'), weights_wide.add_prefix('weight_')],
    axis=1,
)[column_order].reset_index()

# A left join keeps every review exactly once, in the same order as `data`.
output = data[id_columns].merge(topic_terms, on='topic_id', how='left')
output.to_csv("../../data/processed/clean_reviews_w_topics.csv", index = False)