# Install Dependencies and load libraries

In [None]:
!pip install bertopic

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import re
import json

In [None]:
import nltk
import torch
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
from google.colab import drive
import sys
from google.colab import files

In [None]:
from bertopic import BERTopic

# Upload Data

- You can choose to mount your Google Drive account to Colab and use the data file stored in your account, which is more secure

In [None]:
drive.mount('/content/gdrive', force_remount=True)

In [None]:
sys.path.append('/content/gdrive/MyDrive')

- Alternatively, upload your data file directly by running the cell below

In [None]:
# Open Colab's upload dialog and keep the first value of the mapping it returns
# (presumably the uploaded file's contents - confirm against the Colab docs).
# NOTE(review): `src` is not read again in the visible cells; the data is
# loaded later by file name with pd.read_csv.
src = list(files.upload().values())[0]

- Please specify your file name, which has to be the one you uploaded

In [None]:
file_name = 'MS_samples.csv'

- Read the data file using the option that matches how you provided it (Google Drive or direct upload)

In [None]:
# Option 1 - file stored in the mounted Google Drive:
# uncomment the next line and comment out the Option 2 line.
# df = pd.read_csv('/content/gdrive/MyDrive/' + file_name)

# Option 2 (active by default) - file uploaded directly into the Colab session.
df = pd.read_csv(file_name)

# Processing

- Define some functions for data processing

In [None]:
def clean_punc_newlines(input_text):
    """Return *input_text* with symbols, newlines and tabs turned into spaces.

    Two substitutions are applied in order: first every character that is
    not an ASCII letter, a digit or whitespace becomes a space, then every
    newline or tab becomes a space. Runs of spaces are not collapsed.
    """
    text = input_text
    for pattern in (r'[^a-zA-Z0-9\s]', r'\n|\t'):
        text = re.sub(pattern, ' ', text)
    return text


class Processor:
    """In-place cleaning pipeline for one text column of a DataFrame.

    The frame passed to the constructor is mutated directly (rows are
    dropped and a ``cleaned`` column is added), so the caller's object
    reflects every step that has been run.
    """

    def __init__(self, df, column):
        """Remember the frame to clean and the name of its text column."""
        self.df = df
        self.column = column

    def delete_null_duplicates(self):
        """Drop duplicate and null rows of the text column, in place.

        The index is renumbered from 0 afterwards.
        """
        # delete duplicates
        self.df.drop_duplicates(subset=[self.column], inplace=True, ignore_index=True)
        # delete null value
        self.df.dropna(subset=[self.column], inplace=True)
        self.df.reset_index(drop=True, inplace=True)

    def delete_short_overlapped_sent(self, min_words=5):
        """Drop rows that are too short or overlap with the following row.

        A row is removed when its text has fewer than ``min_words``
        whitespace-separated words, or when, ignoring spaces, its text
        contains or is contained in the text of the next row. In the
        overlap case it is always the earlier row of the pair that is
        removed. The index is not renumbered afterwards.

        Every value in the column must be a string; a null value raises
        ``AttributeError``, so run ``delete_null_duplicates`` first.

        Args:
            min_words: minimum number of words a text needs to be kept.
        """
        # Extract the column once instead of on every loop iteration.
        texts = self.df[self.column].tolist()
        squashed = [text.replace(" ", "") for text in texts]

        positions = set()
        last = len(texts) - 1
        for pos, text in enumerate(texts):
            if len(text.split()) < min_words:
                # filter texts with fewer than min_words words
                positions.add(pos)

            if pos != last:
                pre = squashed[pos]
                lat = squashed[pos + 1]
                if pre in lat or lat in pre:
                    # filter overlapped texts
                    positions.add(pos)

        if not positions:
            return
        # The collected values are row positions; translate them to index
        # labels so the drop also works when the index is not 0..n-1
        # (labels are assumed to be unique).
        self.df.drop(self.df.index[sorted(positions)], inplace=True)

    def clean_text(self):
        """Add a ``cleaned`` column: the text column passed through ``clean_punc_newlines``."""
        self.df["cleaned"] = self.df[self.column].apply(clean_punc_newlines)


def mask_stopwords(docs, stopwords):
    """Replace English stop words in each document with the token ``[MASK]``.

    Each document is lower-cased and split on whitespace; tokens found in
    ``stopwords.words('english')`` are replaced and the tokens are joined
    back with single spaces.

    Args:
        docs: iterable of text strings.
        stopwords: object exposing ``words(language)``, e.g. the
            ``nltk.corpus.stopwords`` reader.

    Returns:
        A list with one masked, lower-cased string per input document.
    """
    # Build the lookup once as a set so each token test is O(1).
    stop_set = set(stopwords.words('english'))
    return [
        ' '.join('[MASK]' if token in stop_set else token
                 for token in text.lower().split())
        for text in docs
    ]


def remove_masks(dic):
    """Return a copy of a topic mapping without its mask entries.

    Args:
        dic: mapping of topic key -> list of entries, where the first
            element of each entry is the term.

    Returns:
        A new dict with the same keys whose lists omit every entry whose
        term is ``'mask'`` or ``'MASK'``. The input lists are left
        untouched.
    """
    mask_terms = ('mask', 'MASK')
    # Build fresh lists instead of calling list.remove() during iteration,
    # which skips the element that follows each removed entry.
    return {
        key: [entry for entry in entries if entry[0] not in mask_terms]
        for key, entries in dic.items()
    }

- Show column names and decide which column to use for data processing

In [None]:
df.columns

In [None]:
# choose column name here
column_name = 'Deal - Detailed description of case'

- Process the input text and get a list of documents

In [None]:
# Run the cleaning steps in order; each one mutates `df` in place.
# Nulls and duplicates are removed first so the later steps only see text values.
processor = Processor(df, column_name)
processor.delete_null_duplicates()
processor.delete_short_overlapped_sent()
processor.clean_text()
# get list of documents for BERT topic modelling (the "cleaned" column added by clean_text)
docs = processor.df.cleaned.to_list()
# mask stopwords: lower-cases each document and replaces English stop words with "[MASK]"
masked_docs = mask_stopwords(docs, stopwords)

# Topic Modelling

In [None]:
# Choose model name and the number of topics
embedding_model = 'bert-base-uncased'
n_topics = None # prefer not to set so that the clustering algorithm decide by itself

In [None]:
# Fit BERTopic on the stop-word-masked documents.
# fit_transform returns two values; only the first is kept (as doc_topics),
# the second is discarded.
model = BERTopic(embedding_model=embedding_model, nr_topics=n_topics, verbose=True)
doc_topics, _ = model.fit_transform(masked_docs)

In [None]:
# get all potential topics
topics = model.get_topics()
# remove masks
topics = remove_masks(topics)

# Write to json and download

In [None]:
# Serialise the topics to topics.json as UTF-8, keeping non-ASCII characters
# readable and indenting by four spaces.
with open('topics.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(topics, ensure_ascii=False, indent=4))

In [None]:
files.download('topics.json')

# Mask Stopwords

In [None]:
nltk_stop_words = stopwords.words('english')

In [None]:
def mask_stopwords(docs, stopwords):
    """Replace English stop words in each document with the token ``[MASK]``.

    Each document is lower-cased and split on whitespace; tokens found in
    ``stopwords.words('english')`` are replaced and the tokens are joined
    back with single spaces.

    The stop-word list is taken from the ``stopwords`` argument rather than
    from a notebook global, so the function works regardless of which
    cells were run before it.

    Args:
        docs: iterable of text strings.
        stopwords: object exposing ``words(language)``, e.g. the
            ``nltk.corpus.stopwords`` reader.

    Returns:
        A list with one masked, lower-cased string per input document.
    """
    # Build the lookup once as a set so each token test is O(1).
    stop_set = set(stopwords.words('english'))
    return [
        ' '.join('[MASK]' if token in stop_set else token
                 for token in text.lower().split())
        for text in docs
    ]

In [None]:
masked_docs = mask_stopwords(docs, stopwords)

In [None]:
print(masked_docs[1])

# BERTopic

In [None]:
# Second BERTopic run: same embedding model name, but this time nr_topics=10
# is requested explicitly (the earlier run passed nr_topics=None).
# This rebinds `model`, replacing the instance fitted earlier.
model = BERTopic(embedding_model='bert-base-uncased', nr_topics=10, verbose=True)

In [None]:
# Fit the new model and keep both values returned by fit_transform.
# NOTE: this rebinds `topics`, which an earlier cell used for the dict that
# was written to topics.json.
topics, probabilities = model.fit_transform(masked_docs)

In [None]:
print(max(topics))

In [None]:
probabilities[9]

In [None]:
model.get_topic_freq()

In [None]:
  model.get_topics()

# BERTopic Visualization

In [None]:
model.visualize_topics()

In [None]:
model.visualize_barchart()

In [None]:
model.visualize_heatmap()

# Contextualised Topic Modelling

In [None]:
!pip install contextualized-topic-models==2.2.0

In [None]:
!pip install pyldavis

## Prep

In [None]:
import nltk
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

In [None]:
nltk.download('stopwords')

In [None]:
sp = WhiteSpacePreprocessing(docs, stopwords_language='english')

In [None]:
preprocessed_documents, unpreprocessed_documents, vocab = sp.preprocess()

In [None]:
len(preprocessed_documents)

In [None]:
len(unpreprocessed_documents)

## Train

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

In [None]:
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")

In [None]:
training_dataset = tp.fit(text_for_contextual=unpreprocessed_documents, text_for_bow=preprocessed_documents)

In [None]:
training_dataset[0]

In [None]:
# Train a CombinedTM with 10 topics for 50 epochs.
# bow_size comes from the vocabulary held by the data-preparation object `tp`.
# NOTE(review): contextual_size=768 is presumably the embedding width of the
# sentence model passed to TopicModelDataPreparation - confirm if that model changes.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=10, num_epochs=50)
ctm.fit(training_dataset)

In [None]:
ctm.get_topics()

## Viz

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=20)

In [None]:
import pyLDAvis as vis
# Prepare the pyLDAvis data built from the trained CTM and display it inline.
# NOTE(review): the name `movies_pd` looks carried over from another example;
# it holds the prepared visualisation for this notebook's documents.
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)