# Install Dependencies and load libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install bertopic
!pip install transformers
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.13.0-py2.py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.9/103.9 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 KB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp38-cp38-manylinux1_x

In [None]:
import pandas as pd
import numpy as np
import re
import json

In [None]:
import nltk
import torch
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from bertopic import BERTopic

# Upload Data

In [None]:
file_name = 'topicwise_LID_masked.csv'

- Read the data file from Google Drive, or from the session storage if you uploaded it directly

In [None]:
# Read the CSV from the mounted Drive folder; rows that cannot be parsed are
# skipped (on_bad_lines='skip') instead of aborting the load.
df = pd.read_csv('/content/drive/MyDrive/topicwise_LID/' + file_name,  on_bad_lines='skip')

# Or uncomment the following if you choose to upload your data file
# (kept as a comment rather than a bare string so the cell shows no stray output).
# df = pd.read_csv(file_name)

FileNotFoundError: ignored

# Processing

- Define some functions for data processing

In [None]:
def clean_punc_newlines(input_text):
    """Blank out special characters, newlines and tabs in a string.

    Every character that is neither an ASCII letter, a digit nor whitespace
    becomes a single space, and so does every newline and tab. Replacement is
    one-for-one: runs of spaces are not collapsed, so the returned string has
    the same length as ``input_text``.
    """
    cleaned = input_text
    # First pass strips special characters, second pass flattens newlines/tabs.
    for pattern in (r'[^a-zA-Z0-9\s]', r'\n|\t'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned


class Processor:
    """Clean one text column of a DataFrame ahead of topic modelling.

    Every method edits the DataFrame passed to the constructor in place;
    read the result back from ``self.df``.
    """

    def __init__(self, df, column):
        """Keep a reference to ``df`` and the name of the text ``column``."""
        self.df = df
        self.column = column

    def delete_null_duplicates(self):
        """Drop rows whose text is duplicated or null, then renumber the index from 0."""
        # delete duplicates
        self.df.drop_duplicates(subset=[self.column], inplace=True, ignore_index=True)
        # delete null value
        self.df.dropna(subset=[self.column], inplace=True)
        self.df.reset_index(drop=True, inplace=True)

    def delete_short_overlapped_sent(self):
        """Drop short rows and rows that overlap with the row that follows them.

        A row is removed when its text has fewer than five whitespace-separated
        words, or when its text with spaces removed contains, or is contained
        in, the space-stripped text of the next row.

        Every value in the column must be a string, so call
        ``delete_null_duplicates`` first if the column may hold nulls.
        The index is not renumbered afterwards.
        """
        # Fetch the column once instead of on every loop iteration.
        texts = self.df[self.column].values
        positions = []
        for i, text in enumerate(texts):
            # filter text with fewer than five words
            too_short = len(text.split()) < 5

            overlapped = False
            if i + 1 < len(texts):
                pre = text.replace(" ", "")
                lat = texts[i + 1].replace(" ", "")
                # filter overlapped texts
                overlapped = pre in lat or lat in pre

            if too_short or overlapped:
                positions.append(i)

        if positions:
            # `positions` are row numbers, whereas drop() expects index labels:
            # translate them so the right rows go even when the index is not 0..n-1.
            self.df.drop(self.df.index[positions], inplace=True)

    def clean_text(self):
        """Add a ``cleaned`` column holding ``clean_punc_newlines`` applied to each text."""
        self.df["cleaned"] = self.df[self.column].apply(clean_punc_newlines)


def mask_stopwords(docs, stopwords):
    """Lower-case each document and replace its English stopwords with ``[MASK]``.

    Args:
        docs: iterable of document strings.
        stopwords: object exposing ``words('english')``; the notebook passes
            ``nltk.corpus.stopwords``.

    Returns:
        A list with one string per input document. Tokens are split on
        whitespace and re-joined with single spaces, so original spacing and
        casing are not preserved.
    """
    # A set makes each membership test constant-time; the stopword list would
    # otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    return [
        ' '.join(
            '[MASK]' if token in stop_words else token
            for token in text.lower().split()
        )
        for text in docs
    ]


def remove_masks(dic):
    """Return a copy of a topic dictionary with the mask placeholder entries removed.

    Args:
        dic: mapping from a topic key to a list of entries whose first item is
            the word (the notebook passes the result of ``model.get_topics()``).

    Returns:
        A new dict with the same keys; each value is a new list without the
        entries whose word is exactly ``'mask'`` or ``'MASK'``. The input
        mapping and its lists are left untouched.
    """
    # Build fresh lists instead of calling list.remove() while iterating:
    # removing during iteration skips the element that follows each removal,
    # so consecutive mask entries would survive.
    return {
        topic: [entry for entry in entries if entry[0] not in ('mask', 'MASK')]
        for topic, entries in dic.items()
    }

- Show column names and decide which column to use for data processing

In [None]:
df.columns

In [None]:
# choose column name here
column_name = 'tweet'

- Process the input text and get a list of documents

In [None]:
# Wrap the loaded frame and the chosen text column; the processor edits `df` in place.
processor = Processor(df, column_name)
# Drop duplicated and null texts first, so no null value reaches the string-based steps below.
processor.delete_null_duplicates()
# Remove very short texts and texts that overlap with the row after them.
processor.delete_short_overlapped_sent()
# Add the "cleaned" column (special characters, newlines and tabs replaced by spaces).
processor.clean_text()
# get list of documents for BERT topic modelling
docs = processor.df.cleaned.to_list()
# mask stopwords
masked_docs = mask_stopwords(docs, stopwords)

# Topic Modelling

In [None]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
n_topics = None

In [None]:
model = BERTopic(embedding_model=sentence_model, nr_topics=n_topics, verbose=True)
doc_topics, prob_topics = model.fit_transform(masked_docs)

In [None]:
# get all potential topics
topics = model.get_topics()

# remove masks
topics = remove_masks(topics)

In [None]:
topics

# Write to json and download

In [None]:
with open('topics.json', 'w', encoding='utf-8') as f:
    json.dump(topics, f, ensure_ascii=False, indent=4)

# BERTopic

In [None]:
print(max(topics))

In [None]:
model.get_topics()

# BERTopic Visualization

In [None]:
fig = model.visualize_topics()

In [None]:
fig

In [None]:
fig.write_html("f1.html")

In [None]:
# NOTE(review): 868 is a hard-coded limit, presumably the topic count of one
# particular run — confirm, or derive it from the fitted model instead.
fig2 = model.visualize_barchart(top_n_topics=868)

In [None]:
model.visualize_barchart(top_n_topics=50)

In [None]:
fig2

In [None]:
fig2.write_html("f2.html")

In [None]:
fig3 = model.visualize_hierarchy()

In [None]:
fig3.write_html("f3.html")

In [None]:

# hierarchical_topics = model.hierarchical_topics(masked_docs, doc_topics)
# print(hierarchical_topics)

# # h_topics = remove_masks(hierarchical_topics)
# # Print topic tree
# tree = model.get_topic_tree(hierarchical_topics)
# print(tree)


In [None]:
fig4 = model.visualize_heatmap()

In [None]:
fig4.write_html("f4.html")

---

# Contextualised Topic Modelling

In [None]:
!pip install contextualized-topic-models==2.2.0

In [None]:
!pip install pyldavis

## Prep

In [None]:
import nltk
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

In [None]:
nltk.download('stopwords')

In [None]:
# , stopwords_language='english'
sp = WhiteSpacePreprocessing(docs)

In [None]:
preprocessed_documents, unpreprocessed_documents, vocab = sp.preprocess()

In [None]:
len(preprocessed_documents)

In [None]:
len(unpreprocessed_documents)

## Train

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

In [None]:
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")

In [None]:
training_dataset = tp.fit(text_for_contextual=unpreprocessed_documents, text_for_bow=preprocessed_documents)

NameError: ignored

In [None]:
training_dataset[0]

In [None]:
# Train the combined topic model on the prepared dataset with 50 topics for 10 epochs.
# NOTE(review): contextual_size=768 presumably has to match the embedding width of the
# sentence model given to TopicModelDataPreparation — confirm before swapping that model.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=50, num_epochs=10)
ctm.fit(training_dataset)

In [None]:
ctm.get_topics()

## Viz

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=2)

In [None]:
import pyLDAvis as vis
movies_pd = vis.prepare(**lda_vis_data)
fig5 = vis.display(movies_pd)

In [None]:
fig5

In [None]:
vis.save_html(movies_pd, 'lda.html')