# Install Dependencies and load libraries

In [43]:
!pip install bertopic
!pip install transformers
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import pandas as pd
import numpy as np
import re
import json

In [45]:
import nltk
import torch
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus used later by mask_stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
from bertopic import BERTopic

# Upload Data

In [47]:
# Name of the CSV file to analyse; it is passed to pd.read_csv in the next cell.
file_name = 'total_scrapped_data.csv'

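- Read the data file. Pick the `read_csv` line below that matches where the file lives (a mounted Google Drive folder, or a file uploaded next to the notebook).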

In [48]:
# Option 1 - the file lives on a mounted Google Drive: use this line instead.
# df = pd.read_csv('/content/gdrive/MyDrive/' + file_name)
# Option 2 (active) - the file was uploaded next to the notebook.
df = pd.read_csv(file_name)

# Processing

- Define some functions for data processing

In [49]:
def clean_punc_newlines(input_text):
    """Return ``input_text`` with special characters, newlines and tabs blanked out.

    Each character that is not an ASCII letter, a digit or whitespace is
    replaced by one space; afterwards each newline or tab is replaced by one
    space as well. Replacements are one-for-one, so the length is unchanged.
    """
    substitutions = (
        r'[^a-zA-Z0-9\s]',  # special characters
        r'\n|\t',           # newlines and tabs
    )
    cleaned = input_text
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned


class Processor:
    """In-place cleaning steps for a single text column of a DataFrame.

    Every method modifies ``self.df`` (the very object handed to the
    constructor) instead of returning a copy.

    Attributes:
        df: the pandas DataFrame being cleaned.
        column: name of the text column the cleaning steps operate on.
    """

    def __init__(self, df, column):
        self.df = df
        self.column = column

    def delete_null_duplicates(self):
        """Drop rows whose text is duplicated or null, then renumber the index from 0."""
        # delete duplicates
        self.df.drop_duplicates(subset=[self.column], inplace=True, ignore_index=True)
        # delete null value
        self.df.dropna(subset=[self.column], inplace=True)
        self.df.reset_index(drop=True, inplace=True)

    def delete_short_overlapped_sent(self, min_words=5):
        """Drop rows that are too short or that overlap with the following row.

        A row is dropped when either
          * its text has fewer than ``min_words`` whitespace-separated words, or
          * ignoring spaces, its text contains, or is contained in, the text of
            the next row (the earlier row of such a pair is the one dropped).

        Rows are selected by their actual index labels, so the method also works
        when the index is not 0..n-1. Labels are assumed to be unique, because
        dropping by label removes every row carrying that label. The index is
        not renumbered afterwards.

        Args:
            min_words: minimum number of words a text needs in order to be kept.

        NOTE(review): when the *next* row is the contained one, the longer,
        earlier row is still the one removed - confirm this is intended.
        """
        # Look the column up once instead of on every loop iteration.
        texts = self.df[self.column].values
        last_pos = len(texts) - 1
        labels_to_drop = []
        for pos, (label, text) in enumerate(zip(self.df.index, texts)):
            # filter texts with fewer than min_words words
            too_short = len(text.split()) < min_words

            overlapped = False
            if pos != last_pos:
                pre = text.replace(" ", "")
                lat = texts[pos + 1].replace(" ", "")
                # filter overlapped texts
                overlapped = pre in lat or lat in pre

            if too_short or overlapped:
                labels_to_drop.append(label)

        if labels_to_drop:
            self.df.drop(labels_to_drop, inplace=True)

    def clean_text(self):
        """Add a ``cleaned`` column: ``clean_punc_newlines`` applied to each text."""
        self.df["cleaned"] = self.df[self.column].apply(clean_punc_newlines)


def mask_stopwords(docs, stopwords):
    """Lower-case each document and replace its English stop words with ``[MASK]``.

    Args:
        docs: iterable of strings.
        stopwords: object exposing ``words(language)``; it is called with
            ``'english'`` to obtain the stop-word list.

    Returns:
        A list with one string per input document: the lower-cased,
        whitespace-split tokens re-joined with single spaces, with every
        stop word replaced by the literal token ``[MASK]``.
    """
    # A set makes each membership test constant-time instead of a scan of the whole list.
    stop_set = frozenset(stopwords.words('english'))
    return [
        ' '.join('[MASK]' if token in stop_set else token
                 for token in text.lower().split())
        for text in docs
    ]


def remove_masks(dic):
    """Strip the ``mask`` / ``MASK`` pseudo-word from every topic's entry list.

    Args:
        dic: mapping of key -> list of entries whose first element is the word
            (for example ``(word, score)`` tuples).

    Returns:
        A new dict with the same keys. Each value is the *same* list object as
        in ``dic``, filtered in place, so anything else holding a reference to
        those lists sees the filtered content too.
    """
    filtered_dic = {}
    for k, v in dic.items():
        # Build the filtered list first and slice-assign it. Calling
        # v.remove() while iterating over v skips the element right after each
        # removal, which left consecutive mask entries behind.
        v[:] = [entry for entry in v if entry[0] not in ('mask', 'MASK')]
        filtered_dic[k] = v
    return filtered_dic

- Show column names and decide which column to use for data processing

In [50]:
# List the available columns so the text column can be chosen in the next cell.
df.columns

Index(['Sentence', 'Category'], dtype='object')

In [51]:
# choose column name here (one of the names listed by df.columns above)
column_name = 'Sentence'

- Process the input text and build the list of documents

In [52]:
processor = Processor(df, column_name)
# Each cleaning step below modifies df in place.
processor.delete_null_duplicates()
processor.delete_short_overlapped_sent()
processor.clean_text()
# get list of documents for BERT topic modelling
docs = processor.df.cleaned.to_list()
# mask stopwords (docs itself stays unmasked; it is reused by the CTM section)
masked_docs = mask_stopwords(docs, stopwords)

# Topic Modelling

In [53]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model handed to BERTopic in the next cell.
sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Passed as BERTopic's nr_topics; None presumably means no topic reduction - confirm against the BERTopic docs.
n_topics = None

In [54]:
# Fit BERTopic on the stop-word-masked documents.
# NOTE(review): no random seed is set anywhere in this notebook, so the topics
# may differ between runs - confirm whether reproducibility is required.
model = BERTopic(embedding_model=sentence_model, nr_topics=n_topics, verbose=True)
# fit_transform returns two values; presumably per-document topic ids and probabilities - confirm.
doc_topics, prob_topics = model.fit_transform(masked_docs)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2022-08-15 15:59:37,463 - BERTopic - Transformed documents to Embeddings
2022-08-15 16:00:00,315 - BERTopic - Reduced dimensionality
2022-08-15 16:00:00,400 - BERTopic - Clustered reduced embeddings


In [55]:
# get all potential topics
topics = model.get_topics()

# remove masks: drop the 'mask' / 'MASK' entries (presumably produced by the [MASK] stop-word placeholders)
topics = remove_masks(topics)

In [56]:
# Display the filtered topic dictionary.
topics

{-1: [('modi', 0.01732000562130977),
  ('india', 0.01428867139992016),
  ('bjp', 0.011993043285906519),
  ('people', 0.009603478726024808),
  ('pm', 0.009019140725238657),
  ('govt', 0.008482644549235707),
  ('congress', 0.008472767235852779),
  ('http', 0.007115903919134497),
  ('one', 0.0065127226749647675)],
 0: [('people', 0.01657711351214658),
  ('democracy', 0.010972659640450926),
  ('say', 0.009337697968353134),
  ('time', 0.009170306954884),
  ('one', 0.009094053916412105),
  ('like', 0.009034910429785133),
  ('nation', 0.008656818652112227),
  ('right', 0.008613406715322042),
  ('media', 0.008406630831931028)],
 1: [('india', 0.03552541743634113),
  ('savarkar', 0.02026042532020739),
  ('great', 0.018718155968662244),
  ('indian', 0.01780440407794082),
  ('tributes', 0.015291730116139686),
  ('tribute', 0.014250140421781375),
  ('veer', 0.013380263851622227),
  ('give', 0.012880365042966446),
  ('remembering', 0.012536911485228364)],
 2: [('muslims', 0.06873229034991302),
  ('

# Write to json and download

In [57]:
# Save the topics next to the notebook. The integer topic ids become string
# keys in the file, because JSON object keys are always strings.
with open('topics.json', 'w', encoding='utf-8') as f:
    json.dump(topics, f, ensure_ascii=False, indent=4)

# BERTopic

In [58]:
# Largest topic id (iterating a dict yields its keys, which here are integer topic ids).
print(max(topics))

21


In [59]:
# Topic dictionary as currently held by the model.
# NOTE(review): remove_masks edits the lists it receives in place, so this output may already lack the mask entries - confirm.
model.get_topics()

{-1: [('modi', 0.01732000562130977),
  ('india', 0.01428867139992016),
  ('bjp', 0.011993043285906519),
  ('people', 0.009603478726024808),
  ('pm', 0.009019140725238657),
  ('govt', 0.008482644549235707),
  ('congress', 0.008472767235852779),
  ('http', 0.007115903919134497),
  ('one', 0.0065127226749647675)],
 0: [('people', 0.01657711351214658),
  ('democracy', 0.010972659640450926),
  ('say', 0.009337697968353134),
  ('time', 0.009170306954884),
  ('one', 0.009094053916412105),
  ('like', 0.009034910429785133),
  ('nation', 0.008656818652112227),
  ('right', 0.008613406715322042),
  ('media', 0.008406630831931028)],
 1: [('india', 0.03552541743634113),
  ('savarkar', 0.02026042532020739),
  ('great', 0.018718155968662244),
  ('indian', 0.01780440407794082),
  ('tributes', 0.015291730116139686),
  ('tribute', 0.014250140421781375),
  ('veer', 0.013380263851622227),
  ('give', 0.012880365042966446),
  ('remembering', 0.012536911485228364)],
 2: [('muslims', 0.06873229034991302),
  ('

# BERTopic Visualization

In [60]:
# BERTopic's built-in overview plot of the fitted topics.
model.visualize_topics()

In [61]:
# Bar charts limited to 20 topics via top_n_topics.
model.visualize_barchart(top_n_topics=20)
# To save the chart, assign the result first: fig = model.visualize_barchart(top_n_topics=20); fig.write_html("file.html")

In [62]:
# BERTopic's built-in hierarchy plot of the fitted topics.
model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [63]:
# Build the topic hierarchy from the masked documents and their assigned topics.
hierarchical_topics = model.hierarchical_topics(masked_docs, doc_topics)
print(hierarchical_topics)

# NOTE(review): the parent names printed above still contain 'mask';
# remove_masks is not applied to this result (the call below is disabled).
# h_topics = remove_masks(hierarchical_topics)
# Print topic tree
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

100%|██████████| 21/21 [00:00<00:00, 124.54it/s]


   Parent_ID                                 Parent_Name  \
20        42                  mask_india_people_bjp_modi   
19        41                  mask_people_india_bjp_modi   
18        40                mask_rahul_pakistan_modi_bjp   
17        39                    mask_govt_nda_data_india   
16        38                   rahul_mask_modi_gandhi_ji   
15        37           mask_pakistan_bjp_kashmiri_indian   
14        36              mask_bjp_rallies_aap_addressed   
13        35                    mask_govt_nda_data_india   
12        34              mask_people_india_vote_muslims   
11        33               mask_rallies_addressed_aap_3d   
10        32                    mask_nda_data_govt_india   
9         31    pakistan_mask_kashmiri_accused_kashmiris   
8         30    pakistan_mask_kashmiri_kashmiris_kashmir   
7         29            mask_bjp_vote_elections_election   
6         28            bjp_mask_congress_media_disgrace   
5         27            mask_people_musl

In [64]:
# BERTopic's built-in heatmap plot of the fitted topics.
model.visualize_heatmap()


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



---

# Contextualised Topic Modelling

In [65]:
!pip install contextualized-topic-models==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [66]:
!pip install pyldavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Prep

In [67]:
import nltk
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

In [68]:
# Repeat of the stop-word download already performed in the setup section.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [69]:
# Uses the cleaned but unmasked docs from the processing section.
sp = WhiteSpacePreprocessing(docs, stopwords_language='english')

In [70]:
# preprocess() returns three values; going by the names they are the preprocessed
# texts, the matching original texts and the vocabulary - confirm against the CTM docs.
preprocessed_documents, unpreprocessed_documents, vocab = sp.preprocess()

In [71]:
# Number of preprocessed documents.
len(preprocessed_documents)

1999

In [72]:
# Number of unpreprocessed documents; both lists are passed together to tp.fit below.
len(unpreprocessed_documents)

1999

## Train

In [73]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

In [74]:
# Data-preparation helper; the string is presumably the name of the sentence-embedding model to use - confirm.
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")

In [75]:
# Original texts feed the contextual input, preprocessed texts the bag-of-words input.
training_dataset = tp.fit(text_for_contextual=unpreprocessed_documents, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [76]:
# Inspect the first training example.
training_dataset[0]

{'X_bow': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]),
 'X_contextual': tensor([-4.0498e-02, -2.6594e-01,  1.0159e-01, -1.9433e-01,  7.1802e-02,
          1.2669e-01,  9.4211e-02, -2.2931e-01,  1.3224e-01, -5.7586e-02,
         -4.8656e-02, -5.0711e-01,  6.5773e-03,  1.9796e-01, -4.5504e-02,
          3.1948e-01,  2.9208e-01,  3.6723e-01, -1.4929e-01, -6.5634e-01,
         -3.2622e-02, -3.8401e-01,  1.6066e-01,  1.2029e-01,  5.5425e-01,
          6.1781e-02, -2.7916e-01,  1.3890e-01, -3.2175e-01, -3.6257e-01,
          2.1418e-02, -1.3394e-01, -8.9690e-02, -5.4047e-02,  3.1533e-01,
          2.2677e-02, -9.5729e-02, -4.4652e-02, -3.6684e-02,  6.6625e-02,
          1.2913e-01,  3.0115e-01, -3.7200e-03, -4.3566e-02,  3.2983e-02,
          2.4430e-02,  8.4693e-02, -1.1726e-01,  1.2551e-01, -1.8291e-01,
          1.0474e-01, -6.8118e-02,  1.4911e-01, -1.0972e-01,  9.5576e-02,
          9.1948e-02, -1.0358e-02, -5.0415e-02,  1.8001e-01,  3.9566e-01,
         -1.2335e-01, -1.7255e-01,  4.8188e-

In [77]:
# bow_size is taken from the vocabulary built by tp; training runs for 50 epochs.
# NOTE(review): contextual_size=768 presumably has to equal the embedding width
# of the model chosen for tp, and n_components=10 presumably sets the number of topics - confirm.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=10, num_epochs=50)
ctm.fit(training_dataset)

Epoch: [50/50]	 Seen Samples: [99950/99950]	Train Loss: 82.46003952904186	Time: 0:00:00.708773: : 50it [00:48,  1.02it/s]


In [78]:
# Word lists of the fitted CTM topics.
ctm.get_topics()

defaultdict(list,
            {0: ['development',
              'debate',
              'need',
              'mean',
              'leadership',
              'yet',
              'governance',
              'sadhguruquotes',
              'human',
              'women'],
             1: ['gujarat',
              'mps',
              'state',
              'polls',
              'last',
              'across',
              'elections',
              'schools',
              'years',
              'since'],
             2: ['modi',
              'bjp',
              'narendra',
              'ec',
              'pm',
              'attack',
              'shah',
              'arnab',
              'media',
              'pragya'],
             3: ['muslims',
              'muslim',
              'history',
              'nation',
              'hindu',
              'issue',
              'country',
              'life',
              'indian',
              'hate'],
             4: 

## Viz

In [79]:
# Build the keyword arguments that are unpacked into pyLDAvis' prepare() in the next cell.
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=20)

Sampling: [20/20]: : 20it [00:16,  1.22it/s]


In [80]:
import pyLDAvis as vis
# NOTE(review): the name movies_pd looks like a leftover from another example;
# it holds the prepared pyLDAvis data for the CTM topics - consider renaming.
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)