# Install Dependencies and load libraries

In [5]:
!pip install bertopic
!pip install transformers
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import pandas as pd
import numpy as np
import re
import json

In [7]:
import nltk
import torch
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from bertopic import BERTopic

# Upload Data

In [9]:
file_name = 'Aggression_Imbalance_DS.csv'

- Read data file according to how to upload the file

In [10]:
# df = pd.read_csv('/content/gdrive/MyDrive/' + file_name)
'''Or uncomment the following if you choose to upload your data file'''
df = pd.read_csv(file_name)

# Processing

- Define some functions for data processing

In [11]:
def clean_punc_newlines(input_text):
    # delete special character
    punc_free = re.sub(r'[^a-zA-Z0-9\s]', ' ', input_text)
    return re.sub(r'\n|\t', ' ', punc_free)


class Processor:
    def __init__(self, df, column):
        self.df = df
        self.column = column

    def delete_null_duplicates(self):
        # delete duplicates
        self.df.drop_duplicates(subset=[self.column], inplace=True, ignore_index=True)
        # delete null value
        self.df.dropna(subset=[self.column], inplace=True)
        self.df.reset_index(drop=True, inplace=True)

    def delete_short_overlapped_sent(self):
        indices = []
        for i in range(len(self.df[self.column].values)):
            if len(self.df[self.column].values[i].split()) < 5:
                # filter text less than three words
                indices.append(i)

            if i != len(self.df[self.column].values) - 1:
                pre = self.df[self.column].values[i].replace(" ", "")
                lat = self.df[self.column].values[i + 1].replace(" ", "")
                if pre in lat or lat in pre:
                    # filter overlapped texts
                    indices.append(i)

        self.df.drop(indices, inplace=True)

    def clean_text(self):
        self.df["cleaned"] = self.df[self.column].apply(clean_punc_newlines)


def mask_stopwords(docs, stopwords):
    nltk_stop_words = stopwords.words('english')
    masked_docs = []
    for text in docs:
        tokens = text.lower().split()
        for i in range(len(tokens)):
            if tokens[i] in nltk_stop_words:
                tokens[i] = '[MASK]'
        masked_docs.append(' '.join(tokens))
    return masked_docs


def remove_masks(dic):
    filtered_dic = {}
    for k, v in dic.items():
        for i in v:
            if i[0] == 'mask' or i[0] == 'MASK':
                v.remove(i)
        filtered_dic[k] = v
    return filtered_dic

- Show column names and decide which column to use for data processing

In [12]:
df.columns

Index(['Sentence', 'Category'], dtype='object')

In [13]:
# choose column name here
column_name = 'Sentence'

- Processing input text and get a list of documents

In [14]:
processor = Processor(df, column_name)
processor.delete_null_duplicates()
processor.delete_short_overlapped_sent()
processor.clean_text()
# get list of documents for BERT topic modelling
docs = processor.df.cleaned.to_list()
# mask stopwords
masked_docs = mask_stopwords(docs, stopwords)
# docs

# Topic Modelling

In [15]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
n_topics = None

In [16]:
model = BERTopic(embedding_model=sentence_model, nr_topics=n_topics, verbose=True)
doc_topics, prob_topics = model.fit_transform(masked_docs)

Batches:   0%|          | 0/329 [00:00<?, ?it/s]

2022-08-15 15:47:09,734 - BERTopic - Transformed documents to Embeddings
2022-08-15 15:47:37,687 - BERTopic - Reduced dimensionality
2022-08-15 15:47:38,115 - BERTopic - Clustered reduced embeddings


In [17]:
# get all potential topics
topics = model.get_topics()

# remove masks
topics = remove_masks(topics)

In [18]:
topics

{-1: [('people', 0.0064643852915540825),
  ('like', 0.005846983849900188),
  ('india', 0.004993526448228299),
  ('one', 0.004944068892229782),
  ('country', 0.004102600498897237),
  ('get', 0.0038987369009486177),
  ('indian', 0.003862000839855034),
  ('good', 0.0038407143782761117),
  ('even', 0.0038160354436239997)],
 0: [('bjp', 0.030540129627329632),
  ('modi', 0.017824618841702073),
  ('congress', 0.011970246196100952),
  ('anna', 0.009707830268869894),
  ('party', 0.009610582345835937),
  ('pm', 0.008432892754611832),
  ('kejriwal', 0.008409358574310787),
  ('aap', 0.008176547210682322),
  ('india', 0.007312892107687154)],
 1: [('indvspak', 0.06752649233129425),
  ('indvsuae', 0.04864675702942018),
  ('cwc15', 0.03584301956911687),
  ('co', 0.034140951859984454),
  ('hai', 0.03379150527733323),
  ('http', 0.028506896754376097),
  ('ko', 0.027976565905521937),
  ('kohli', 0.025297308626600887),
  ('ki', 0.023569285616033803),
  ('india', 0.0226689146970166)],
 2: [('pakistan', 0.0

# Write to json and download

In [19]:
with open('topics.json', 'w', encoding='utf-8') as f:
    json.dump(topics, f, ensure_ascii=False, indent=4)

# BERTopic

In [20]:
print(max(topics))

85


In [21]:
model.get_topics()

{-1: [('people', 0.0064643852915540825),
  ('like', 0.005846983849900188),
  ('india', 0.004993526448228299),
  ('one', 0.004944068892229782),
  ('country', 0.004102600498897237),
  ('get', 0.0038987369009486177),
  ('indian', 0.003862000839855034),
  ('good', 0.0038407143782761117),
  ('even', 0.0038160354436239997)],
 0: [('bjp', 0.030540129627329632),
  ('modi', 0.017824618841702073),
  ('congress', 0.011970246196100952),
  ('anna', 0.009707830268869894),
  ('party', 0.009610582345835937),
  ('pm', 0.008432892754611832),
  ('kejriwal', 0.008409358574310787),
  ('aap', 0.008176547210682322),
  ('india', 0.007312892107687154)],
 1: [('indvspak', 0.06752649233129425),
  ('indvsuae', 0.04864675702942018),
  ('cwc15', 0.03584301956911687),
  ('co', 0.034140951859984454),
  ('hai', 0.03379150527733323),
  ('http', 0.028506896754376097),
  ('ko', 0.027976565905521937),
  ('kohli', 0.025297308626600887),
  ('ki', 0.023569285616033803),
  ('india', 0.0226689146970166)],
 2: [('pakistan', 0.0

# BERTopic Visualization

In [22]:
model.visualize_topics()

In [23]:
model.visualize_barchart(top_n_topics=20)
# fig.write_html("file.html")

In [24]:
model.visualize_hierarchy()

In [25]:
hierarchical_topics = model.hierarchical_topics(masked_docs, doc_topics)
print(hierarchical_topics)

# h_topics = remove_masks(hierarchical_topics)
# Print topic tree
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

100%|██████████| 85/85 [00:00<00:00, 190.05it/s]


   Parent_ID                         Parent_Name  \
84       170      mask_bjp_india_people_pakistan   
83       169      mask_bjp_india_pakistan_people   
82       168   mask_indvspak_indvsuae_india_http   
81       167      movie_review_kabir_video_watch   
80       166     indvspak_mask_indvsuae_http_hai   
..       ...                                 ...   
4         90          money_notes_banks_black_rs   
3         89  pakistan_army_pak_indian_pakistani   
2         88       kabir_singh_reddy_arjun_movie   
1         87   muslims_muslim_islam_quran_terror   
0         86     review_bhai_best_pratik_reviews   

                                               Topics Child_Left_ID  \
84  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...           167   
83  [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15...           168   
82  [1, 4, 6, 11, 13, 14, 15, 16, 17, 21, 25, 26, ...           145   
81    [7, 12, 20, 22, 24, 31, 34, 37, 41, 50, 53, 77]           126   
80  [1, 4, 6, 11, 13

In [26]:
model.visualize_heatmap()

---

# Contextualised Topic Modelling

In [27]:
!pip install contextualized-topic-models==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
!pip install pyldavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Prep

In [29]:
import nltk
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

In [30]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
sp = WhiteSpacePreprocessing(docs, stopwords_language='english')

In [32]:
preprocessed_documents, unpreprocessed_documents, vocab = sp.preprocess()

In [33]:
len(preprocessed_documents)

10438

In [34]:
len(unpreprocessed_documents)

10438

## Train

In [35]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

In [36]:
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")

In [37]:
training_dataset = tp.fit(text_for_contextual=unpreprocessed_documents, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [38]:
training_dataset[0]

{'X_bow': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]),
 'X_contextual': tensor([-4.0517e-01, -2.2820e-01, -3.3725e-02, -3.7396e-01, -1.2437e-02,
         -4.6936e-02,  1.5041e-01,  3.9746e-01, -4.8150e-02, -2.7647e-01,
         -2.9549e-01, -5.9646e-01, -1.5291e-01, -3.8414e-02, -4.4795e-02,
         -4.2222e-01, -1.4927e-01,  4.9498e-01, -4.1721e-01, -9.9135e-01,
         -3.4259e-01,  2.1728e-01,  2.7737e-01,  2.8769e-01, -1.8721e-01,
         -1.7393e-01,  5.4422e-02,  4.3456e-01,  3.8043e-01, -3.6282e-01,
          1.5098e-02, -1.3608e-01,  4.1560e-01,  1.4708e-01,  3.6475e-01,
         -5.4931e-01, -3.9512e-03, -3.1156e-01, -3.7368e-01, -2.2636e-01,
         -1.1333e-01,  2.2666e-01,  2.7766e-01,  2.1684e-01, -3.1109e-02,
          1.8141e-02, -4.7684e-01, -9.3549e-02, -5.1414e-02, -6.2284e-02,
          2.3986e-01, -5.3388e-02,  5.2697e-02,  1.8612e-01,  3.9992e-01,
          4.0134e-03, -3.5427e-01,  1.5300e-01, -1.7000e-01,  8.0659e-02,
          4.5216e-01, -4.2522e-01,  3.5193e-

In [39]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=10, num_epochs=50)
ctm.fit(training_dataset)

Epoch: [50/50]	 Seen Samples: [521900/521900]	Train Loss: 72.98859264741435	Time: 0:00:02.845872: : 50it [02:38,  3.17s/it]


In [40]:
ctm.get_topics()

defaultdict(list,
            {0: ['religion',
              'right',
              'sonu',
              'beef',
              'loudspeakers',
              'azaan',
              'hindu',
              'noise',
              'nigam',
              'stop'],
             1: ['one',
              'people',
              'like',
              'work',
              'man',
              'woman',
              'get',
              'good',
              'know',
              'every'],
             2: ['india',
              'country',
              'people',
              'muslim',
              'muslims',
              'religion',
              'hindus',
              'pakistan',
              'hindu',
              'terror'],
             3: ['bjp',
              'congress',
              'party',
              'aap',
              'anna',
              'kejriwal',
              'modi',
              'rss',
              'delhi',
              'join'],
             4: ['indvspak',
        

## Viz

In [41]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=20)

Sampling: [20/20]: : 20it [00:47,  2.36s/it]


In [42]:
import pyLDAvis as vis
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working

