## Topic Modeling: Comparing LDA & BerTopic Results

#### LDA with Gensim

In [1]:
import numpy as np
import pandas as pd
from io import StringIO

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [2]:
nlp= spacy.load("en_core_web_sm")

In [3]:
with open('/Users/tdubon/Documents/Plato_republic_sample.txt', "r") as file:
    lines = [line.strip().lower() for line in file]

In [4]:
doc = []

for i in lines:
    doc = nlp(i)

In [5]:
print(doc)

for which i am indebted to you, i said, now that you have grown gentle towards me and have left off scolding. nevertheless, i have not been well entertained; but that was my own fault and not yours. as an epicure snatches a taste of every dish which is successively brought to table, he not having allowed himself time to enjoy the one before, so have i gone from one subject to another without having discovered what i sought at first, the nature of justice. i left that enquiry and turned away to consider whether justice is virtue and wisdom or evil and folly; and when there arose a further question about the comparative advantages of justice and injustice, i could not refrain from passing on to that. and the result of the whole discussion has been that i know nothing at all. for i know not what justice is, and therefore i am not likely to know whether it is or is not a virtue, nor can i say whether the just man is happy or unhappy.


In [6]:
for token in doc:
    print(token)

for
which
i
am
indebted
to
you
,
i
said
,
now
that
you
have
grown
gentle
towards
me
and
have
left
off
scolding
.
nevertheless
,
i
have
not
been
well
entertained
;
but
that
was
my
own
fault
and
not
yours
.
as
an
epicure
snatches
a
taste
of
every
dish
which
is
successively
brought
to
table
,
he
not
having
allowed
himself
time
to
enjoy
the
one
before
,
so
have
i
gone
from
one
subject
to
another
without
having
discovered
what
i
sought
at
first
,
the
nature
of
justice
.
i
left
that
enquiry
and
turned
away
to
consider
whether
justice
is
virtue
and
wisdom
or
evil
and
folly
;
and
when
there
arose
a
further
question
about
the
comparative
advantages
of
justice
and
injustice
,
i
could
not
refrain
from
passing
on
to
that
.
and
the
result
of
the
whole
discussion
has
been
that
i
know
nothing
at
all
.
for
i
know
not
what
justice
is
,
and
therefore
i
am
not
likely
to
know
whether
it
is
or
is
not
a
virtue
,
nor
can
i
say
whether
the
just
man
is
happy
or
unhappy
.


In [7]:
#remove stop words and punctuation
stop_words = []

def remove_words(doc):
    for token in doc:
        if token.is_punct == True:
            stop_words.append(token)
        
        if token.is_stop == True:
            stop_words.append(token)

In [8]:
remove_words(doc)

In [9]:
print(stop_words)

[for, which, i, am, to, you, ,, i, ,, now, that, you, have, towards, me, and, have, off, ., nevertheless, ,, i, have, not, been, well, ;, but, that, was, my, own, and, not, yours, ., as, an, a, of, every, which, is, to, ,, he, not, himself, to, the, one, before, ,, so, have, i, from, one, to, another, without, what, i, at, first, ,, the, of, ., i, that, and, to, whether, is, and, or, and, ;, and, when, there, a, further, about, the, of, and, ,, i, could, not, from, on, to, that, ., and, the, of, the, whole, has, been, that, i, nothing, at, all, ., for, i, not, what, is, ,, and, therefore, i, am, not, to, whether, it, is, or, is, not, a, ,, nor, can, i, say, whether, the, just, is, or, .]


In [10]:
print(stop_words)

[for, which, i, am, to, you, ,, i, ,, now, that, you, have, towards, me, and, have, off, ., nevertheless, ,, i, have, not, been, well, ;, but, that, was, my, own, and, not, yours, ., as, an, a, of, every, which, is, to, ,, he, not, himself, to, the, one, before, ,, so, have, i, from, one, to, another, without, what, i, at, first, ,, the, of, ., i, that, and, to, whether, is, and, or, and, ;, and, when, there, a, further, about, the, of, and, ,, i, could, not, from, on, to, that, ., and, the, of, the, whole, has, been, that, i, nothing, at, all, ., for, i, not, what, is, ,, and, therefore, i, am, not, to, whether, it, is, or, is, not, a, ,, nor, can, i, say, whether, the, just, is, or, .]


In [11]:
#create list of words
doc_list = []

for token in doc:
    if token not in stop_words:
        doc_list.append(token.text)

In [12]:
print(doc_list)

['indebted', 'said', 'grown', 'gentle', 'left', 'scolding', 'entertained', 'fault', 'epicure', 'snatches', 'taste', 'dish', 'successively', 'brought', 'table', 'having', 'allowed', 'time', 'enjoy', 'gone', 'subject', 'having', 'discovered', 'sought', 'nature', 'justice', 'left', 'enquiry', 'turned', 'away', 'consider', 'justice', 'virtue', 'wisdom', 'evil', 'folly', 'arose', 'question', 'comparative', 'advantages', 'justice', 'injustice', 'refrain', 'passing', 'result', 'discussion', 'know', 'know', 'justice', 'likely', 'know', 'virtue', 'man', 'happy', 'unhappy']


In [13]:
# Creates inputs to topic model -  which is a mapping of word IDs to words.
words = corpora.Dictionary([doc_list]) 

In [14]:
# Turns each document into a bag of words.
corpus = [words.doc2bow(doc_list)]

In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.021*"justice" + 0.021*"know" + 0.021*"left" + 0.021*"said" + '
  '0.021*"likely" + 0.021*"man" + 0.021*"nature" + 0.021*"passing" + '
  '0.021*"refrain" + 0.021*"result"'),
 (1,
  '0.021*"virtue" + 0.021*"know" + 0.021*"justice" + 0.021*"passing" + '
  '0.021*"likely" + 0.021*"result" + 0.021*"scolding" + 0.021*"nature" + '
  '0.021*"left" + 0.021*"man"'),
 (2,
  '0.021*"justice" + 0.021*"know" + 0.021*"refrain" + 0.021*"left" + '
  '0.021*"likely" + 0.021*"man" + 0.021*"nature" + 0.021*"passing" + '
  '0.021*"question" + 0.021*"indebted"'),
 (3,
  '0.021*"virtue" + 0.021*"justice" + 0.021*"nature" + 0.021*"likely" + '
  '0.021*"question" + 0.021*"passing" + 0.021*"result" + 0.021*"left" + '
  '0.021*"man" + 0.021*"know"'),
 (4,
  '0.022*"justice" + 0.022*"know" + 0.021*"virtue" + 0.021*"left" + '
  '0.021*"having" + 0.021*"passing" + 0.021*"epicure" + 0.021*"scolding" + '
  '0.021*"refrain" + 0.021*"enjoy"'),
 (5,
  '0.069*"justice" + 0.052*"know" + 0.035*"left" + 0.035*"hav

#### Bertopic with spacy

In [None]:
!pip install bertopic

In [18]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

In [19]:
nlp= spacy.load("en_core_web_sm")

In [20]:
with open('/Users/tdubon/Documents/Plato_republic_sample.txt', "r") as file:
    lines = [line.strip().lower() for line in file]

In [21]:
doc = []

for i in lines:
    doc = nlp(i)

In [22]:
#remove stop words and punctuation
stop_words = []

def remove_words(doc):
    for token in doc:
        if token.is_punct == True:
            stop_words.append(token)
        
        if token.is_stop == True:
            stop_words.append(token)

In [23]:
remove_words(doc)

In [24]:
#create list of words
doc_list = []

for token in doc:
    if token not in stop_words:
        doc_list.append(token.text)

In [27]:
model = BERTopic(verbose=True)

In [28]:
topic, probabilitites = model.fit_transform(doc_list)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-03-04 19:03:36,685 - BERTopic - Transformed documents to Embeddings
OMP: Info #274: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2022-03-04 19:03:46,964 - BERTopic - Reduced dimensionality with UMAP
2022-03-04 19:03:47,010 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [33]:
model.get_topic_info()                                                       

Unnamed: 0,Topic,Count,Name
0,-1,55,-1_justice_know_virtue_having


In [31]:
model.get_topic(-1)

[('justice', 0.1957267696571512),
 ('know', 0.16155440301154414),
 ('virtue', 0.1218146940827129),
 ('having', 0.1218146940827129),
 ('left', 0.1218146940827129),
 ('', 1e-05),
 ('unhappy', 0.0731882125588209),
 ('consider', 0.0731882125588209),
 ('discussion', 0.0731882125588209),
 ('discovered', 0.0731882125588209)]