In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
newgroups = fetch_20newsgroups()

In [5]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [7]:
from nltk import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [10]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [11]:
from string import punctuation
translate = {ord(p):u" " for p in punctuation}

In [13]:
def text_to_tokens(text):
  clean_text = text.lower().translate(translate)
  tokens = [token.strip() for token in tokenizer.tokenize(clean_text) ]
  tokens = [token for token in tokens if token not in stopwords]
  stemmedtokens = [stemmer.stem(token) for token in tokens]
  filteredtokens = [token for token in tokens if len(token) > 2]
  return stemmedtokens

In [14]:
dataset = [text_to_tokens(text) for text in newgroups['data'] ]

In [16]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [18]:
# dictionary that maps word to unique id's
# prune_At tells how many max unique words can be stored

In [17]:
from gensim.corpora import Dictionary
dictionary = Dictionary(dataset, prune_at=None)

In [None]:
# remove words if they occur in less than 5 docs, remove if they occur in more than 50% of focs, keep_n to keep only top n words

In [19]:
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)

In [None]:
# remove unused word indices of deleted words

In [20]:
dictionary.compactify()

In [None]:
# bag of words

In [21]:
data = [dictionary.doc2bow(doc) for doc in dataset]

In [22]:
from gensim.models import LdaMulticore

In [23]:
topics = 15

In [25]:
lda = LdaMulticore(data, num_topics=topics, id2word=dictionary, passes=10, workers=4, eval_every=None, batch=True, alpha=5/topics, eta=0.01)

In [27]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [28]:
import pyLDAvis.gensim_models
import pyLDAvis
import warnings
warnings.filterwarnings('ignore')

In [29]:
pyLDAvis.enable_notebook()

In [30]:
pyLDAvis.gensim_models.prepare(lda, data, dictionary)

In [None]:
# coherence score using c_v metric

In [36]:
from gensim.models import CoherenceModel

model = CoherenceModel(model=lda,texts=dataset, dictionary=dictionary, coherence='c_v')

In [37]:
score = model.get_coherence()

In [38]:
score

np.float64(0.5137454336470901)

In [None]:
# perplexity score

In [39]:
lda.log_perplexity(data)

np.float64(-7.884634630739686)