# **Topic Modeling with LDA (gensim)**

In [3]:
# Install gensim if not already installed
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel

## **1. Sample Documents**

In [2]:
documents = [
    "I love watching movies about superheroes and action.",
    "The new Python library for machine learning is amazing.",
    "The election results will impact government policies.",
    "Deep learning and neural networks are advancing AI research.",
    "The food at the restaurant was delicious and the service was excellent.",
    "Political debates are heating up before the elections.",
    "Natural language processing is a key part of data science.",
    "We enjoyed the new Italian restaurant in town.",
    "Artificial intelligence is transforming industries.",
    "Sports events bring people together in celebration."
]

In [3]:
df = pd.DataFrame(documents, columns=["Document"])
df

Unnamed: 0,Document
0,I love watching movies about superheroes and a...
1,The new Python library for machine learning is...
2,The election results will impact government po...
3,Deep learning and neural networks are advancin...
4,The food at the restaurant was delicious and t...
5,Political debates are heating up before the el...
6,Natural language processing is a key part of d...
7,We enjoyed the new Italian restaurant in town.
8,Artificial intelligence is transforming indust...
9,Sports events bring people together in celebra...


## **2. Preprocessing**

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
stop_words = set(stopwords.words("english"))

In [8]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

In [9]:
processed_docs = [preprocess(doc) for doc in documents]
processed_docs[:3]

[['love', 'watching', 'movies', 'superheroes', 'action'],
 ['new', 'python', 'library', 'machine', 'learning', 'amazing'],
 ['election', 'results', 'impact', 'government', 'policies']]

## **3. Dictionary and Corpus**

In [10]:
# Create dictionary (word -> id)
dictionary = corpora.Dictionary(processed_docs)

In [11]:
# Create corpus (Bag of Words representation)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [12]:
print("Sample Vocabulary:", dictionary.token2id)
print("Corpus Example:", corpus[0])

Sample Vocabulary: {'action': 0, 'love': 1, 'movies': 2, 'superheroes': 3, 'watching': 4, 'amazing': 5, 'learning': 6, 'library': 7, 'machine': 8, 'new': 9, 'python': 10, 'election': 11, 'government': 12, 'impact': 13, 'policies': 14, 'results': 15, 'advancing': 16, 'ai': 17, 'deep': 18, 'networks': 19, 'neural': 20, 'research': 21, 'delicious': 22, 'excellent': 23, 'food': 24, 'restaurant': 25, 'service': 26, 'debates': 27, 'elections': 28, 'heating': 29, 'political': 30, 'data': 31, 'key': 32, 'language': 33, 'natural': 34, 'part': 35, 'processing': 36, 'science': 37, 'enjoyed': 38, 'italian': 39, 'town': 40, 'artificial': 41, 'industries': 42, 'intelligence': 43, 'transforming': 44, 'bring': 45, 'celebration': 46, 'events': 47, 'people': 48, 'sports': 49, 'together': 50}
Corpus Example: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


## **4. Train LDA Model**

In [13]:
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, random_state=42, passes=15)

In [14]:
# Print discovered topics
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx+1}: {topic}")

Topic 1: 0.051*"results" + 0.051*"government" + 0.051*"election" + 0.051*"policies" + 0.051*"impact"
Topic 2: 0.033*"learning" + 0.033*"deep" + 0.033*"research" + 0.033*"ai" + 0.033*"networks"
Topic 3: 0.034*"new" + 0.034*"restaurant" + 0.034*"amazing" + 0.034*"python" + 0.034*"library"


## **5. Topic Distribution for Each Document**

In [15]:
for i, row in enumerate(lda_model[corpus]):
    print(f"Document {i+1} -> {row}")

Document 1 -> [(0, 0.056164283), (1, 0.88788366), (2, 0.055952024)]
Document 2 -> [(0, 0.04812172), (1, 0.05036879), (2, 0.90150946)]
Document 3 -> [(0, 0.88838905), (1, 0.055802215), (2, 0.055808757)]
Document 4 -> [(0, 0.042115998), (1, 0.9150428), (2, 0.042841204)]
Document 5 -> [(0, 0.056145545), (1, 0.057511736), (2, 0.88634276)]
Document 6 -> [(0, 0.067389295), (1, 0.06712277), (2, 0.865488)]
Document 7 -> [(0, 0.042112425), (1, 0.04194608), (2, 0.9159415)]
Document 8 -> [(0, 0.05616339), (1, 0.8840536), (2, 0.05978302)]
Document 9 -> [(0, 0.8660616), (1, 0.06696524), (2, 0.06697313)]
Document 10 -> [(0, 0.04813796), (1, 0.9039053), (2, 0.047956802)]


## **6. Assign Dominant Topic**

In [16]:
def get_dominant_topic(ldamodel, corpus):
    dominant_topics = []
    for row in ldamodel[corpus]:
        row = sorted(row, key=lambda x: x[1], reverse=True)
        dominant_topics.append(row[0][0])
    return dominant_topics

In [17]:
df["Dominant_Topic"] = get_dominant_topic(lda_model, corpus)
df

Unnamed: 0,Document,Dominant_Topic
0,I love watching movies about superheroes and a...,1
1,The new Python library for machine learning is...,2
2,The election results will impact government po...,0
3,Deep learning and neural networks are advancin...,1
4,The food at the restaurant was delicious and t...,2
5,Political debates are heating up before the el...,2
6,Natural language processing is a key part of d...,2
7,We enjoyed the new Italian restaurant in town.,1
8,Artificial intelligence is transforming indust...,0
9,Sports events bring people together in celebra...,1


## **7. Visualize Topics**

In [18]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [19]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

  return datetime.utcnow().replace(tzinfo=utc)


In [20]:
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
lda_vis

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
