In [None]:
# Apply topic modeling (LDA) to extract key topics from a set of documents.

!pip install spacy scikit-learn nltk --quiet
!python -m spacy download en_core_web_sm

import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download the NLTK "stopwords" package so stopwords.words() can read it.
nltk.download('stopwords')
# Keep the English stop words in a set; it is used for membership tests
# when filtering lemmas during preprocessing.
nltk_stopwords = set(stopwords.words('english'))

# Load the small English spaCy pipeline (installed by the
# `spacy download en_core_web_sm` command at the top of the cell).
import en_core_web_sm
nlp = en_core_web_sm.load()

# Toy corpus for the topic model: ten single-sentence documents.
documents = [
    "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
    "Call me Ishmael. Some years ago—never mind how long precisely—I thought I would sail about a little and see the watery part of the world.",
    "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness.",
    "All happy families are alike; each unhappy family is unhappy in its own way.",
    "He had never believed in ghosts until he found himself walking the halls of the empty manor at midnight.",
    "The sky above the port was the color of television, tuned to a dead channel.",
    "I am an invisible man. No, I am not a spook like those who haunted Edgar Allan Poe.",
    "It was a bright cold day in April, and the clocks were striking thirteen.",
    "Once upon a time, there was a boy who lived in a cupboard under the stairs.",
    "She had always known she was different, ever since she first heard the voices no one else could hear."
]

# Reduce every document to a space-separated string of content lemmas.
# The texts are lowercased and streamed through nlp.pipe, which processes
# them in batches instead of invoking the full pipeline once per document.
cleaned_docs = []
for spacy_doc in nlp.pipe(text.lower() for text in documents):
    # Keep purely alphabetic tokens that neither spaCy nor NLTK treats as
    # a stop word (the NLTK check is applied to the lemma, not the surface
    # form).
    tokens = [
        token.lemma_
        for token in spacy_doc
        if token.is_alpha
        and not token.is_stop
        and token.lemma_ not in nltk_stopwords
    ]
    # CountVectorizer expects one string per document, so re-join lemmas.
    cleaned_docs.append(" ".join(tokens))

# Build the document-term count matrix from the cleaned documents.
# min_df=1: the corpus is only ten one-sentence documents, so requiring a
# term to occur in at least two documents (min_df=2) prunes the vocabulary
# down to a handful of words and every topic ends up listing the same
# terms. max_df=0.95 still drops terms present in nearly every document.
vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
X = vectorizer.fit_transform(cleaned_docs)

# Fit a 3-topic LDA model; random_state is fixed so runs are reproducible.
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

# Report the highest-weighted terms of each fitted topic.
print("\n Topics Identified:\n")
words = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    # argsort() orders indices by ascending weight; reverse it so the
    # strongest term comes first, then keep at most five.
    top_indices = topic.argsort()[::-1][:5]
    top_words = [words[i] for i in top_indices]
    print(f"Topic {idx+1}: {', '.join(top_words)}")




Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to /Users/itami-
[nltk_data]     macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



🔍 Topics Identified:

Topic 1: man, good, time
Topic 2: time, good, man
Topic 3: time, man, good
