In [1]:
%pip install spacy
%pip install gensim



In [2]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's small English pipeline. The model is downloaded only when it is
# not already installed, so re-running this cell does not hit the network
# (and still works offline once the model is present).
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Documents

In [4]:
# Toy corpus: two sentences about machine learning followed by three short
# "I am ..." phrases.
documents = [
    "Machine learning involves algorithms that improve automatically through experience.",
    "Deep learning is a subset of machine learning based on artificial neural networks.",
    "I am ironman",
    "I am the eggman",
    "I am the Walrus"
]

# Preprocess the documents and build the LDA model

In [6]:

def preprocess(text):
    """Return the lemmas of the content words in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    contributes its lemma to the result only if it is purely alphabetic and
    is not a stop word; lemmas are returned in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip punctuation/numbers (non-alphabetic) and stop words.
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)
    return lemmas

# Apply preprocessing to all documents
processed_documents = [preprocess(text) for text in documents]

# Create a dictionary (token -> integer id) and encode every document as a
# bag-of-words list of (token_id, count) pairs for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(tokens) for tokens in processed_documents]

# Build LDA model.
# LDA training is stochastic: without a fixed random_state the topics change
# on every run, so the notebook would not be reproducible. Outputs recorded
# from an earlier unseeded run will not match a seeded re-run exactly.
NUM_TOPICS = 5
NUM_PASSES = 18
RANDOM_SEED = 42
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_SEED,
)



# Print topics and their keywords

In [7]:
# Show every topic as a weighted list of its top words
# (arguments spelled out; these are print_topics' defaults).
pprint(lda_model.print_topics(num_topics=20, num_words=10))

[(0,
  '0.118*"machine" + 0.118*"experience" + 0.118*"improve" + '
  '0.118*"automatically" + 0.118*"involve" + 0.118*"algorithm" + '
  '0.118*"learning" + 0.020*"eggman" + 0.020*"ironman" + 0.020*"Walrus"'),
 (1,
  '0.231*"Walrus" + 0.231*"ironman" + 0.038*"eggman" + 0.038*"machine" + '
  '0.038*"learning" + 0.038*"neural" + 0.038*"improve" + 0.038*"subset" + '
  '0.038*"automatically" + 0.038*"algorithm"'),
 (2,
  '0.063*"eggman" + 0.063*"ironman" + 0.063*"Walrus" + 0.062*"machine" + '
  '0.062*"learning" + 0.062*"network" + 0.062*"automatically" + '
  '0.062*"artificial" + 0.062*"involve" + 0.062*"neural"'),
 (3,
  '0.167*"learning" + 0.091*"base" + 0.091*"deep" + 0.091*"artificial" + '
  '0.091*"network" + 0.091*"subset" + 0.091*"neural" + 0.091*"machine" + '
  '0.091*"eggman" + 0.015*"ironman"'),
 (4,
  '0.063*"eggman" + 0.063*"ironman" + 0.063*"Walrus" + 0.062*"machine" + '
  '0.062*"learning" + 0.062*"algorithm" + 0.062*"involve" + 0.062*"neural" + '
  '0.062*"artificial" + 0.06

# Assign topics to documents

In [8]:
# Assign topics to documents: print each document's topic distribution as a
# list of (topic_id, probability) pairs. Iterates the bag-of-words corpus
# directly instead of indexing it by position.
for doc_number, bow in enumerate(corpus, start=1):
    print(f"Document {doc_number} - Topic: {lda_model.get_document_topics(bow)}")

Document 1 - Topic: [(0, 0.89970374), (1, 0.025009206), (2, 0.02501596), (3, 0.02525514), (4, 0.02501596)]
Document 2 - Topic: [(0, 0.020151647), (1, 0.020008352), (2, 0.02001448), (3, 0.919811), (4, 0.02001448)]
Document 3 - Topic: [(0, 0.10001111), (1, 0.5999011), (2, 0.10003966), (3, 0.1000085), (4, 0.10003966)]
Document 4 - Topic: [(0, 0.100030094), (1, 0.1000621), (2, 0.10010874), (3, 0.5996905), (4, 0.100108586)]
Document 5 - Topic: [(0, 0.10001111), (1, 0.5999011), (2, 0.100039646), (3, 0.1000085), (4, 0.10003965)]


#                   