In [13]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# English stop-word collection and a WordNet lemmatizer; both are read as
# module-level names by simple_preprocessing() below.
# NOTE(review): presumably these need the NLTK "stopwords" and "wordnet" data
# packages to be available locally — confirm on a fresh kernel.
stopwords = nltk.corpus.stopwords.words("english")
lemmatizer = nltk.WordNetLemmatizer()

# Toy corpus: ten short one-sentence documents.
# This name is rebound to the preprocessed strings later in the same cell.
corpus = [
    "The government implemented a new policy",
    "The soccer team lost the game",
    "The film received mixed reviews",
    "The economy is experiencing a downturn",
    "The book became a bestseller",
    "The company announced a merger",
    "The concert was sold out",
    "The research paper was published in a prestigious journal",
    "The artist won an award for their painting",
    "The restaurant opened a new branch",
]


def simple_preprocessing(text):
    """Normalise one document before TF-IDF vectorisation.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenised with ``nltk.word_tokenize``,
    filtered against the module-level ``stopwords`` and lemmatised with the
    module-level ``lemmatizer``.

    Args:
        text: Raw document string.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # Punctuation is deleted *before* tokenising, so a contraction such as
    # "don't" reaches the tokenizer as "dont".
    cleaned = re.sub(r"[^\w\s]", "", text.lower())

    # Build a set once per call so each token is checked in constant time
    # instead of rescanning the stop-word sequence for every token.
    stop_set = set(stopwords)
    kept = (token for token in nltk.word_tokenize(cleaned) if token not in stop_set)
    return " ".join(lemmatizer.lemmatize(token) for token in kept)


# Turn the preprocessed documents into a TF-IDF document-term matrix.
# `corpus` is rebound here to the cleaned strings produced by
# simple_preprocessing().
vectorizer = TfidfVectorizer()
corpus = [simple_preprocessing(text) for text in corpus]
X = vectorizer.fit_transform(corpus)

# Preview the first three documents. Slice the sparse rows first so that only
# those rows are converted to a dense array, not the whole matrix.
print(X[:3].toarray())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5182909  0.5182909  0.         0.
  0.         0.         0.44059462 0.         0.         0.
  0.5182909  0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.5       ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.5        0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.         0.         0.         0.5        0.         0.
  0.5        0.         0.         0.        ]]

In [35]:
import numpy as np
# Seed NumPy's global generator; the SVD itself is seeded via random_state.
np.random.seed(0)

# Latent semantic analysis: reduce the TF-IDF matrix to three components.
svd = TruncatedSVD(n_components=3, random_state=0)
latent_semantic_analysis = svd.fit_transform(X)

terms = vectorizer.get_feature_names_out()
N_TERMS = 5
for topic_idx, weights in enumerate(svd.components_):
    # Rank vocabulary terms by their signed weight in this component,
    # largest first, and keep the leading N_TERMS.
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    top_terms = [term for term, _ in ranked[:N_TERMS]]
    print(f"Topic {topic_idx}: ")
    print("|".join(top_terms))
    print(" ")

Topic 0: 
new|government|implemented|policy|branch
 
Topic 1: 
painting|artist|award|government|implemented
 
Topic 2: 
announced|company|merger|became|bestseller
 
