In [1]:
#import necessary libraries for text preprocessing
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
#import topic modelling as well
from gensim import corpora
from gensim.models import LdaModel

In [3]:
# Download the NLTK resources used by this notebook (each is skipped when already up to date):
#   stopwords - English stop-word list
#   punkt     - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables that word_tokenize looks up on newer NLTK releases
#   wordnet   - lexical database used by WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Qeme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Toy corpus for this demo: five short news headlines
# (two about tennis, three about Biden's virus plans).
documents = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]

In [5]:
# Preprocessing setup used by preprocess_text.
# English stop-word list from NLTK, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Lemmatizer instance, created once and reused for every token
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lowercase the text, then tokenize it with ``word_tokenize``;
      2. drop tokens that are not purely alphanumeric (punctuation and
         tokens containing it are removed; digits are kept);
      3. drop tokens found in the module-level ``stop_words`` set;
      4. lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: the document to preprocess.

    Returns:
        A list of token strings, in their original order.
    """
    lowered = text.lower()
    cleaned = []
    for word in word_tokenize(lowered):
        # skip punctuation and mixed tokens
        if not word.isalnum():
            continue
        # skip stop words
        if word in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

# Run every headline through preprocess_text; one token list per document
preprocessed_documents = list(map(preprocess_text, documents))
preprocessed_documents

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

In [6]:
# Build the vocabulary: a gensim Dictionary over all preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)
# Encode each preprocessed document as a bag of words using that dictionary
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

In [7]:
# Train the LDA topic model.
#   corpus       : bag-of-words representation of the documents
#   num_topics   : number of topics to be extracted by the model
#   id2word      : dictionary mapping word IDs back to words
#   passes       : number of full passes through the corpus during training
#   random_state : fixed seed; training is stochastic, so without it the
#                  topic numbering and term weights change between runs
lda_model = LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [8]:
# Assign each document its dominant (most probable) topic.

# Dominant topic id per document, in the same order as `documents`
article_labels = []

# `corpus` already holds the bag-of-words vector of every preprocessed
# document, so iterate over it instead of rebuilding each vector.
for bow in corpus:
    # list of (topic_id, probability) pairs for this document
    topics = lda_model.get_document_topics(bow)
    # keep the id of the pair with the highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    article_labels.append(dominant_topic)

In [10]:
# Pair every headline with its dominant topic in a two-column table
topic_table = {"Article": documents, "Topic": article_labels}
df = pd.DataFrame(data=topic_table)

# Show the table under a heading
print("Table with Articles and Topic:")
print(df)

Table with Articles and Topic:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      1
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      0
4                    Where Biden's Virus Plan Stands      0


In [11]:
# Print the top terms for each topic
print("Top Terms for Each Topic:")
# formatted=False yields (topic_id, [(word, weight), ...]) pairs, so the
# terms are read directly instead of re-parsing the '0.166*"biden" + ...'
# display string (which would break on tokens containing '+' or '*').
for idx, topic_terms in lda_model.show_topics(num_topics=20, num_words=10, formatted=False):
    print(f"Topic {idx}:")
    for word, weight in topic_terms:
        # same rendering as the formatted string: quoted word, 3-decimal weight
        print(f'- "{word}" (weight: {weight:.3f})')
    print()

Top Terms for Each Topic:
Topic 0:
- "biden" (weight: 0.166)
- "virus" (weight: 0.166)
- "plan" (weight: 0.119)
- "reality" (weight: 0.071)
- "meet" (weight: 0.071)
- "announces" (weight: 0.071)
- "measure" (weight: 0.071)
- "stand" (weight: 0.071)
- "australian" (weight: 0.024)
- "rafael" (weight: 0.024)

Topic 1:
- "open" (weight: 0.131)
- "nadal" (weight: 0.131)
- "rafael" (weight: 0.131)
- "federer" (weight: 0.079)
- "roger" (weight: 0.079)
- "missing" (weight: 0.079)
- "join" (weight: 0.079)
- "australian" (weight: 0.079)
- "virus" (weight: 0.027)
- "biden" (weight: 0.027)

