In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\ACER
[nltk_data]     NITRO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\ACER
[nltk_data]     NITRO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ACER
[nltk_data]     NITRO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read the data
data = pd.read_csv('news_dataset.csv')

In [3]:
# Use only the 'text' column and drop rows with null values
documents = data['text'].dropna().tolist()

In [4]:
# Initialize stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and not token.isdigit()] # Remove non-alphanumeric tokens and numbers
    tokens = [token for token in tokens if token not in stop_words] # Remove stopwords
    tokens = [stemmer.stem(token) for token in tokens] # Apply stemming
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Apply lemmatization
    return tokens

# Preprocess each document in the list
preprocessed_documents = [preprocess_text(doc) for doc in documents]

In [5]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)

# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [6]:
# Train an LDA model on the corpus with 4 topics using Gensim's LdaModel class
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [7]:
# Empty list to store dominant topic labels for each document
article_labels = []

# Iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
    # Convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # Get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # Determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # Append to the list
    article_labels.append(dominant_topic)

In [8]:
# Create DataFrame
df = pd.DataFrame({"The Article": documents, "The Topic": article_labels})

# Print the DataFrame
print("Table with the Articles and Topic:")
print(df)
print()

Table with the Articles and Topic:
                                             The Article  The Topic
0      I was wondering if anyone out there could enli...          1
1      I recently posted an article asking what kind ...          1
2      \nIt depends on your priorities.  A lot of peo...          1
3      an excellent automatic can be found in the sub...          1
4      : Ford and his automobile.  I need information...          1
...                                                  ...        ...
11091  Secrecy in Clipper Chip\n\nThe serial number o...          3
11092  Hi !\n\nI am interested in the source of FEAL ...          3
11093  The actual algorithm is classified, however, t...          2
11094  \n\tThis appears to be generic calling upon th...          2
11095  \nProbably keep quiet and take it, lest they g...          2

[11096 rows x 2 columns]



In [9]:
# Print the top terms for each topic
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"-{word.strip()} (weight: {weight.strip()})")
    print()

# Display the coherence score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Top Terms for Each Topic:
Topic 0:
-"q" (weight: 0.042)
-"max" (weight: 0.040)
-"g" (weight: 0.026)
-"r" (weight: 0.025)
-"p" (weight: 0.021)
-"db" (weight: 0.019)
-"n" (weight: 0.017)
-"k" (weight: 0.014)
-"w" (weight: 0.014)
-"c" (weight: 0.012)

Topic 1:
-"use" (weight: 0.010)
-"get" (weight: 0.008)
-"one" (weight: 0.008)
-"would" (weight: 0.007)
-"like" (weight: 0.007)
-"know" (weight: 0.005)
-"window" (weight: 0.005)
-"game" (weight: 0.005)
-"run" (weight: 0.004)
-"work" (weight: 0.004)

Topic 2:
-"would" (weight: 0.009)
-"peopl" (weight: 0.008)
-"one" (weight: 0.008)
-"say" (weight: 0.005)
-"think" (weight: 0.005)
-"know" (weight: 0.005)
-"like" (weight: 0.004)
-"go" (weight: 0.004)
-"time" (weight: 0.004)
-"make" (weight: 0.004)

Topic 3:
-"x" (weight: 0.018)
-"key" (weight: 0.012)
-"use" (weight: 0.011)
-"encrypt" (weight: 0.008)
-"file" (weight: 0.007)
-"program" (weight: 0.006)
-"system" (weight: 0.006)
-"inform" (weight: 0.006)
-"secur" (weight: 0.005)
-"chip" (weight: 0.005