In [1]:
# Import necessary libraries
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Win10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Load dataset (use only the 'text' column)
df = pd.read_csv(r"C:\Users\Win10\OneDrive\Documents\UNITEN\Txt Analytics\Lab Assignment 3\news_dataset.csv", usecols=['text'])
df.dropna(inplace=True)  # Remove null values

In [15]:
# Define text pre-processing function
def preprocess_text(text):
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords & lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    
    return processed_tokens

In [17]:
# Apply text pre-processing
df['processed_text'] = df['text'].apply(preprocess_text)

In [24]:
# Create dictionary & corpus for LDA
dictionary = corpora.Dictionary(df['processed_text'])
corpus = [dictionary.doc2bow(text) for text in df['processed_text'] if isinstance(text, list)]

In [26]:
print(df['processed_text'].head())

0    [wondering, anyone, could, enlighten, car, saw...
1    [recently, posted, article, asking, kind, rate...
2    [depends, priority, lot, people, put, higher, ...
3    [excellent, automatic, found, subaru, legacy, ...
4    [ford, automobile, need, information, whether,...
Name: processed_text, dtype: object


In [28]:
corpus = [dictionary.doc2bow(text) for text in df['processed_text'].tolist()]

In [30]:
# Train LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

In [31]:
# Evaluate coherence score
coherence_model = CoherenceModel(model=lda_model, texts=df['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

In [32]:
# Display topics
print("\n--- LDA Topic Model Results ---")
for idx, topic in lda_model.show_topics(formatted=True):
    print(f"Topic {idx + 1}: {topic}")


--- LDA Topic Model Results ---
Topic 1: 0.021*"key" + 0.010*"chip" + 0.009*"encryption" + 0.007*"government" + 0.007*"system" + 0.007*"clipper" + 0.006*"use" + 0.006*"information" + 0.005*"security" + 0.005*"public"
Topic 2: 0.040*"1" + 0.024*"0" + 0.020*"2" + 0.016*"3" + 0.016*"db" + 0.011*"4" + 0.010*"5" + 0.009*"7" + 0.009*"x" + 0.008*"q"
Topic 3: 0.009*"would" + 0.008*"one" + 0.007*"people" + 0.005*"think" + 0.004*"like" + 0.004*"know" + 0.004*"time" + 0.004*"say" + 0.004*"right" + 0.004*"year"
Topic 4: 0.007*"one" + 0.007*"get" + 0.006*"would" + 0.006*"like" + 0.006*"use" + 0.006*"know" + 0.005*"window" + 0.005*"file" + 0.004*"problem" + 0.004*"work"


In [33]:
# Display coherence score
print(f"\n--- Coherence Score ---")
print(f"Coherence Score: {coherence_score:.4f}")


--- Coherence Score ---
Coherence Score: 0.6717


In [34]:
# Student details:
print("\n--- Student Information ---")
print("Name: Nur Afiqah Najihah Binti Mohd Nasir")
print("Student ID: IS01083539")


--- Student Information ---
Name: Nur Afiqah Najihah Binti Mohd Nasir
Student ID: IS01083539


In [35]:
# Interpretation of coherence score:
print("\n--- Coherence Score Interpretation ---")
print("The coherence score measures the quality of the generated topics.")
print("A higher coherence score indicates that the words within each topic are more logically related, making interpretation easier.")
print("If the score is low, it suggests that topics might be inconsistent, requiring adjustments in preprocessing or the number of topics.")


--- Coherence Score Interpretation ---
The coherence score measures the quality of the generated topics.
A higher coherence score indicates that the words within each topic are more logically related, making interpretation easier.
If the score is low, it suggests that topics might be inconsistent, requiring adjustments in preprocessing or the number of topics.
