# Lab Assignment 3: Topic Modeling (News Dataset)
### Tariq Walid Bin Abd Aziz (SW01083016)
### Montasir Kamal Eldin Mohamed (IS01080844)

In [4]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import cleantext
# For topic modeling
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
df = pd.read_csv('news_dataset.csv')
df['text'] = df['text'].str.lower()
documents = df['text'].tolist()

In [44]:
stop_words = set(stopwords.words('english')) # Create a set of English stopwords
lemmatizer = WordNetLemmatizer() # Initialize a WordNet lemmatizer

def preprocess_text(text):
    tokens = cleantext.clean_words(str(text),
                clean_all= False, # Execute all cleaning operations
                reg=r'\b[a-zA-Z]{1,2}\b', # Remove parts of text based on regex
                reg_replace='', # String to replace the regex used in reg
                extra_spaces=True ,  # Remove extra white spaces 
                stemming=False , # Stem the words
                stopwords=True ,# Remove stop words
                lowercase=True ,# Convert to lowercase
                numbers=True ,# Remove all digits 
                punct=True ,# Remove all punctuations
                stp_lang='english'  # Language for stop words
                )
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
    
preprocessed_documents = [preprocess_text(doc) for doc in documents] # Preprocess each document in the list

print(preprocessed_documents[0])

['wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sport', 'car', 'looked', 'late', 'early', 'called', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail']


In [45]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)
# Filter out tokens that appear in less than 15 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=15, no_above=0.5)
# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [46]:
# Run LDA
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15) # Train an LDA modelon the corpus with 4 topics using Gensim's LdaModel class

In [47]:
# empty list to store dominant topic labels for each document
article_labels = []

# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
    # for each document, convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # append to the list
    article_labels.append(dominant_topic)
    
# Create DataFrame
df_result = pd.DataFrame({"Article": documents, "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      i was wondering if anyone out there could enli...      0
1      i recently posted an article asking what kind ...      0
2      \nit depends on your priorities.  a lot of peo...      0
3      an excellent automatic can be found in the sub...      0
4      : ford and his automobile.  i need information...      0
...                                                  ...    ...
11309  secrecy in clipper chip\n\nthe serial number o...      2
11310  hi !\n\ni am interested in the source of feal ...      2
11311  the actual algorithm is classified, however, t...      0
11312  \n\tthis appears to be generic calling upon th...      0
11313  \nprobably keep quiet and take it, lest they g...      0

[11314 rows x 2 columns]



In [48]:
# Print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()

Top terms for Topic #0:
['would', 'one', 'people', 'get', 'know', 'like', 'think', 'government', 'time', 'right']

Top terms for Topic #1:
['armenian', 'game', 'year', 'team', 'new', 'first', 'turkish', 'player', 'israeli', 'israel']

Top terms for Topic #2:
['max', 'file', 'use', 'system', 'key', 'window', 'b', 'program', 'one', 'information']

Top terms for Topic #3:
['one', 'people', 'god', 'would', 'say', 'think', 'know', 'many', 'believe', 'christian']



In [49]:
# Print the top terms for each topic with weight
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "would" (weight: 0.011)
- "one" (weight: 0.010)
- "people" (weight: 0.008)
- "get" (weight: 0.007)
- "know" (weight: 0.007)
- "like" (weight: 0.007)
- "think" (weight: 0.007)
- "government" (weight: 0.006)
- "time" (weight: 0.006)
- "right" (weight: 0.005)

Topic 1:
- "armenian" (weight: 0.010)
- "game" (weight: 0.010)
- "year" (weight: 0.008)
- "team" (weight: 0.008)
- "new" (weight: 0.005)
- "first" (weight: 0.005)
- "turkish" (weight: 0.005)
- "player" (weight: 0.005)
- "israeli" (weight: 0.005)
- "israel" (weight: 0.005)

Topic 2:
- "max" (weight: 0.011)
- "file" (weight: 0.010)
- "use" (weight: 0.009)
- "system" (weight: 0.008)
- "key" (weight: 0.008)
- "window" (weight: 0.006)
- "b" (weight: 0.006)
- "program" (weight: 0.006)
- "one" (weight: 0.006)
- "information" (weight: 0.005)

Topic 3:
- "one" (weight: 0.011)
- "people" (weight: 0.010)
- "god" (weight: 0.010)
- "would" (weight: 0.009)
- "say" (weight: 0.006)
- "think" (weight: 0.006)
- "k

## Topic Coherence Score

In [50]:
# import library for Coherence Score
from gensim.models.coherencemodel import CoherenceModel

In [54]:
# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# Display the score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.4640


## Finding & Discussion

#### The terms can be group based on the following topics:
#### Topic 0: Politics
#### Topic 1: International Sports
#### Topic 2: Software Systems
#### Topic 3: Religion
#### A coherence score of 0.4 indicates moderate coherence. The topics are fairly interpretable, but there is still room for improving the model to achieve clearer and more meaningful topics.