# **Topic Modelling**

### **Identification of Emerging Topics and Issues**

> Topic modeling can help identify emerging or trending topics within the Malaysian political landscape on Twitter. This analysis can highlight the issues that gain significant attention and generate discussions, providing insights into the evolving concerns and interests of the Twitter community regarding politics in Malaysia.

##### **General Imports**

In [193]:
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats

##### **Preprocessing**

In [194]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from textblob import TextBlob
from textblob import WordList

import nltk
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

stopwords_set = set(stopwords.words('indonesian'))
stopwords_set.update(set(stopwords.words('english')))
additional_stopwords = {
    "kita",
    "yall",
    "kami",
    "saya",
    "dia",
    "tapi",
    "tak",
    "ini",
    "itu",
    "kalau",
    "bukan",
    "huh",
    "oh",
    "ayo",
    "rt",  # retweet
    "lrt",  # last retweet
    "im",
    "gon",
    "na",
    'ca',
    'nt',
    'wan',
    'na',
    'lol',
    'lmao',
    'rofl',
    'lmfao',
    'hi',
    'hello',
    'haha',
    'hahaha',
    'eh',
    'dah',
    'la',
    'lah',
    "ka",
    "ke",
    "kah",
    "aku",
    "kau",
    "guys",
    "nak", "tu", "this", "yg", "pa", "je", "yb", "ni", "benda", "dgn", "utk", "jd", "cina"
}

short_forms = {
    "irl": "in real life",
    "u": "you",
    "tpm": "dpm",
    "malaysians": "malaysian",
    "ds": "dato seri",
    "pm": "Prime Minister",
    "PM": "Prime Minister",
    "pm10": "Prime Minister 10",
    "PM10": "Prime Minister 10",
    "PMX": "Prime Minister 10",
    "pmx": "Prime Minister 10",
    "congrats": "congratulations",
    "congratulation": "congratulations",
    "tahniah": "congratulations",
    "btw": "by the way",
    "omg": "oh my god",
    "ni": "this",
    "nt": "not",
    "msia": "malaysia",
    "gov": "government",
    "govt": "government",
    "pls": "please",
    "pru": "General Election",
    "pru15": "General Election 15",
    "ge": "General Election",
    "ge15": "General Election 15",
    "kl": "kuala lumpur",
    "ngos": "ngo",
    "eksyen": "action",
    "wtf": "what the fuck",
    "tf": "the fuck",
    "stfu": "shut the fuck up",
    "idk": "i don't know",
    "dont": "do not",
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "isn't": "is not",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "he's": "he is",
    "she's": "she is",
    "I'm": "I am",
    "that's": "that is",
    "there's": "there is",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is",
    "didn't": "did not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "can't": "can not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "mightn't": "might not",
    "mustn't": "must not",
    "i've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "he'd": "he would",
    "she'd": "she would",
    "i'd": "I would",
    "you'd": "you would",
    "we'd": "we would",
    "they'd": "they would",
    "he'll": "he will",
    "she'll": "she will",
    "i'll": "I will",
    "you'll": "you will",
    "we'll": "we will",
    "they'll": "they will",
    "i'd": "I had",
    "you'd": "you had",
    "we'd": "we had",
    "they'd": "they had",
    "should've": "should have",
    "could've": "could have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    "ought to": "should",
    "need to": "should",
    "gotta": "got to",
    "wanna": "want to",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "aren't": "are not",
    "isn't": "is not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "can't": "cannot",
    "couldn't": "could not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "mightn't": "might not",
    "mustn't": "must not",
    "daren't": "dare not",
    "ain't": "is not",
    "it's": "it is",
    "let's": "let us",
    "that's": "that is",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "how's": "how is",
    "there's": "there is",
    "here's": "here is",
    "smh": "shake my head",
    "fyi": "for your information",
    "imo": "in my opinion",
    "brb": "be right back",
    # Add more short forms/contractions and their expansions as needed
}

stopwords_set.update(additional_stopwords)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mekukun\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already

In [195]:
pol_landscape = pd.read_csv('MalaysiaMadani.csv')

In [196]:
def remove_URL(text):
    return re.sub(r"http\S+", "", text) 

def remove_hashtags(sample):
    return re.sub(r"#\S+", "", sample) 

def remove_breaklines(text):
    return re.sub(r"\n", " ", text)

# remove stopwords from a string
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.lower() not in stopwords_set]
    return ' '.join(filtered_text)

def remove_symbols(text):
    return re.sub(r"[^\w\s]", "", text)

def remove_whitespace(text):
    return re.sub(r"\s+", " ", text.strip())

def expand_shortforms(text):
    words = text.split()
    expanded_words = [short_forms.get(word.lower(), word) for word in words]
    expanded_text = ' '.join(expanded_words)
    return expanded_text

def lemmatize_word(word, pos):
    lemmatizer = WordNetLemmatizer()
    if pos.startswith('J'):
        pos = wordnet.ADJ
    elif pos.startswith('V'):
        pos = wordnet.VERB
    elif pos.startswith('N'):
        pos = wordnet.NOUN
    elif pos.startswith('R'):
        pos = wordnet.ADV
    else:
        pos = wordnet.NOUN  # Default to noun if the part of speech is not recognized
    
    return lemmatizer.lemmatize(word, pos=pos)

def convert_date(date_str):
    """
    Converts the given date string into the desired format.

    Parameters:
    - date_str (str): The input date string to be converted.

    Returns:
    - str: The converted date string in the format "DD-MM-YYYY".

    Steps:
    1. Get the current datetime.
    2. If the length of the date string is greater than 3 and it contains a comma:
        a. Try to parse the date string with the format "%b %d, %Y".
        b. If successful, format the parsed date as "DD-MM-YYYY" and return it.
        c. If parsing fails, return the current date formatted as "DD-MM-YYYY".
    3. If the date string ends with 'h', 'm', or 's':
        a. Return the current date formatted as "DD-MM-YYYY".
    4. If the length of the date string is 10 and it has '-' at positions 2 and 5:
        a. Return the date string as it is without any changes.
    5. Otherwise, try to parse the date string with the format "%b %d".
        a. If successful, format the parsed date as "DD-MM-YYYY" and return it.
        b. If parsing fails, return the current date formatted as "DD-MM-YYYY".
    """
    now = datetime.now()
    if len(date_str) > 3 and ',' in date_str:
        try:
            date = datetime.strptime(date_str, "%b %d, %Y")
            return date.strftime("%d-%m-%Y")
        except ValueError:
            return now.strftime("%d-%m-%Y")
    elif date_str.endswith('h') or date_str.endswith('m') or date_str.endswith('s'):
        return now.strftime("%d-%m-%Y")
    elif len(date_str) == 10 and date_str[2] == '-' and date_str[5] == '-':
        return date_str
    else:
        try:
            date = datetime.strptime(date_str, "%b %d")
            return date.strftime("%d-%m-2023")
        except ValueError:
            return now.strftime("%d-%m-%Y")
    

def processText(text):
    # out = lowercase(text)
    out = remove_URL(text)
    out = remove_hashtags(out)
    out = expand_shortforms(out)
    out = remove_stopwords(out)
    out = remove_breaklines(out)
    out = remove_symbols(out)
    out = remove_whitespace(out)
    return out

def lemmatize_word(word, pos):
    lemmatizer = WordNetLemmatizer()
    if pos.startswith('J'):
        pos = wordnet.ADJ
    elif pos.startswith('V'):
        pos = wordnet.VERB
    elif pos.startswith('N'):
        pos = wordnet.NOUN
    elif pos.startswith('R'):
        pos = wordnet.ADV
    else:
        pos = wordnet.NOUN  # Default to noun if the part of speech is not recognized
    
    return lemmatizer.lemmatize(word, pos=pos)

In [197]:
pol_landscape['Post'] = pol_landscape['Post'].apply(processText)
pol_landscape['Post'] = pol_landscape['Post'].apply(lambda x: ' '.join([lemmatize_word(word, pos) for word, pos in nltk.pos_tag(nltk.word_tokenize(x))]))
try:
    # Apply the conversion function to the "Date Posted" column
    pol_landscape['Date Posted'] = pol_landscape['Date Posted'].apply(convert_date)
    pol_landscape['Date Posted'] = pd.to_datetime(pol_landscape['Date Posted'], format='%d-%m-%Y')
except TypeError:
    pass

pol_landscape

Unnamed: 0,Twitter Username,Post,Date Posted
0,@14zdec3,russiahas iran irak stans pakistan india mongo...,2022-02-26
1,@16Lukkk,Belanjawan Malaysia Madani frankly ask T20 fee...,2023-02-24
2,@21badiuzzaman,Malaysia Madani word Madani come Arab see anyo...,2023-01-25
3,@4n4lisis,One problem lack tie Guaidó Middle East Asia M...,2019-12-07
4,@5M30W,Anyway since tweet date cost blow kerajaan mad...,2023-05-31
...,...,...,...
829,@yk2maya,masjid suppose center community member communi...,2023-03-16
830,@yujingwong,Next question take Mr Mohamd Madani CEO Malays...,2013-08-24
831,@yusyiyusoff,suggest mengadu twitter Prime Minister 10 kesa...,2023-01-27
832,@ziadrazak,frustration strike core today s Malaysia Madan...,2023-03-24


In [198]:
del_max_token = 5
pol_landscape['post_split'] = pol_landscape['Post'].apply(lambda x: x.split(" ") if isinstance(x, str) else x)
pol_landscape['tokens_num'] = pol_landscape['post_split'].apply(lambda x: len(x))
pol_landscape[pol_landscape['tokens_num'] <= del_max_token]

Unnamed: 0,Twitter Username,Post,Date Posted,post_split,tokens_num
7,@Aasril_53,Good Morning Malaysia Madani,2023-01-24,"[Good, Morning, Malaysia, Madani]",4
10,@AbdMydin,Exactly need push,2023-04-08,"[Exactly, need, push]",3
17,@AbdMydin,right doctor right physiotherapist key,2023-04-13,"[right, doctor, right, physiotherapist, key]",5
27,@AinnAzreena,BUS PLAYING MALAYSIA MADANI BUNKFACE,2023-04-10,"[BUS, PLAYING, MALAYSIA, MADANI, BUNKFACE]",5
36,@AlifBorhan_,way Malaysia Madani,2023-04-03,"[way, Malaysia, Madani]",3
...,...,...,...,...,...
789,@themystartup,Stay tune,2023-03-01,"[Stay, tune]",2
793,@tinadhillon28,happy PH proud Selangor,2023-06-21,"[happy, PH, proud, Selangor]",4
813,@wasabi1388,welcome Malaysia madani,2023-01-23,"[welcome, Malaysia, madani]",3
816,@wendyg9918,Best PM,2023-01-20,"[Best, PM]",2


In [199]:
pol_landscape.drop(pol_landscape[pol_landscape['tokens_num'] <= del_max_token].index, inplace=True)
pol_landscape.reset_index(drop=True, inplace=True)
pol_landscape

Unnamed: 0,Twitter Username,Post,Date Posted,post_split,tokens_num
0,@14zdec3,russiahas iran irak stans pakistan india mongo...,2022-02-26,"[russiahas, iran, irak, stans, pakistan, india...",14
1,@16Lukkk,Belanjawan Malaysia Madani frankly ask T20 fee...,2023-02-24,"[Belanjawan, Malaysia, Madani, frankly, ask, T...",11
2,@21badiuzzaman,Malaysia Madani word Madani come Arab see anyo...,2023-01-25,"[Malaysia, Madani, word, Madani, come, Arab, s...",23
3,@4n4lisis,One problem lack tie Guaidó Middle East Asia M...,2019-12-07,"[One, problem, lack, tie, Guaidó, Middle, East...",25
4,@5M30W,Anyway since tweet date cost blow kerajaan mad...,2023-05-31,"[Anyway, since, tweet, date, cost, blow, keraj...",23
...,...,...,...,...,...
712,@yk2maya,masjid suppose center community member communi...,2023-03-16,"[masjid, suppose, center, community, member, c...",19
713,@yujingwong,Next question take Mr Mohamd Madani CEO Malays...,2013-08-24,"[Next, question, take, Mr, Mohamd, Madani, CEO...",10
714,@yusyiyusoff,suggest mengadu twitter Prime Minister 10 kesa...,2023-01-27,"[suggest, mengadu, twitter, Prime, Minister, 1...",21
715,@ziadrazak,frustration strike core today s Malaysia Madan...,2023-03-24,"[frustration, strike, core, today, s, Malaysia...",19


##### **Base Model**

In [200]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore


In [201]:
# Create a id2word dictionary
id2word = Dictionary(pol_landscape['post_split'])
print(len(id2word))

4874


In [202]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

1623


In [203]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in pol_landscape['post_split']]

In [204]:
# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

In [None]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

In [None]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [None]:
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
Malaysia Madani Anwar s Minister like Prime government also MADANI

------ Topic 1 ------
Madani Malaysia malaysia MADANI go make government anwaribrahim Kerajaan child

------ Topic 2 ------
2023 Minister Malaysia Madani Prime Budget anwaribrahim madani must allocate

------ Topic 3 ------
Malaysia Madani good u PH state time work s Salam

------ Topic 4 ------
Malaysia madani Madani Salam Anwar still 2023 Bonjour PM day

------ Topic 5 ------
Malaysia Madani Minister Prime 10 Kerajaan good u concept plan

------ Topic 6 ------
Minister Malaysia Prime Madani anwaribrahim madani government way work 10

------ Topic 7 ------
Malaysia Madani make Kerajaan time government Minister MADANI like kerajaan

------ Topic 8 ------
Malaysia Madani MADANI government s need come Kerajaan people 2

------ Topic 9 ------
Madani Malaysia get please Kerajaan need s well money take



##### **Model Perplexity and Topic Coherence**

> Provide a convenient measure to judge how good a given topic model is.

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=pol_landscape['post_split'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -7.4412887736035565

Coherence Score:  0.3182637261121242


##### **Topic Distance Visualization**

In [None]:
import pyLDAvis
import pyLDAvis.gensim

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

#### **Hyperparameter Tuning**

##### **Grid Search**

> The most important tuning parameter for LDA models is n_components (number of topics). In addition, we are going to search learning_decay (which controls the learning rate) as well. Besides these, other possible search params could be learning_offset (down weight early iterations. Should be > 1) and max_iter. These could be worth experimenting if you have enough time and computing resources.

> Be warned, the grid search constructs multiple LDA models for all possible combinations of param values in the param_grid dict. So, this process can consume a lot of time and resources.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Make tokens a string again
pol_landscape['post_split_back_to_text'] = [' '.join(map(str, l)) for l in pol_landscape['post_split']]

In [None]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(pol_landscape['post_split_back_to_text'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV


In [None]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
              n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

In [None]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -36752.97939682695
Model Perplexity:  3750.4593153462183


In [None]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of tipics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic    

In [None]:
# Can take a long time to run.
model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=pol_landscape['post_split'],
                                                        start=2, limit=200, step=6)

KeyboardInterrupt: 

##### **Optimum Number of Topics**