# Topic modeling on local data (Kenya) using LdaModel from gensim

## Install pyLDAvis

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 23.8 MB/s eta 0:00:01[K     |▍                               | 20 kB 23.9 MB/s eta 0:00:01[K     |▋                               | 30 kB 26.8 MB/s eta 0:00:01[K     |▉                               | 40 kB 20.3 MB/s eta 0:00:01[K     |█                               | 51 kB 15.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 17.4 MB/s eta 0:00:01[K     |█▍                              | 71 kB 18.3 MB/s eta 0:00:01[K     |█▋                              | 81 kB 19.6 MB/s eta 0:00:01[K     |█▉                              | 92 kB 21.3 MB/s eta 0:00:01[K     |██                              | 102 kB 18.8 MB/s eta 0:00:01[K     |██▏                             | 112 kB 18.8 MB/s eta 0:00:01[K     |██▍                             | 122 kB 18.8 MB/s eta 0:00:01[K     |██▋                             | 133 kB 18.8 MB/s eta 0:00:01

## Import Libraries

In [None]:
import html
import re
import string
import warnings
from itertools import chain

import gensim
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

warnings.simplefilter('ignore')

In [None]:
import nltk

# Corpora needed further down: the English stop-word list and WordNet for the
# lemmatizer. 'omw-1.4' is fetched as well because some NLTK releases need it
# before the WordNet lemmatizer can be used (harmless where it is not needed).
for nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Mount drive

In [None]:


from google.colab import drive

# Directory under which Google Drive is exposed inside the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Import Data

In [None]:
# Location of the trending-tweets export on the mounted Drive.
TWEETS_CSV_PATH = '/content/drive/MyDrive/Module 3/Datasets/Location Trend Tweets 2022-03-31.csv'

twitter_data = pd.read_csv(TWEETS_CSV_PATH)
twitter_data

Unnamed: 0,screen_name,hashtag,tweet,time_stamp
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,2022-03-31 08:47:01+00:00
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",2022-03-31 08:47:00+00:00
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,2022-03-31 08:47:00+00:00
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,2022-03-31 08:46:59+00:00
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,2022-03-31 08:46:59+00:00
...,...,...,...,...
2494,abdiazizhashim1,Mighty Diamonds,The BBI Susan Kihika Sonko Junet Odingas Ledam...,2022-03-31 08:30:00+00:00
2495,exclusiveska,Mighty Diamonds,RT @BigshipSounds: The Mighty Diamonds 🔥🔥 http...,2022-03-31 08:29:55+00:00
2496,Breasman1,Mighty Diamonds,RT @VPRecords: Devastated to hear of the passi...,2022-03-31 08:26:54+00:00
2497,royalrampnews,Mighty Diamonds,MIGHTY DIAMONDS Singer Shot &amp; Killed https...,2022-03-31 08:25:20+00:00


## Clean Data

In [None]:
def text_cleaner(text):
    """Strip Twitter-specific noise from a raw tweet.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``) so they do not survive as
         stray tokens such as ``amp``;
      2. drop hyperlinks;
      3. drop @mentions, including handles that contain underscores;
      4. turn ``#`` into a separator so a hashtag stays its own word;
      5. drop a leading retweet marker ``RT`` (whole word only);
      6. drop every remaining character that is not a word character or
         whitespace, then turn underscores into spaces;
      7. collapse runs of whitespace (newlines included) and trim the ends.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for an
        empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet; original letter case is preserved.
    """
    if not isinstance(text, str):
        return ''

    text = html.unescape(text)                   # &amp; -> &, &lt; -> <, ...
    text = re.sub(r'https?:\/\/\S+', ' ', text)  # remove hyperlinks
    text = re.sub(r'@\w+', ' ', text)            # remove @mentions (\w covers '_')
    text = re.sub(r'#', ' ', text)               # '#' becomes a word boundary
    # \b keeps ordinary words that merely start with "RT" (e.g. "RTGS") intact.
    text = re.sub(r'^\s*RT\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)          # keep only word chars and whitespace
    text = re.sub(r'_', ' ', text)               # underscores count as \w, split on them
    return ' '.join(text.split())                # collapse whitespace / newlines

In [None]:
## Create clean text column

# Run the regex cleaner over every raw tweet, then keep only the columns used
# downstream, with the cleaned text placed right after the original tweet.
cleaned_column = twitter_data['tweet'].apply(text_cleaner)
columns_after_cleaning = ['screen_name','hashtag','tweet','cleaned_tweet','time_stamp']
twitter_data = twitter_data.assign(cleaned_tweet=cleaned_column)[columns_after_cleaning]
twitter_data

Unnamed: 0,screen_name,hashtag,tweet,cleaned_tweet,time_stamp
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,Kinuthia IEBC was legally constituted when i...,2022-03-31 08:47:01+00:00
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",KENYABBIFinalVerdict If the Supreme Court r...,2022-03-31 08:47:00+00:00
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,Summary of CJ Martha Koomes verdict i On ba...,2022-03-31 08:47:00+00:00
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,CJ Koome I endorse the findings of the two s...,2022-03-31 08:46:59+00:00
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,Supreme court Judge William Ouko has acted the...,2022-03-31 08:46:59+00:00
...,...,...,...,...,...
2494,abdiazizhashim1,Mighty Diamonds,The BBI Susan Kihika Sonko Junet Odingas Ledam...,The BBI Susan Kihika Sonko Junet Odingas Ledam...,2022-03-31 08:30:00+00:00
2495,exclusiveska,Mighty Diamonds,RT @BigshipSounds: The Mighty Diamonds 🔥🔥 http...,The Mighty Diamonds,2022-03-31 08:29:55+00:00
2496,Breasman1,Mighty Diamonds,RT @VPRecords: Devastated to hear of the passi...,Devastated to hear of the passing of Tabby D...,2022-03-31 08:26:54+00:00
2497,royalrampnews,Mighty Diamonds,MIGHTY DIAMONDS Singer Shot &amp; Killed https...,MIGHTY DIAMONDS Singer Shot amp Killed RIP Ta...,2022-03-31 08:25:20+00:00


In [None]:
## Define Filters
# English stop words to drop from the tweets.
stop = {word for word in stopwords.words('english')}
# Individual ASCII punctuation characters to strip out of tokens.
exclude = {symbol for symbol in string.punctuation}
# WordNet-based lemmatizer used to normalise each remaining token.
lemma = WordNetLemmatizer()

In [None]:
## Define function

def text_cleaner2 (text):
    """Tokenise a cleaned tweet into lemmatised, lower-case words.

    The text is lower-cased and split on whitespace; words found in ``stop``
    are discarded, characters found in ``exclude`` are removed from the
    remaining words, and each surviving word is passed through
    ``lemma.lemmatize``. Words left empty after punctuation removal are dropped.

    Returns a list of tokens.
    """
    kept_words = (word for word in text.lower().split() if word not in stop)
    depunctuated = (
        ''.join(ch for ch in word if ch not in exclude) for word in kept_words
    )
    return [lemma.lemmatize(word) for word in depunctuated if word]

In [None]:
# Tokenise / lemmatise the cleaned text and store the token lists next to it.
token_lists = twitter_data['cleaned_tweet'].apply(text_cleaner2)
columns_after_tokenising = ['screen_name','hashtag','tweet','cleaned_tweet','cleaned_tweet2','time_stamp']
twitter_data = twitter_data.assign(cleaned_tweet2=token_lists)[columns_after_tokenising]
twitter_data

Unnamed: 0,screen_name,hashtag,tweet,cleaned_tweet,cleaned_tweet2,time_stamp
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,Kinuthia IEBC was legally constituted when i...,"[kinuthia, iebc, legally, constituted, underto...",2022-03-31 08:47:01+00:00
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",KENYABBIFinalVerdict If the Supreme Court r...,"[kenyabbifinalverdict, supreme, court, rule, f...",2022-03-31 08:47:00+00:00
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,Summary of CJ Martha Koomes verdict i On ba...,"[summary, cj, martha, koomes, verdict, basic, ...",2022-03-31 08:47:00+00:00
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,CJ Koome I endorse the findings of the two s...,"[cj, koome, endorse, finding, two, superior, c...",2022-03-31 08:46:59+00:00
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,Supreme court Judge William Ouko has acted the...,"[supreme, court, judge, william, ouko, acted, ...",2022-03-31 08:46:59+00:00
...,...,...,...,...,...,...
2494,abdiazizhashim1,Mighty Diamonds,The BBI Susan Kihika Sonko Junet Odingas Ledam...,The BBI Susan Kihika Sonko Junet Odingas Ledam...,"[bbi, susan, kihika, sonko, junet, odingas, le...",2022-03-31 08:30:00+00:00
2495,exclusiveska,Mighty Diamonds,RT @BigshipSounds: The Mighty Diamonds 🔥🔥 http...,The Mighty Diamonds,"[mighty, diamond]",2022-03-31 08:29:55+00:00
2496,Breasman1,Mighty Diamonds,RT @VPRecords: Devastated to hear of the passi...,Devastated to hear of the passing of Tabby D...,"[devastated, hear, passing, tabby, diamond, le...",2022-03-31 08:26:54+00:00
2497,royalrampnews,Mighty Diamonds,MIGHTY DIAMONDS Singer Shot &amp; Killed https...,MIGHTY DIAMONDS Singer Shot amp Killed RIP Ta...,"[mighty, diamond, singer, shot, amp, killed, r...",2022-03-31 08:25:20+00:00


## Create Dictionary from the tweets

In [None]:
# Build the gensim vocabulary (token <-> integer id) from the tokenised tweets.
tokenised_tweets = twitter_data['cleaned_tweet2']
dictionary = corpora.Dictionary(tokenised_tweets)

# num_nnz: total number of non-zeroes in the BOW matrix, i.e. the number of
# unique words per document summed over the entire corpus.
print(dictionary.num_nnz)

28014


## Create document term matrix

In [None]:
# Convert every tokenised tweet into its bag-of-words representation.
doc_term_matrix = list(map(dictionary.doc2bow, twitter_data['cleaned_tweet2']))
print(len(doc_term_matrix))

2499


## Instantiate LDA model

In [None]:
# Short alias for gensim's single-core LDA implementation (class, not instance).
lda = models.LdaModel

## Fit LDA model on the dataset

In [None]:
num_topics=2
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

CPU times: user 22.7 s, sys: 297 ms, total: 23 s
Wall time: 22.7 s


## Print the topics identified by LDA model

In [None]:
# One (topic_id, "weight*word + ...") entry per fitted topic.
topic_summaries = ldamodel.print_topics(num_topics=num_topics)
topic_summaries

[(0,
  '0.015*"bbi" + 0.008*"court" + 0.007*"cj" + 0.007*"world" + 0.007*"koome" + 0.007*"bbifinalverdict" + 0.007*"match" + 0.006*"algeria" + 0.006*"cup" + 0.006*"martha"'),
 (1,
  '0.044*"ledama" + 0.026*"susan" + 0.026*"kihika" + 0.025*"kuria" + 0.024*"diamond" + 0.023*"mighty" + 0.023*"bbi" + 0.023*"junet" + 0.022*"sonko" + 0.022*"nation"')]

## Visualize the LDA model results

In [None]:
# Prepare the interactive pyLDAvis view from the fitted model, the bag-of-words
# corpus and the vocabulary, then render it inline.
lda_display = gensimvis.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dictionary,
    sort_topics=False,
    mds='mmds',
)
pyLDAvis.display(lda_display)

## Find which tweets were assigned to which topic

In [None]:
# Index the trained LDA model with the bag-of-words corpus to assign topics to
# the documents in the corpus.
lda_corpus = ldamodel[doc_term_matrix]

In [None]:
 # Infer the topic distribution of every tweet from its bag-of-words vector.
 # The input must be the BoW corpus itself: passing the model's own output
 # would treat (topic_id, probability) pairs as if they were word counts.
 all_topics = ldamodel.get_document_topics(doc_term_matrix, minimum_probability=0.0)
 # corpus2csc yields a sparse (topics x documents) matrix (CSC despite the
 # variable name); num_terms pins the number of rows to the number of topics
 # even if a topic is absent from every document's sparse output.
 all_topics_csr = gensim.matutils.corpus2csc(all_topics, num_terms=ldamodel.num_topics)
 # Transpose to (documents x topics) and densify.
 all_topics_numpy = all_topics_csr.T.toarray()
 # One row per tweet, one column per topic id.
 all_topics_df = pd.DataFrame(all_topics_numpy)

In [None]:
# For each tweet take the column label with the highest topic probability, then
# translate that label into its positional index among the topic columns.
dominant_topic_labels = all_topics_df.idxmax(axis=1)
twitter_data['topic'] = all_topics_df.columns.get_indexer(dominant_topic_labels)


In [None]:
# Fraction of tweets whose dominant topic is each topic id.
topic_share = twitter_data['topic'].value_counts(normalize=True)
topic_share

0    0.714686
1    0.285314
Name: topic, dtype: float64