# Topic modeling on local data (Kenya) using LatentDirichletAllocation from sklearn

## Import Libraries

In [None]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

## Import Data

In [None]:
## Mount drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
twitter_data = pd.read_csv('/content/drive/MyDrive/Module 3/Datasets/Location Trend Tweets 2022-03-31.csv')
twitter_data

Unnamed: 0,screen_name,hashtag,tweet,time_stamp
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,2022-03-31 08:47:01+00:00
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",2022-03-31 08:47:00+00:00
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,2022-03-31 08:47:00+00:00
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,2022-03-31 08:46:59+00:00
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,2022-03-31 08:46:59+00:00
...,...,...,...,...
2494,abdiazizhashim1,Mighty Diamonds,The BBI Susan Kihika Sonko Junet Odingas Ledam...,2022-03-31 08:30:00+00:00
2495,exclusiveska,Mighty Diamonds,RT @BigshipSounds: The Mighty Diamonds 🔥🔥 http...,2022-03-31 08:29:55+00:00
2496,Breasman1,Mighty Diamonds,RT @VPRecords: Devastated to hear of the passi...,2022-03-31 08:26:54+00:00
2497,royalrampnews,Mighty Diamonds,MIGHTY DIAMONDS Singer Shot &amp; Killed https...,2022-03-31 08:25:20+00:00


## Clean Data

In [None]:
def text_cleaner (text):
  text = re.sub(r'@[A-Za-z0-9]+','',text) ## remove @ mentions
  text = re.sub(r'#','',text) ## remove # symbol
  text = re.sub(r'^RT+','',text) ## remove RT
  text = re.sub(r'https?:\/\/\S+','',text) ## remove hyperlink
  text = re.sub(r'[^\w\s]','',text) ## remove everything apart from words and space
  text = re.sub(r'_',' ',text) ## remove underscore
  text = re.sub(r'\n',' ',text) ## remove \n

  return text

In [None]:
## Create clean text column

twitter_data['cleaned_tweet'] = twitter_data['tweet'].apply(text_cleaner)
twitter_data = twitter_data[['screen_name','hashtag','tweet','cleaned_tweet','time_stamp']]
twitter_data

Unnamed: 0,screen_name,hashtag,tweet,cleaned_tweet,time_stamp
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,Kinuthia IEBC was legally constituted when i...,2022-03-31 08:47:01+00:00
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",KENYABBIFinalVerdict If the Supreme Court r...,2022-03-31 08:47:00+00:00
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,Summary of CJ Martha Koomes verdict i On ba...,2022-03-31 08:47:00+00:00
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,CJ Koome I endorse the findings of the two s...,2022-03-31 08:46:59+00:00
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,Supreme court Judge William Ouko has acted the...,2022-03-31 08:46:59+00:00
...,...,...,...,...,...
2494,abdiazizhashim1,Mighty Diamonds,The BBI Susan Kihika Sonko Junet Odingas Ledam...,The BBI Susan Kihika Sonko Junet Odingas Ledam...,2022-03-31 08:30:00+00:00
2495,exclusiveska,Mighty Diamonds,RT @BigshipSounds: The Mighty Diamonds 🔥🔥 http...,The Mighty Diamonds,2022-03-31 08:29:55+00:00
2496,Breasman1,Mighty Diamonds,RT @VPRecords: Devastated to hear of the passi...,Devastated to hear of the passing of Tabby D...,2022-03-31 08:26:54+00:00
2497,royalrampnews,Mighty Diamonds,MIGHTY DIAMONDS Singer Shot &amp; Killed https...,MIGHTY DIAMONDS Singer Shot amp Killed RIP Ta...,2022-03-31 08:25:20+00:00


In [None]:
twitter_data['cleaned_tweet'][0]

'  Kinuthia IEBC was legally constituted when it undertook the verification of the Signatures for the BBI Amendment Bill  CJ M'

## Pre-Processing

In [None]:
## create vocabulary

cv = CountVectorizer(max_df=0.9,min_df=5,stop_words='english')

In [None]:
## Document term matrix
dtm = cv.fit_transform(twitter_data['cleaned_tweet'])
dtm

<2499x990 sparse matrix of type '<class 'numpy.int64'>'
	with 20513 stored elements in Compressed Sparse Row format>

## Modeling

In [None]:
rand_topics = 3

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=rand_topics,random_state=42)

LDA.fit(dtm)

LatentDirichletAllocation(n_components=3, random_state=42)

In [None]:
## Grab vocabulay
print(len(cv.get_feature_names()))
cv.get_feature_names()[510:520]


990




['kihika',
 'kileleshwa',
 'killed',
 'kindly',
 'kinuthia',
 'kinyanjui',
 'kiosk',
 'klopp',
 'knew',
 'know']

In [None]:
## Grab Topics

print(LDA.components_.shape)

(3, 990)


In [None]:
## Sample 1 topic

single_topic = LDA.components_[0]

## Order word distribution in topic 0 and get top ten words

topic_words = 30

top_words = single_topic.argsort()[-topic_words:]

for index in top_words:

  print(cv.get_feature_names()[index])

sports
na
april
mpesa
wanjiku
doctrine
fifa
networks
cameroon
kenya
algeria
uhuru
bring
nakuru
popular
bbifinalverdict
justice
like
structure
basic
match
constitution
initiative
supreme
president
martha
court
koome
cj
bbi




In [None]:
word_count = 15

for i,topic in enumerate(LDA.components_):
  print("The top  {word_count} word for topic # {i} is:".format(word_count=word_count,i=i))
  print([cv.get_feature_names()[index] for index in topic.argsort()[-word_count:]])
  print('\n')
  print('\n')

The top  15 word for topic # 0 is:
['bbifinalverdict', 'justice', 'like', 'structure', 'basic', 'match', 'constitution', 'initiative', 'supreme', 'president', 'martha', 'court', 'koome', 'cj', 'bbi']




The top  15 word for topic # 1 is:
['university', 'kileleshwa', 'uhuruto', 'odingas', 'diamonds', 'mighty', 'daily', 'sonko', 'kuria', 'nation', 'junet', 'bbi', 'kihika', 'susan', 'ledama']




The top  15 word for topic # 2 is:
['legally', 'junior', 'mendy', 'amendment', 'new', 'attorney', 'kcpe', 'people', 'iebc', 'bruce', 'willis', 'time', 'amp', 'hon', 'bbi']








In [None]:
## Attach topics to original dataset
dtm

<2499x990 sparse matrix of type '<class 'numpy.int64'>'
	with 20513 stored elements in Compressed Sparse Row format>

In [None]:
topic_results = LDA.transform(dtm)

topic_results.shape

(2499, 3)

In [None]:
topic_results[0].round(2)

array([0.04, 0.03, 0.93])

In [None]:
twitter_data['cleaned_tweet'][0]

'  Kinuthia IEBC was legally constituted when it undertook the verification of the Signatures for the BBI Amendment Bill  CJ M'

In [None]:
twitter_data['topic'] = topic_results.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
twitter_data.head()

Unnamed: 0,screen_name,hashtag,tweet,cleaned_tweet,time_stamp,topic
0,EliasKabere,#BBIFinalVerdict,RT @Belive_Kinuthia: “IEBC was legally constit...,Kinuthia IEBC was legally constituted when i...,2022-03-31 08:47:01+00:00,2
1,Channel54News,#BBIFinalVerdict,"KENYA:#BBIFinalVerdict \n\n"" If the Supreme Co...",KENYABBIFinalVerdict If the Supreme Court r...,2022-03-31 08:47:00+00:00,0
2,KoneMoheavy,#BBIFinalVerdict,RT @BravinYuri: Summary of CJ Martha Koome's v...,Summary of CJ Martha Koomes verdict i On ba...,2022-03-31 08:47:00+00:00,0
3,GodfearingDude,#BBIFinalVerdict,RT @ntvkenya: CJ Koome: I endorse the findings...,CJ Koome I endorse the findings of the two s...,2022-03-31 08:46:59+00:00,0
4,godwin_sakaya,#BBIFinalVerdict,#Supreme court Judge William Ouko has acted th...,Supreme court Judge William Ouko has acted the...,2022-03-31 08:46:59+00:00,0


In [None]:
twitter_data['topic'].value_counts(normalize=True)

0    0.383353
2    0.331333
1    0.285314
Name: topic, dtype: float64

## Visualizing the models with pyLDAvis

In [None]:
pyLDAvis.sklearn.prepare(LDA, dtm, cv)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
