In [45]:
!pip install keybert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
import pandas as pd
import networkx as nx
import csv
from keybert import KeyBERT

In [2]:
df = pd.read_csv('/content/topicModelingData.csv', error_bad_lines=False, engine="python")

Skipping line 122433: unexpected end of data


In [47]:
# get important words
vaccine_terms = 'shot|vaccine|vacine|vacines|vaccines|vaccinate|vaccination|vaccinations|vaccinated|vaccinating|vaxxed|vaxx|vax|\
unvaccinated|unvaxxed|antivaxx|antivaccination|anti|\
moderna|pfizer|j&j|immune|immunize|immunizes|immunized|immunization|immunizations|covid-19|covid|covidvaccination|covid19|covid19vaccination'
vaccine_terms_list = vaccine_terms.split('|')

for i in range(20):
    corpus = list(df[df['community'] == i]['text'])
    doc = ' '.join(corpus)
    querywords = doc.split()
    resultwords  = [word for word in querywords if word.lower() not in vaccine_terms_list]
    result = ' '.join(resultwords)

    kw_model = KeyBERT()
    keywords = kw_model.extract_keywords(result)
    print(kw_model.extract_keywords(result, keyphrase_ngram_range=(1, 1), stop_words=None))

[('vaccine', 0.4391), ('vaccinated', 0.4285), ('vaccines', 0.4239), ('vaccinations', 0.4228), ('vaccination', 0.4129)]
[('covidvaccination', 0.5707), ('covid', 0.5065), ('covidfl', 0.4961), ('covid19tracking', 0.4924), ('vaccinehesitancy', 0.4855)]
[('covidvaccination', 0.4614), ('precautions', 0.4281), ('vaccinationrate', 0.4063), ('vaccineequity', 0.4014), ('vaccinessavelives', 0.4008)]
[('covidvaccination', 0.5004), ('covid19vaccines', 0.4498), ('covid19ncancer', 0.4432), ('vaccinesforall', 0.4418), ('covid19bc', 0.439)]
[('foxnews', 0.4143), ('facebook', 0.3762), ('cbsnews', 0.3566), ('news', 0.3401), ('reporting', 0.3225)]
[('covid19vaccineupdates', 0.499), ('covid19vaccines', 0.4338), ('vaccineupdates', 0.4196), ('coronavirusupdate', 0.4037), ('coronavirusupdates', 0.4027)]
[('hospitalizations', 0.4673), ('hospitalization', 0.4352), ('hospitalised', 0.4109), ('hospitalisation', 0.4105), ('vaccinehesitancy', 0.4074)]
[('getvaccinated', 0.422), ('vaccine', 0.3994), ('vaccination', 

In [20]:
# LDA model
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
stemmer = SnowballStemmer('english')
# nltk.download('wordnet')
# !pip install PyStemmer

In [21]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [22]:
processed_docs = df['text'].map(preprocess)
processed_docs[:10]

0    [ottawa, hospit, queensway, carleton, hospit, ...
1    [facebook, say, post, cast, doubt, covid, vacc...
2    [vaccin, peopl, appear, get, coronavirus, surp...
3    [sweet, republican, booster, covid, shoot, tod...
4                [beccaturmo, anti, https, hktaeylseq]
5    [univers, virginia, disenrol, student, compli,...
6    [scientist, blast, biden, administr, push, chi...
7    [india, report, covid, case, recoveri, death, ...
8    [look, edg, trumpism, trump, recommend, covid,...
9    [month, warn, ivermectin, ineffect, danger, cu...
Name: text, dtype: object

In [23]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 carleton
1 covid
2 hospit
3 https
4 make
5 mandatori
6 montfort
7 ottawa
8 queensway
9 staff
10 tsxs


In [24]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [25]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [26]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.3373876197807154),
 (1, 0.5283198025765363),
 (2, 0.17545239503969073),
 (3, 0.21109818096069147),
 (4, 0.3373876197807154),
 (5, 0.31801676794533873),
 (6, 0.3373876197807154),
 (7, 0.22460004577636714),
 (8, 0.337807218133774),
 (9, 0.1958249765762748)]


In [28]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [29]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.058*"help" + 0.047*"confid" + 0.043*"mileston" + 0.042*"month" + 0.034*"emerg" + 0.032*"public" + 0.029*"tywbnlfj" + 0.024*"know" + 0.021*"author" + 0.019*"shoot"
Topic: 1 
Words: 0.049*"lift" + 0.043*"unvaccin" + 0.040*"peopl" + 0.023*"opportun" + 0.022*"fulli" + 0.019*"mandat" + 0.017*"govern" + 0.016*"constitut" + 0.016*"nation" + 0.015*"certif"
Topic: 2 
Words: 0.453*"today" + 0.055*"life" + 0.038*"booster" + 0.029*"shoot" + 0.022*"gladi" + 0.022*"berejiklian" + 0.019*"elig" + 0.016*"polit" + 0.016*"australia" + 0.016*"popul"
Topic: 3 
Words: 0.040*"caus" + 0.040*"long" + 0.031*"term" + 0.028*"review" + 0.023*"great" + 0.022*"cell" + 0.020*"children" + 0.020*"open" + 0.020*"start" + 0.019*"late"
Topic: 4 
Words: 0.100*"tweet" + 0.056*"american" + 0.038*"peopl" + 0.034*"old" + 0.031*"research" + 0.023*"highest" + 0.019*"flapol" + 0.018*"citizen" + 0.018*"afghanistan" + 0.018*"glad"
Topic: 5 
Words: 0.020*"young" + 0.020*"get" + 0.020*"appoint" + 0.020*"children" +

In [30]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.114*"pfizer" + 0.104*"grant" + 0.092*"approv" + 0.090*"break" + 0.069*"teacher" + 0.037*"aim" + 0.036*"monday" + 0.032*"educ" + 0.031*"declin" + 0.030*"fail"
Topic: 1 Word: 0.028*"opportun" + 0.027*"vxsxuitjow" + 0.025*"fulli" + 0.023*"major" + 0.022*"encourag" + 0.018*"mandat" + 0.016*"mean" + 0.015*"certif" + 0.014*"pfizer" + 0.012*"get"
Topic: 2 Word: 0.045*"boo" + 0.045*"trump" + 0.043*"crowd" + 0.042*"ralli" + 0.042*"alabama" + 0.039*"donald" + 0.033*"folk" + 0.033*"recommend" + 0.032*"board" + 0.032*"trumpism"
Topic: 3 Word: 0.028*"moderna" + 0.024*"liter" + 0.023*"financi" + 0.023*"qinksg" + 0.023*"answer" + 0.023*"trial" + 0.023*"ahead" + 0.023*"pharma" + 0.023*"complet" + 0.022*"shot"
Topic: 4 Word: 0.074*"offici" + 0.046*"process" + 0.044*"huge" + 0.037*"grant" + 0.035*"approv" + 0.028*"news" + 0.028*"author" + 0.027*"feder" + 0.021*"emerg" + 0.021*"make"
Topic: 5 Word: 0.110*"ioqsxxv" + 0.103*"older" + 0.103*"individu" + 0.101*"diseas" + 0.095*"prevent" + 0.

In [32]:
# preprocess version 2
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

processed_docs2 = df['text'].map(preprocess)
processed_docs2[:10]

0    [ottawa, hospital, queensway, carleton, hospit...
1    [facebook, says, post, cast, doubt, covid, vac...
2    [vaccinated, people, appear, getting, coronavi...
3    [sweet, republican, booster, covid, shot, toda...
4                [beccaturmo, anti, https, hktaeylseq]
5    [university, virginia, disenrolls, students, c...
6    [scientists, blasted, biden, administration, p...
7    [india, reports, covid, cases, recoveries, dea...
8    [looking, edge, trumpism, trump, recommends, c...
Name: text, dtype: object

In [34]:
dictionary2 = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary2.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 carleton
1 covid
2 hospit
3 https
4 make
5 mandatori
6 montfort
7 ottawa
8 queensway
9 staff
10 tsxs


In [35]:
dictionary2.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus2 = [dictionary2.doc2bow(doc) for doc in processed_docs2]

In [36]:
from gensim import corpora, models
tfidf2 = models.TfidfModel(bow_corpus2)
corpus_tfidf2 = tfidf2[bow_corpus2]
from pprint import pprint
for doc in corpus_tfidf2:
    pprint(doc)
    break

[(0, 0.41678611795725035),
 (4, 0.41678611795725035),
 (5, 0.39285666214841214),
 (6, 0.41678611795725035),
 (7, 0.27816499524810273),
 (8, 0.41730446172097885),
 (9, 0.2702089197017199)]


In [37]:
lda_model2 = gensim.models.LdaMulticore(bow_corpus2, num_topics=20, id2word=dictionary2, passes=2, workers=2)

In [38]:
for idx, topic in lda_model2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.416*"today" + 0.355*"ioqsxxv" + 0.063*"life" + 0.030*"booster" + 0.015*"shot" + 0.012*"sweet" + 0.011*"republican" + 0.006*"save" + 0.004*"letter" + 0.003*"gold"
Topic: 1 
Words: 0.087*"moderna" + 0.068*"plan" + 0.067*"ahead" + 0.065*"pharma" + 0.062*"qinksg" + 0.042*"scoop" + 0.031*"review" + 0.026*"remain" + 0.024*"good" + 0.024*"state"
Topic: 2 
Words: 0.180*"process" + 0.109*"chief" + 0.038*"huge" + 0.033*"open" + 0.022*"clinic" + 0.022*"sunday" + 0.022*"time" + 0.021*"john" + 0.020*"august" + 0.020*"walk"
Topic: 3 
Words: 0.396*"pfizer" + 0.086*"biontech" + 0.075*"news" + 0.054*"familiar" + 0.053*"huge" + 0.033*"public" + 0.027*"lift" + 0.026*"help" + 0.025*"tywbnlfj" + 0.021*"monday"
Topic: 4 
Words: 0.203*"florida" + 0.050*"virus" + 0.050*"record" + 0.046*"come" + 0.039*"span" + 0.039*"blue" + 0.035*"state" + 0.029*"host" + 0.029*"week" + 0.029*"radio"
Topic: 5 
Words: 0.126*"yesterday" + 0.115*"meet" + 0.078*"storm" + 0.059*"late" + 0.055*"local" + 0.045*"act

In [43]:
for idx, topic in lda_model2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, re.findall('"([^"]*)"', topic)))

Topic: 0 
Words: ['today', 'ioqsxxv', 'life', 'booster', 'shot', 'sweet', 'republican', 'save', 'letter', 'gold']
Topic: 1 
Words: ['moderna', 'plan', 'ahead', 'pharma', 'qinksg', 'scoop', 'review', 'remain', 'good', 'state']
Topic: 2 
Words: ['process', 'chief', 'huge', 'open', 'clinic', 'sunday', 'time', 'john', 'august', 'walk']
Topic: 3 
Words: ['pfizer', 'biontech', 'news', 'familiar', 'huge', 'public', 'lift', 'help', 'tywbnlfj', 'monday']
Topic: 4 
Words: ['florida', 'virus', 'record', 'come', 'span', 'blue', 'state', 'host', 'week', 'radio']
Topic: 5 
Words: ['yesterday', 'meet', 'storm', 'late', 'local', 'actor', 'night', 'penn', 'sean', 'face']
Topic: 6 
Words: ['older', 'shot', 'dead', 'great', 'health', 'council', 'public', 'close', 'household', 'half']
Topic: 7 
Words: ['need', 'delta', 'mask', 'variant', 'wear', 'school', 'path', 'shelter', 'social', 'forget']
Topic: 8 
Words: ['year', 'normal', 'status', 'breakthrough', 'news', 'sajid', 'virus', 'start', 'adult', 'video'

In [39]:
lda_model_tfidf2 = gensim.models.LdaMulticore(corpus_tfidf2, num_topics=20, id2word=dictionary2, passes=2, workers=4)
for idx, topic in lda_model_tfidf2.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.031*"forget" + 0.031*"breakthrough" + 0.023*"facebook" + 0.022*"post" + 0.021*"delta" + 0.020*"virus" + 0.019*"march" + 0.019*"platform" + 0.017*"storm" + 0.017*"control"
Topic: 1 Word: 0.072*"moderna" + 0.061*"plan" + 0.060*"qinksg" + 0.058*"pharma" + 0.058*"ahead" + 0.058*"chief" + 0.026*"treatment" + 0.017*"visit" + 0.014*"rzgx" + 0.014*"ytgs"
Topic: 2 Word: 0.045*"fact" + 0.036*"high" + 0.034*"rate" + 0.032*"thought" + 0.030*"concern" + 0.029*"appear" + 0.026*"ayryynmipm" + 0.025*"blast" + 0.021*"test" + 0.020*"proof"
Topic: 3 Word: 0.069*"help" + 0.059*"public" + 0.036*"tywbnlfj" + 0.027*"pfizer" + 0.025*"shot" + 0.023*"area" + 0.018*"johannesburg" + 0.018*"connect" + 0.018*"tgbp" + 0.018*"bvnxf"
Topic: 4 Word: 0.238*"ioqsxxv" + 0.221*"older" + 0.211*"today" + 0.021*"booster" + 0.020*"york" + 0.016*"sweet" + 0.015*"life" + 0.014*"republican" + 0.011*"shot" + 0.005*"unvax"
Topic: 5 Word: 0.076*"tune" + 0.068*"virus" + 0.055*"cmcqrwal" + 0.050*"radio" + 0.049*"host"

In [41]:
for idx, topic in lda_model_tfidf2.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, re.findall('"([^"]*)"', topic)))

Topic: 0 Word: ['forget', 'breakthrough', 'facebook', 'post', 'delta', 'virus', 'march', 'platform', 'storm', 'control']
Topic: 1 Word: ['moderna', 'plan', 'qinksg', 'pharma', 'ahead', 'chief', 'treatment', 'visit', 'rzgx', 'ytgs']
Topic: 2 Word: ['fact', 'high', 'rate', 'thought', 'concern', 'appear', 'ayryynmipm', 'blast', 'test', 'proof']
Topic: 3 Word: ['help', 'public', 'tywbnlfj', 'pfizer', 'shot', 'area', 'johannesburg', 'connect', 'tgbp', 'bvnxf']
Topic: 4 Word: ['ioqsxxv', 'older', 'today', 'booster', 'york', 'sweet', 'life', 'republican', 'shot', 'unvax']
Topic: 5 Word: ['tune', 'virus', 'cmcqrwal', 'radio', 'host', 'phil', 'wonder', 'kaedaf', 'victoria', 'ontario']
Topic: 6 Word: ['mean', 'death', 'right', 'think', 'want', 'vxsxuitjow', 'corona', 'women', 'rager', 'ngre']
Topic: 7 Word: ['biontech', 'pfizer', 'shot', 'dead', 'council', 'coronavirus', 'australian', 'irwvdkfdpa', 'aclohkh', 'xvgavx']
Topic: 8 Word: ['pfizer', 'scoop', 'bqyq', 'okyp', 'door', 'subject', 'major'