In [9]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# stuff for LDA topic modeling of tweets
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.stem.porter import *

In [None]:
# Load the tweet export. The frame has to be bound to `df`: that is the name
# every later cell reads and writes (the original bound it to `f`, which makes
# the notebook fail with NameError on a fresh kernel).
df = pd.read_csv('bmore.csv')

In [None]:
# One alternation of everything that should be stripped from a tweet. Order
# matters: the specific patterns must be tried before the catch-all at the end.
_TWEET_NOISE_RE = re.compile(
    r"|".join((
        r"@[^ ]+",                   # @mentions
        r"https?://[A-Za-z0-9./]+",  # links
        r"'s",                       # possessive 's
        r"#\w+",                     # hashtags
        r"&amp;? ?",                 # HTML-escaped ampersand, with or without the ';'
        r"[^A-Za-z\s]",              # anything left that is not a letter or whitespace
    ))
)


def clean_text(text):
    """Strip Twitter noise from a tweet and return it lower-cased.

    Removes @mentions, http(s) links, possessive ``'s``, hashtags, the HTML
    entity for ``&`` and finally every character that is not an ASCII letter
    or whitespace. Runs of whitespace left behind by removals are not
    collapsed; only leading/trailing whitespace is trimmed.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. ``None`` or the ``NaN``
        pandas uses for missing cells) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned, lower-cased, stripped text (``""`` for missing input).
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""
    return _TWEET_NOISE_RE.sub("", text).lower().strip()

In [None]:
# Replace each tweet body with its cleaned form (see clean_text).
raw_text = df['text']
df['text'] = raw_text.apply(clean_text)

In [None]:
# Sanity check: lemmatize the verb form 'went' and print the result.
lemmatizer_demo = WordNetLemmatizer()
print(lemmatizer_demo.lemmatize('went', pos='v'))

In [55]:
# Sanity check: stem a handful of English words and show them side by side.
# `stemmer` is also the module-level stemmer that lemmatize_stemming() uses.
stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))

pd.DataFrame({'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [56]:
# Text preprocessing helpers
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the lemma.

    Uses a fresh ``WordNetLemmatizer`` and the module-level ``stemmer``.
    Returns the stemmed string.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens as a list.

    Tokens come from gensim's ``simple_preprocess``; a token is kept only if
    it is longer than three characters and is not a gensim stopword. Kept
    tokens are passed through ``lemmatize_stemming`` in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [57]:
# Smoke test: run preprocess() on a hand-written sentence and compare it
# with the plain space-split words.

doc_sample = 'This disk has failed many times. I would like to get it replaced.'

words = doc_sample.split(' ')
print("Original document: ")
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [65]:
# One token list per tweet, in the same order as the rows of df.
text_clean = [preprocess(doc) for doc in df.text]

# Build the id <-> token dictionary from the preprocessed tweets.
dictionary = gensim.corpora.Dictionary(text_clean)

In [66]:
# Peek at the dictionary: print (id, token) pairs and stop after the 11th.

for seen, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if seen > 10:
        break

0 baltimorean
1 eternal
2 forev
3 francisco
4 pelosi
5 peopl
6 repres
7 speaker
8 appropri
9 barrier
10 field


In [67]:
# Convert every token list to its bag-of-words form: a list of (token_id, count) pairs.
twit_bag = list(map(dictionary.doc2bow, text_clean))

In [68]:
# Display the bag-of-words entry built for the first document in text_clean.
twit_bag[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]

In [69]:
# Spell out the first document's bag of words: token id, token text, and count.
bow_doc_x = twit_bag[0]

for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     occurrences))

Word 0 ("baltimorean") appears 1 time.
Word 1 ("eternal") appears 1 time.
Word 2 ("forev") appears 1 time.
Word 3 ("francisco") appears 1 time.
Word 4 ("pelosi") appears 1 time.
Word 5 ("peopl") appears 1 time.
Word 6 ("repres") appears 1 time.
Word 7 ("speaker") appears 1 time.


In [80]:
# Train the LDA topic model on the bag-of-words corpus.
lda_params = dict(
    num_topics=15,         # number of topics to fit
    id2word=dictionary,    # lets the model report tokens instead of ids
    passes=10,             # passes over the corpus
    random_state=1,        # fixed seed
    workers=2,             # worker processes used by LdaMulticore
)
tweets_lda = gensim.models.LdaMulticore(twit_bag, **lda_params)

In [81]:
# Print every topic (-1 = all of them) with its weighted top words.
for topic_id, topic_terms in tweets_lda.print_topics(-1):
    print(f"Topic: {topic_id} \nWords: {topic_terms}")
    print("\n")

Topic: 0 
Words: 0.030*"water" + 0.026*"area" + 0.021*"test" + 0.021*"weather" + 0.018*"special" + 0.017*"expect" + 0.017*"point" + 0.015*"latest" + 0.015*"storm" + 0.014*"continu"


Topic: 1 
Words: 0.033*"covid" + 0.029*"case" + 0.025*"maryland" + 0.021*"report" + 0.019*"public" + 0.019*"death" + 0.015*"vaccin" + 0.014*"ann" + 0.014*"arundel" + 0.013*"believ"


Topic: 2 
Words: 0.111*"citi" + 0.086*"baltimor" + 0.018*"public" + 0.018*"mayor" + 0.017*"council" + 0.015*"meet" + 0.013*"hear" + 0.012*"board" + 0.012*"state" + 0.011*"announc"


Topic: 3 
Words: 0.080*"polic" + 0.043*"baltimor" + 0.042*"shoot" + 0.030*"offic" + 0.025*"yearold" + 0.022*"counti" + 0.019*"say" + 0.019*"charg" + 0.018*"investig" + 0.018*"kill"


Topic: 4 
Words: 0.061*"maryland" + 0.031*"elect" + 0.029*"raven" + 0.025*"vote" + 0.020*"state" + 0.014*"counti" + 0.013*"democrat" + 0.013*"candid" + 0.011*"race" + 0.011*"earli"


Topic: 5 
Words: 0.043*"live" + 0.031*"news" + 0.029*"talk" + 0.024*"baltimor" + 0.021

In [82]:
# Tag each tweet with its dominant topic: the topic id with the highest
# probability in the model's (topic_id, probability) list for that tweet.
# max() returns the first highest-probability pair, which is the same pair a
# stable descending sort would put first, so the result matches the original
# sort-based version. The original int(str(id).zfill(3)) round-trip was a no-op.
df['main_topic'] = [
    int(max(tweets_lda[bow], key=lambda topic_prob: topic_prob[1])[0])
    for bow in twit_bag
]

In [83]:
# Preview the first rows, now including the main_topic column.
df.head()

Unnamed: 0.1,Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets,main_topic
0,0.0,2022-11-18 02:52:03+00:00,Brandon M. Scott,MayorBMScott,speaker pelosi may represent the people of san...,36,4,12
1,1.0,2022-11-18 02:52:02+00:00,Brandon M. Scott,MayorBMScott,so many of my peers can thank her for knocking...,26,2,0
2,2.0,2022-11-18 02:52:02+00:00,Brandon M. Scott,MayorBMScott,in addition to being the first female speaker ...,9,0,10
3,3.0,2022-11-18 02:52:01+00:00,Brandon M. Scott,MayorBMScott,it brings us all great pride that a daughter o...,11,2,5
4,4.0,2022-11-18 02:52:00+00:00,Brandon M. Scott,MayorBMScott,on behalf of the city of baltimore i want to c...,221,12,10


In [79]:
# Spot-check: cleaned tweet text stored under index label 1.
df.text[1]

'so many of my peers can thank her for knocking down barriers in order to level the playing field only appropriate'

In [86]:
# Spot-check: cleaned tweet text stored under index label 100.
df.text[100]

'phase  of the  st century schools program is set to begin with renovations for some of our moat'

In [87]:
# Topic id assigned to the tweet under index label 100.
df.main_topic[100]

10

In [88]:
# Spot-check: cleaned tweet text stored under index label 500.
df.text[500]

'stay connected you never know where your favorite artists entrepreneurs and influencers might be next visit'

In [89]:
# Topic id assigned to the tweet under index label 500.
df.main_topic[500]

11

In [91]:
# Write the frame (cleaned text plus main_topic) to lda_bmore.csv.
# NOTE(review): to_csv also writes the row index by default; re-reading the
# file without index_col then produces an extra "Unnamed: 0" column. Consider
# index=False after confirming no downstream reader depends on that column.
df.to_csv('lda_bmore.csv')