## Import Required Libraries

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import string
import re
import nltk  
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [None]:
from nltk import ngrams
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import collections
import math
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

In [None]:
import gensim
from gsdmm import MovieGroupProcess

## Read Tweets Data

In [None]:
# Search query handed to the scraper: only date-range and language filters.
# NOTE(review): there are no search keywords before the filters — confirm this is intended.
search_query = ' since:2021-11-1 until:2021-11-30 lang:en'

# Collect the text, author name, timestamp and id of every tweet the scraper yields
tweets_list = [
    [tweet.content, tweet.user.username, tweet.date, tweet.id]
    for tweet in sntwitter.TwitterSearchScraper(search_query).get_items()
]

# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=['text', 'user', 'date', 'Tweet Id'])

# Every later cell works on `df`, which was otherwise never defined.
# Use a copy so the in-place preprocessing leaves the raw scrape in tweets_df intact.
df = tweets_df.copy()

## Preprocess Data

Drop unnecessary columns

In [None]:
# Discard the author, timestamp and id columns in place
df.drop(['user', 'date', 'Tweet Id'], axis='columns', inplace=True)

Remove URLs from data

In [None]:
def remove_urls(text):
    """Return ``text`` with every run of non-whitespace characters that starts with ``http`` deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Strip links from every tweet's text
df['text'] = df['text'].map(remove_urls)

Lowercase all letters and remove punctuation

In [None]:
# Lowercase the text, turn every character that is neither a word character nor
# whitespace into a space, collapse runs of spaces, and trim both ends.
# regex=True is spelled out because pandas 2.0 changed the default of
# str.replace to literal matching, which would leave punctuation untouched.
# Raw strings keep the regex escapes (\w, \s) from being read as string escapes.
df['clean'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r' +', ' ', regex=True)
    .str.strip()
)

In [None]:
# Relabel the columns with integer keys: 0 for the URL-stripped text, 1 for the cleaned text
df = df.rename({"text": 0, "clean": 1}, axis="columns")

Tokenize data

In [None]:
# Split each cleaned tweet (column 1) into a list of word tokens.
# Applying the tokenizer to the column itself avoids building a row Series per
# tweet, which the row-wise DataFrame.apply(axis=1) form did only to read one field.
df[1] = df[1].apply(nltk.word_tokenize)

Remove Stop Words

In [None]:
# Base stop-word list: NLTK's English stop words (extended in the following cells)
stop_words = list(stopwords.words('english'))

In [None]:
# Every single lowercase letter counts as a stop word ...
stop_words.extend(string.ascii_lowercase)

# ... followed by common function and filler words (order and duplicates kept as authored)
stop_words.extend((
    "about", "across", "after", "all", "also", "an", "and", "another", "added",
    "any", "are", "as", "at", "basically", "be", "because", "become", "been",
    "before", "being", "between", "both", "but", "by", "came", "can", "come",
    "could", "did", "do", "does", "each", "else", "every", "either",
    "especially", "for", "from", "get", "given", "gets",
    "give", "gives", "got", "goes", "had", "has", "have", "he", "her", "here",
    "him", "himself", "his", "how", "if", "in", "into", "is", "it", "its",
    "just", "lands", "like", "make", "making", "made", "many", "may", "me",
    "might", "more", "most", "much", "must", "my", "never", "provide",
    "provides", "perhaps", "no", "now", "of", "on", "only", "or", "other",
    "our", "out", "over", "re", "said", "same", "see", "should", "since", "so",
    "some", "still", "such", "seeing", "see", "take", "than", "that", "the",
    "their", "them", "then", "there",
    "these", "they", "this", "those", "through", "to", "too", "under", "up",
    "use", "using", "used", "underway", "very", "want", "was", "way", "we",
    "well", "were", "what", "when", "where", "which", "while", "whilst", "who",
    "will", "with", "would", "you", "your",
    "etc", "via", "eg",
))

In [None]:
# Tweet-specific noise: whitespace and markup leftovers, contractions, digits and chatty verbs
stop_words.extend([
    'hi', '\n', '\n\n', '&amp;', ' ', '.', '-', 'got',
    "it's", 'it’s', "i'm", 'i’m', 'im',
    'want', 'like', '$', '@',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'rt', 'feel', 'give', 'giving', 'help', 'said', 'also', 'gave', 'like', 'going', 'even',
])

In [None]:
# Drop stop words from every token list.
# The set is built once so each membership test is a hash lookup; testing
# against the list would rescan all of its entries for every token.
stop_word_set = set(stop_words)
df[1] = df[1].apply(lambda tokens: [token for token in tokens if token not in stop_word_set])

Perform Lemmatization

In [None]:
# Replace every remaining token with the result of WordNetLemmatizer.lemmatize
# (called with its default arguments)
wordnet_lemmatizer = WordNetLemmatizer()

df[1] = df[1].apply(lambda tokens: list(map(wordnet_lemmatizer.lemmatize, tokens)))

In [None]:
# Array of token lists, one entry per tweet, used as the document collection below
docs = df[1].values

## Create a Dictionary

In [None]:
# Build the token -> integer id mapping from all documents
dictionary = Dictionary(docs)

# Prune the vocabulary with the thresholds below (document-frequency bounds and a size cap)
dictionary.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)

# Convert every document to its bag-of-words representation
bow_corpus = list(map(dictionary.doc2bow, docs))

## Create LDA model

In [None]:
# Train a 5-topic LDA model on the bag-of-words corpus with 2 worker processes;
# random_state is pinned so every run starts from the same seed
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=5,
    passes=4,
    workers=2,
    random_state=21,
)

In [None]:
from gensim.models import CoherenceModel

View LDA topics

In [None]:
# Display the learned topics; no arguments are passed, so the library defaults
# decide how many topics and words per topic are listed
lda_model.show_topics()

### Calculate LDA Coherence Score

In [None]:
# Score the LDA topics with the 'c_v' coherence measure over the tokenized documents
cm = CoherenceModel(
    model=lda_model,
    texts=docs,
    corpus=bow_corpus,
    coherence='c_v',
)
coherence_lda = cm.get_coherence()
print(coherence_lda)

## Create GSDMM Model

In [None]:
# Number of distinct tokens in the documents the model is fitted on.
# `dictionary` was pruned by filter_extremes while `docs` was not, so
# len(dictionary) would understate the vocabulary GSDMM actually sees;
# count it from `docs` directly.
vocab_length = len({token for doc in docs for token in doc})

# initialize GSDMM with at most 15 clusters and 15 sampling iterations
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model; the return value is kept in `y`
y = gsdmm.fit(docs, vocab_length)

In [None]:
import numpy as np

Display GSDMM topics with top words

In [None]:
# How many documents GSDMM assigned to each cluster
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Cluster indices ordered by descending document count (at most 15 of them)
top_index = np.argsort(doc_count)[::-1][:15]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the highest-count words of each requested cluster.

    cluster_word_distribution: indexable collection of ``{word: count}`` mappings
    top_cluster: iterable of cluster indices, printed in the given order
    values: maximum number of (word, count) pairs printed per cluster
    """
    def by_count(pair):
        return pair[1]

    for cluster in top_cluster:
        ranked = sorted(cluster_word_distribution[cluster].items(), key=by_count, reverse=True)
        print("\nCluster %s : %s"%(cluster, ranked[:values]))

# get top words in topics
# Print the 20 highest-count words of every cluster in top_index
top_words(
    cluster_word_distribution=gsdmm.cluster_word_distribution,
    top_cluster=top_index,
    values=20,
)

Create Lists from GSDMM topics

In [None]:
def get_topics_lists(model, top_clusters, n_words):
    '''
    Collect the leading words of each requested cluster as a list of lists.

    model: gsdmm instance exposing ``cluster_word_distribution``
    top_clusters: iterable of cluster indices; the result follows this order
    n_words: maximum number of words kept per cluster

    Returns one list of words per cluster, ordered by descending count.
    '''
    def leading_words(word_counts):
        # rank the (word, count) pairs by count, highest first, and keep the words only
        ranked = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
        return [word for word, _count in ranked[:n_words]]

    return [leading_words(model.cluster_word_distribution[cluster]) for cluster in top_clusters]

# get topics to feed to coherence model
# Clusters that received no documents have an empty word distribution and
# therefore yield an empty topic; those are dropped because an empty word list
# is not a scorable topic for the coherence model.
topics = [topic for topic in get_topics_lists(gsdmm, top_index, 20) if topic]

### Calculate GSDMM Coherence Score

In [None]:
# Score the GSDMM topics with the 'c_v' coherence measure, using the same
# dictionary, corpus and tokenized documents as the LDA evaluation
cm_gsdmm = CoherenceModel(
    topics=topics,
    texts=docs,
    corpus=bow_corpus,
    dictionary=dictionary,
    coherence='c_v',
)

# compute and report the coherence value
coherence_gsdmm = cm_gsdmm.get_coherence()
print(coherence_gsdmm)

In [None]:
# Print the 20 highest-count words of every cluster in top_index again
top_words(
    cluster_word_distribution=gsdmm.cluster_word_distribution,
    top_cluster=top_index,
    values=20,
)

In [None]:
# Recompute and print the 'c_v' coherence of the GSDMM topics
cm_gsdmm = CoherenceModel(
    topics=topics,
    texts=docs,
    corpus=bow_corpus,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_gsdmm = cm_gsdmm.get_coherence()
print(coherence_gsdmm)