In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from tqdm import tqdm

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

The URLs below point to XLSX files containing tweets from 2022 from the following Twitter accounts:
- [@TheBeaverton](https://twitter.com/TheBeaverton)
- [@BillNye](https://twitter.com/BillNye) (Bill Nye)
- [@GeorgeTakei](https://twitter.com/GeorgeTakei) (George Takei)
- [@MeCookieMonster](https://twitter.com/MeCookieMonster) (Cookie Monster)
- [@neiltyson](https://twitter.com/neiltyson) (Neil deGrasse Tyson)
- [@StephenKing](https://twitter.com/StephenKing) (Stephen King)
- [@22_Minutes](https://twitter.com/22_Minutes) (This Hour Has 22 Minutes)

*If you want local copies of these data files, you can download each by Ctrl+clicking (Windows) or Command+clicking (Mac) the links below.*

## Data Loading and Exploration

In [3]:
beaverton_url = 'https://drive.google.com/uc?export=download&id=173t6vMYcbSED81IOmg0MbhHsrSFSiF21'
billnye_url = 'https://drive.google.com/uc?export=download&id=1w1Xm-TQfcPfGICM_8_LQiWSqzf8WUk8N'
georgetakei_url = 'https://drive.google.com/uc?export=download&id=1_Ck9S2aW9cBQQrqoTiNLRmvRu4WvPn8y'
cookiemonster_url = 'https://drive.google.com/uc?export=download&id=10Av-HVklZA4Su3TcvSpxU6nS8At9mJ05'
neiltyson_url = 'https://drive.google.com/uc?export=download&id=1GSKxxldXe--HE4gSYAEn2ip2vS4bXKMM'
stephenking_url = 'https://drive.google.com/uc?export=download&id=1Oq2pUCv0d7x4tDKX1E-3hgdSAuXcZtVU'
twentytwo_minutes_url = 'https://drive.google.com/uc?export=download&id=1B2zwAJYdw_9skXAFbn7QIs6uyIikje5W'

In [4]:
beaverton = pd.read_excel(beaverton_url, engine='openpyxl')
beaverton.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,Heartwarming: This man reconnected with his mo...,The Beaverton,2022-03-05
1,2,OP-ED: Big Pharma put all of the chips in the ...,The Beaverton,2022-03-04
2,3,Make every day a 'talking to Americans about t...,The Beaverton,2022-03-04
3,4,Single drop of sweat falls down local dad’s fa...,The Beaverton,2022-03-04
4,5,100% of Canadians pretty sure they’ve had COVI...,The Beaverton,2022-03-03


In [5]:
billnye = pd.read_excel(billnye_url, engine='openpyxl')
billnye.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,"Happy Twosday, people(s). That’s right, it’s 2...",Bill Nye,2022-02-22
1,2,Dr. King would be 93 years old this week. The ...,Bill Nye,2022-01-17
2,3,Happy Isaac Newton’s Birthday James Webb Space...,Bill Nye,2021-12-25
3,4,"Please, consider the following… my new VR Spac...",Bill Nye,2021-12-14
4,5,Winter tornadoes… warm winter in the south & c...,Bill Nye,2021-12-12


In [6]:
georgetakei = pd.read_excel(georgetakei_url, engine='openpyxl')
georgetakei.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,He is still sorely missed. 💕 What is your favo...,georgetakei,2022-03-12
1,2,It’s terrifying but true. The GOP has become t...,georgetakei,2022-03-12
2,3,This is not normal.,georgetakei,2022-03-12
3,4,"Very well put, indeed.",georgetakei,2022-03-12
4,5,"With the former guy, mockery is the best polic...",georgetakei,2022-03-12


In [7]:
cookiemonster = pd.read_excel(cookiemonster_url, engine='openpyxl')
cookiemonster.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,"Today, me so thankful for chefs like me friend...",MeCookieMonster,2022-03-08
1,2,You can’t spell cookie without “oo.” Speaking ...,MeCookieMonster,2022-02-25
2,3,Me so grateful @SesameErnie\n. Me think sharin...,MeCookieMonster,2022-02-17
3,4,What do me love? Friends. Family. Cookies. Ple...,MeCookieMonster,2022-02-14
4,5,"Dipping cookies in milk? Me get it, but who ha...",MeCookieMonster,2022-01-28


In [8]:
neiltyson = pd.read_excel(billnye_url, engine='openpyxl')
neiltyson.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,"Happy Twosday, people(s). That’s right, it’s 2...",Bill Nye,2022-02-22
1,2,Dr. King would be 93 years old this week. The ...,Bill Nye,2022-01-17
2,3,Happy Isaac Newton’s Birthday James Webb Space...,Bill Nye,2021-12-25
3,4,"Please, consider the following… my new VR Spac...",Bill Nye,2021-12-14
4,5,Winter tornadoes… warm winter in the south & c...,Bill Nye,2021-12-12


In [9]:
stephenking = pd.read_excel(stephenking_url, engine='openpyxl')
stephenking.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,What kind of dinosaur writes romance novels? A...,StephenKing,2022-03-05
1,2,"Molly, aka the Thing of Evil, fools my friend ...",StephenKing,2022-02-25
2,3,Two questions (and a codicil) about MANIFEST: ...,StephenKing,2022-02-25
3,4,What most of us learned as kids on the playgro...,StephenKing,2022-02-23
4,5,"I need baseball! Come on you guys, stop being ...",StephenKing,2022-02-16


In [10]:
twentytwo_minutes = pd.read_excel(twentytwo_minutes_url, engine='openpyxl')
twentytwo_minutes.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date
0,1,"And in lighter news this week: Nope, sorry. We...",22Minutes,2022-03-03 00:00:00
1,2,Dee Snider of Twisted Sister says he approves ...,22Minutes,2022-03-03 00:00:00
2,3,Tim Hortons will be releasing their own line o...,22Minutes,2022-03-03 00:00:00
3,4,A new survey shows that 1 in 8 people suffer f...,22Minutes,2022-03-02 00:00:00
4,5,The Bank of Canada has raised its interest rat...,22Minutes,2022-03-02 00:00:00


In [11]:
beaverton['account'] = 'The Beaverton'
billnye['account'] = 'Bill Nye'
georgetakei['account'] = 'George Takei'
cookiemonster['account'] = 'Cookie Monster'
neiltyson['account'] = 'Neil deGrasse Tyson'
stephenking['account'] = 'Stephen King'
twentytwo_minutes['account'] = 'This Hour Has 22 Minutes'

In [12]:
data = pd.concat([beaverton, billnye, georgetakei, cookiemonster, neiltyson, stephenking, twentytwo_minutes], ignore_index=True)
data.head()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date,account
0,1,Heartwarming: This man reconnected with his mo...,The Beaverton,2022-03-05 00:00:00,The Beaverton
1,2,OP-ED: Big Pharma put all of the chips in the ...,The Beaverton,2022-03-04 00:00:00,The Beaverton
2,3,Make every day a 'talking to Americans about t...,The Beaverton,2022-03-04 00:00:00,The Beaverton
3,4,Single drop of sweat falls down local dad’s fa...,The Beaverton,2022-03-04 00:00:00,The Beaverton
4,5,100% of Canadians pretty sure they’ve had COVI...,The Beaverton,2022-03-03 00:00:00,The Beaverton


In [13]:
data.tail()

Unnamed: 0,tweet-id,tweet-text,tweet-author,tweet-timestamp-date,account
1071,196,A student at U of T is launching a city-wide s...,22Minutes,2021-09-17 00:00:00,This Hour Has 22 Minutes
1072,197,New poll shows nearly half of Canadians don’t ...,22Minutes,2021-09-17 00:00:00,This Hour Has 22 Minutes
1073,198,Apple has introduced the iPhone 13! Huge! Mass...,22Minutes,2021-09-16 00:00:00,This Hour Has 22 Minutes
1074,199,The Price Is Right is celebrating its 50th yea...,22Minutes,2021-09-15 00:00:00,This Hour Has 22 Minutes
1075,200,Some restaurants are reporting a shortage of c...,22Minutes,2021-09-15 00:00:00,This Hour Has 22 Minutes


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tweet-id              1076 non-null   int64 
 1   tweet-text            1076 non-null   object
 2   tweet-author          1076 non-null   object
 3   tweet-timestamp-date  1076 non-null   object
 4   account               1076 non-null   object
dtypes: int64(1), object(4)
memory usage: 42.2+ KB


## Text Preprocessing

In [15]:
# to initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [16]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text) # [^a-zA-Z\s] => This pattern matches any character that is not an alphabetic letter (A to Z or a to z) or a whitespace.

    tokens = word_tokenize(text) # A function from NLTK that splits a string of text into individual words or tokens (smallest units of meaning).

    tokens = [token.lower() for token in tokens]

    tokens = [token for token in tokens if token not in stop_words]

    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Converts each word (token) into its root form (lemma).

    return tokens

In [17]:
tqdm.pandas() # tqdm.pandas() integrates the tqdm progress bar with pandas, allowing you to track progress when using .apply() on DataFrames or Series.
data['processed_tweet_text'] = data['tweet-text'].progress_apply(preprocess_text)
data[['tweet-text', 'processed_tweet_text']].head()

100%|██████████| 1076/1076 [00:03<00:00, 275.41it/s]


Unnamed: 0,tweet-text,processed_tweet_text
0,Heartwarming: This man reconnected with his mo...,"[heartwarming, man, reconnected, mom, could, a..."
1,OP-ED: Big Pharma put all of the chips in the ...,"[oped, big, pharma, put, chip, vaccine, cant, ..."
2,Make every day a 'talking to Americans about t...,"[make, every, day, talking, american, healthca..."
3,Single drop of sweat falls down local dad’s fa...,"[single, drop, sweat, fall, local, dad, face, ..."
4,100% of Canadians pretty sure they’ve had COVI...,"[canadian, pretty, sure, theyve, covid, already]"


##  Corpus Preparation, LDA Model Training, and Topic Evaluation

This section prepares the processed text for LDA topic modelling by creating a structured corpus and training the model using Gensim.

In [18]:
# Creates a dictionary where each unique word gets a unique integer ID.
# The dictionary stores how many times each word appears across all documents.
dictionary = corpora.Dictionary(data['processed_tweet_text'])

### Why These Parameters?

- `num_topics=5`: Each account likely discusses a few main themes. Using five topics per account helps keep topics clear and meaningful without making them too broad or too specific. **Experimenting with coherence scores** can determine the optimal number.
- `passes=50`: Runs the model enough times to find good topics without wasting time on extra passes that don't add much improvement.
- `update_every=1`: Updates the model after each batch of documents, making training faster and more memory-efficient, especially for large datasets.
- `chunksize=200`: Processes 200 documents at a time, which makes training faster and helps the model find more stable topics.
- `alpha='auto'`: Automatically adjusts how many topics each document contains, so the model can decide if a document should focus on just one topic or multiple topics.
- `per_word_topics=False`: Speeds up processing by skipping extra details about individual words, which are not usually needed for topic analysis.

In [19]:
# to loop through each account and train an LDA model for each
for account in data['account'].unique():
    account_data = data[data['account'] == account]

    # Converts each document (text, which is a list of words) into a sparse vector.
    # ['heartwarming', 'man', 'reconnected', 'mom', 'could', 'ask', 'tax'] => [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
    account_corpus = [dictionary.doc2bow(text) for text in account_data['processed_tweet_text']]

    lda_model = LdaModel(corpus=account_corpus,
                         id2word=dictionary,
                         num_topics=5,
                         passes=50,
                         update_every=1,
                         chunksize=200,
                         alpha='auto',
                         per_word_topics=False)

    # The coherence score measures how well the words in a topic are related, helping us evaluate the quality of the LDA model.
    # A higher coherence score means better topics, making the model more useful!
    # The coherence score in LDA topic modeling typically ranges between 0 and 1, but it does not have strict theoretical bounds.
    # A higher coherence score (~0.5 to 0.7) means the topics are well-formed and useful!
    coherence_model = CoherenceModel(model=lda_model,
                                     texts=account_data['processed_tweet_text'],
                                     dictionary=dictionary,
                                     coherence='c_v') # Use coherence='c_v' for topic interpretability and when deciding the best num_topics.
    coherence_score = coherence_model.get_coherence()

    print(f"\nTopics for Account: {account} (Coherence Score: {coherence_score:.4f})")
    print("-" * 100)

    for topic_id, topic in lda_model.show_topics(formatted=True, num_words=10):
      print(f"Topic {topic_id + 1}: {topic}")


Topics for Account: The Beaverton (Coherence Score: 0.6136)
----------------------------------------------------------------------------------------------------
Topic 1: 0.005*"man" + 0.004*"life" + 0.003*"canadian" + 0.003*"winter" + 0.003*"back" + 0.002*"new" + 0.002*"night" + 0.002*"enough" + 0.002*"time" + 0.002*"every"
Topic 2: 0.004*"woman" + 0.004*"local" + 0.004*"report" + 0.003*"scientist" + 0.003*"time" + 0.003*"make" + 0.002*"youre" + 0.002*"toronto" + 0.002*"person" + 0.002*"day"
Topic 3: 0.005*"day" + 0.004*"team" + 0.004*"man" + 0.004*"new" + 0.003*"leaf" + 0.003*"bad" + 0.002*"seven" + 0.002*"player" + 0.002*"buying" + 0.002*"bubble"
Topic 4: 0.006*"canada" + 0.004*"man" + 0.003*"local" + 0.002*"music" + 0.002*"report" + 0.002*"new" + 0.002*"study" + 0.002*"pretty" + 0.002*"cdnpoli" + 0.002*"global"
Topic 5: 0.007*"canadian" + 0.003*"man" + 0.003*"covid" + 0.002*"never" + 0.002*"pizza" + 0.002*"took" + 0.002*"climate" + 0.002*"political" + 0.002*"change" + 0.002*"cartoo

In [20]:
for account in data['account'].unique():
    account_data = data[data['account'] == account]

    account_corpus = [dictionary.doc2bow(text) for text in account_data['processed_tweet_text']]

    lda_model = LdaModel(corpus=account_corpus,
                         id2word=dictionary,
                         num_topics=3,
                         passes=50,
                         update_every=1,
                         chunksize=200,
                         alpha='auto',
                         per_word_topics=False)

    coherence_model = CoherenceModel(model=lda_model,
                                     texts=account_data['processed_tweet_text'],
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    print(f"\nTopics for Account: {account} (Coherence Score: {coherence_score:.4f})")
    print("-" * 100)

    for topic_id, topic in lda_model.show_topics(formatted=True, num_words=10):
      print(f"Topic {topic_id + 1}: {topic}")


Topics for Account: The Beaverton (Coherence Score: 0.6282)
----------------------------------------------------------------------------------------------------
Topic 1: 0.005*"man" + 0.003*"local" + 0.003*"make" + 0.002*"scientist" + 0.002*"day" + 0.002*"youre" + 0.002*"woman" + 0.002*"like" + 0.002*"time" + 0.002*"person"
Topic 2: 0.005*"canadian" + 0.004*"new" + 0.003*"time" + 0.003*"toronto" + 0.003*"team" + 0.003*"report" + 0.003*"get" + 0.002*"winter" + 0.002*"every" + 0.002*"leaf"
Topic 3: 0.004*"man" + 0.003*"canada" + 0.003*"day" + 0.003*"local" + 0.003*"two" + 0.002*"cat" + 0.002*"life" + 0.002*"new" + 0.002*"back" + 0.002*"cdnpoli"

Topics for Account: Bill Nye (Coherence Score: 0.3234)
----------------------------------------------------------------------------------------------------
Topic 1: 0.010*"day" + 0.006*"today" + 0.005*"u" + 0.005*"world" + 0.004*"mar" + 0.004*"people" + 0.003*"go" + 0.003*"exploreplanets" + 0.003*"space" + 0.003*"think"
Topic 2: 0.008*"year" + 0

In [21]:
for account in data['account'].unique():
    account_data = data[data['account'] == account]

    account_corpus = [dictionary.doc2bow(text) for text in account_data['processed_tweet_text']]

    lda_model = LdaModel(corpus=account_corpus,
                         id2word=dictionary,
                         num_topics=10,
                         passes=50,
                         update_every=1,
                         chunksize=200,
                         alpha='auto',
                         per_word_topics=False)

    coherence_model = CoherenceModel(model=lda_model,
                                     texts=account_data['processed_tweet_text'],
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    print(f"\nTopics for Account: {account} (Coherence Score: {coherence_score:.4f})")
    print("-" * 100)

    for topic_id, topic in lda_model.show_topics(formatted=True, num_words=10):
      print(f"Topic {topic_id + 1}: {topic}")

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))



Topics for Account: The Beaverton (Coherence Score: nan)
----------------------------------------------------------------------------------------------------
Topic 1: 0.006*"new" + 0.006*"local" + 0.004*"day" + 0.004*"man" + 0.004*"team" + 0.004*"player" + 0.004*"talking" + 0.004*"bunch" + 0.004*"report" + 0.004*"climate"
Topic 2: 0.011*"man" + 0.004*"toronto" + 0.004*"bubble" + 0.004*"enough" + 0.004*"political" + 0.004*"cartoonist" + 0.004*"canadian" + 0.004*"local" + 0.004*"life" + 0.004*"pretty"
Topic 3: 0.009*"canada" + 0.007*"canadian" + 0.007*"winter" + 0.004*"every" + 0.004*"complain" + 0.004*"gift" + 0.002*"never" + 0.002*"study" + 0.002*"actually" + 0.002*"video"
Topic 4: 0.004*"rebel" + 0.004*"cdnpoli" + 0.004*"report" + 0.004*"hockey" + 0.004*"canada" + 0.004*"dish" + 0.004*"left" + 0.004*"bad" + 0.002*"night" + 0.002*"covid"
Topic 5: 0.008*"local" + 0.006*"snow" + 0.006*"man" + 0.006*"winnipeg" + 0.004*"onto" + 0.004*"first" + 0.004*"report" + 0.004*"lake" + 0.002*"cant" 

## Topic Comparison Across Users