In [None]:
###Base Code

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Step 1: Scrape tweets
query = '#FreePalestine -is:retweet lang:en'
tweets = client.search_recent_tweets(query=query, max_results=10)
tweet_texts = [tweet.text for tweet in tweets.data]

# Step 2: Preprocess
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

cleaned_tweets = [preprocess(t) for t in tweet_texts]

# Step 3: Vectorize and model topics
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(cleaned_tweets)

nmf = NMF(n_components=3, random_state=42)
nmf.fit(X)

feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))
    print()

Topic #1:
terrorist, israel, state, organization, apartheid, earth, vipe, hope, died, clean

Topic #2:
war, accountable, colonizing, gtgt, hold, guys, need, criminals, hope, died

Topic #3:
wt, omg, sht, actual, horrific, terrorists, america, countries, creating, destroyed



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bturnbull1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
###Base Code with Explanation

In [None]:
###Importing Packages
##First I import the Python regular expressions module which provides pattern-based text cleaning such as removing URLs and hashtags from the tweets.
import re
##I then import the NLTK (Natural Language Toolkit) and the English stopword list. Stopwords are common words like “the”, “is”, “in” that are removed in the preprocessing so they do not alter the analysis.
import nltk
from nltk.corpus import stopwords
##Then the TF-IDF Vectorizer from scikit-learn is imported to convert text into numbers within a document-term matrix. This tool creates an overall TF-IDF numerical measure which is composed of both Term Frequency (TF) which counts how often 
##a word appears in a single tweet, and Inverse Document Frequency (IDF) which scores a word based on how rare it is across all the tweets. A high IDF word will appear in fewer tweets than a low IDF word.
##Therefore the combined TF-IDF measure highlights words that are frequent in one tweet and rare across others, which is really helpful for finding topic-specific words.
from sklearn.feature_extraction.text import TfidfVectorizer
##I also import Non-negative Matrix Factorization (NMF), which is a topic modeling algorithm that takes the TF-IDF matrix and breaks it down into a set of topics that illustrates the main themes 
##in a large collection of text. It turns the text into a document-term matrix where each row is a tweet, each column is a word and each number shows how often that word appeared in the tweet.
from sklearn.decomposition import NMF

###Step 1: Scrape tweets
##Now I move into actually scraping/gathering the tweets from Twitter/X. 
##First I define the search query, for this test run I only want tweets containing #FreePalestine, excluding retweets, and written in English.
query = '#FreePalestine -is:retweet lang:en'
##Next I send the query to the Twitter API using Tweepy (I've already authenticated my bearer token required for access)
##Given the free-tier Twitter API limits I retrieve only the 10 most recent tweets that match the query to avoiding maxing out my monthly quota too soon.
tweets = client.search_recent_tweets(query=query, max_results=10)
##Now I extract the text content of each tweet and create a list
tweet_texts = [tweet.text for tweet in tweets.data]

###Step 2: Preprocess
##I have my list of tweet text, now I need to preprocess it before the analysis can begin.
##I download the list of English stopwords discussed above in Importing Packages and load the stopwords list as a Pythen set so it can be used efficiently.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

##I create a preprocess function that removes URLs (http... or www...),Twitter mentions (@username),Hashtags (#hashtag), all non-alphabetic characters, converts all letters to lowercase, 
##splits text into individual words (tokens), remove stopwords,and reconstructs the cleaned tokens back into a single string that can be used for modeling.
def preprocess(text):
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

##I then apply this preprocess function to each tweet in the previously created list
cleaned_tweets = [preprocess(t) for t in tweet_texts]

###Step 3: Vectorize and model topics
##The last step is the actual analysis. First I start the TF-IDF vectorizer discussed at the beginning in Importing Packages. This will again remove all English stopwords and will then
##rank all the words by their TF-IDF score and only keep the top 1,000 with the highest overall score, the most "important" words. 
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
##I then convert the cleaned top 1,000 words into a document-term matrix that the NMF model can use.
X = vectorizer.fit_transform(cleaned_tweets)
##I start the NMF model by requesting three topics initially to keep the analysis manageable, since using 10 or more topics can become confusing and lead to overlapping themes. 
##I can adjust the number of topics later once I confirm the model is working well. These three topics are latent themes created de novo from the data—they are not simply selected from 
##existing words in the matrix. NMF groups terms that frequently co-occur across tweets and identifies overarching topics based on those term groupings.
##Setting random_state at 42 makes the model reproducible, this gives the model a starting point so the algorithm starts the same way every time, producing consistent, reproducible results.
##Note that any integer would work.
nmf = NMF(n_components=3, random_state=42)
##The The .fit() function takes the TF-IDF matrix of tweets and runs the NMF algorithm to find the underlying topic structure. After fitting, the model has learned the topic-term relationships
##as dicussed above and can now be used to extract or transform data into topics.
##Note TF-IDF provides the words and NMF provides the structure, topics as weighted groups of those words
nmf.fit(X)
##Finally, I want to display the topics so I retrieve the list of 1,000 top words that the TF-IDF originally identified from the text data.
feature_names = vectorizer.get_feature_names_out()
##Then I for loop over each topic developed by NMF in the previous step. Each topic is a list of weights for each word.
##Note that the NMF output is stored in nmf.components_
##This loop prints the topic number using 1-based indexing to make it readable.
##Next it prints the top 10 words based on their weights in the NMF topic component, the argsort()[:-11:-1]: sorts indices of the top 10 words by weight, in descending order.
##Lastly, I add a blank line between topics for readability.
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))
    print()

###Result
##The code produces three topics learned by the NMF model, each represented by a list of the most important words associated with that topic. These words are selected based on their high weights 
##in the NMF model, which was derived from the TF-IDF vectorized tweet data. I can now conduct a similar analysis of #BlackLivesMatter tweets and use that for further comparison between the 
##two datasets.
    
#FP Topic #1:
#palestine, wont, forget, use, platforms, humanity, awareness, equality, spread, lets

#FP Topic #2:
#genocide, revenge, dehumanising, dehumanised, process, gaza, murder, israelis, response, palestinians

#FP Topic #3:
#fuck, threat, say, morning, louder, hey, death, like, post, war

In [23]:
###FreePalestine Dataset

In [24]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Step 1: Scrape tweets
fp_query = '#FreePalestine -is:retweet lang:en'
fp_tweets = client.search_recent_tweets(query=fp_query, max_results=10) 
fp_tweet_texts = [tweet.text for tweet in fp_tweets.data]  

# Step 2: Preprocess
nltk.download('stopwords')
fp_stop_words = set(stopwords.words('english'))

def preprocess_fp(text):
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = [word for word in text.split() if word not in fp_stop_words]
    return " ".join(tokens)

fp_cleaned_tweets = [preprocess_fp(t) for t in fp_tweet_texts]

# Step 3: Vectorize and model topics
fp_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
fp_X = fp_vectorizer.fit_transform(fp_cleaned_tweets)

fp_nmf = NMF(n_components=3, random_state=42)
fp_nmf.fit(fp_X)

fp_feature_names = fp_vectorizer.get_feature_names_out()
for fp_topic_idx, topic in enumerate(fp_nmf.components_):
    print(f"FP Topic #{fp_topic_idx + 1}:")
    print(", ".join([fp_feature_names[i] for i in topic.argsort()[:-11:-1]]))
    print()

FP Topic #1:
palestine, wont, forget, use, platforms, humanity, awareness, equality, spread, lets

FP Topic #2:
genocide, revenge, dehumanising, dehumanised, process, gaza, murder, israelis, response, palestinians

FP Topic #3:
fuck, threat, say, morning, louder, hey, death, like, post, war



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bturnbull1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
##BLM Dataset

In [25]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Step 1: Scrape tweets
blm_query = '#BlackLivesMatter -is:retweet lang:en'
blm_tweets = client.search_recent_tweets(query=blm_query, max_results=10) 
blm_tweet_texts = [tweet.text for tweet in blm_tweets.data]  

# Step 2: Preprocess
nltk.download('stopwords')
blm_stop_words = set(stopwords.words('english'))

def preprocess_blm(text):
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = [word for word in text.split() if word not in blm_stop_words]
    return " ".join(tokens)

blm_cleaned_tweets = [preprocess_blm(t) for t in blm_tweet_texts]

# Step 3: Vectorize and model topics
blm_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
blm_X = blm_vectorizer.fit_transform(blm_cleaned_tweets)

blm_nmf = NMF(n_components=3, random_state=42)
blm_nmf.fit(blm_X)

blm_feature_names = blm_vectorizer.get_feature_names_out()
for blm_topic_idx, topic in enumerate(blm_nmf.components_):
    print(f"BLM Topic #{blm_topic_idx + 1}:")
    print(", ".join([blm_feature_names[i] for i in topic.argsort()[:-11:-1]]))
    print()

TooManyRequests: 429 Too Many Requests
Too Many Requests

In [None]:
###Reddit Alternative

In [None]:
import praw

reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="your_user_agent"
)

subreddit = reddit.subreddit("PalestinianRights")
for post in subreddit.new(limit=5):
    print(post.title)