In [1]:
!pip install ntscraper
!pip install transformers
!pip install scipy

Collecting ntscraper
  Downloading ntscraper-0.3.13-py3-none-any.whl (11 kB)
Installing collected packages: ntscraper
Successfully installed ntscraper-0.3.13


In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pytz
import torch
from ntscraper import Nitter
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Modify to search based on username, hashtags, or keywords. Change the `mode` argument as well.

Note: this step may take a few minutes as it collects data

In [4]:
# Account whose timeline is scraped (passed with mode='user' below).
twitter_username = "POTUS"

# Scraper client; the instance check is left enabled (skip_instance_check=False).
scraper = Nitter(log_level=1, skip_instance_check=False)

# Fetch the account's tweets for the requested date window.
# The result is indexed with the 'tweets' key by the preprocessing cell.
tweets = scraper.get_tweets(
    twitter_username,
    mode='user',
    since='2023-12-01',
    until='2023-12-31',
)

Testing instances: 100%|██████████| 77/77 [01:26<00:00,  1.13s/it]
INFO:root:No instance specified, using random instance https://nitter.privacydev.net
INFO:root:Current stats for POTUS: 20 tweets, 0 threads...
INFO:root:Current stats for POTUS: 40 tweets, 0 threads...


## Preprocess the text of each tweet to fit the model's needs

In [5]:
def _mask_token(word):
    """Replace a user mention with '@user' and a link with 'http'; return any other word unchanged."""
    if word.startswith('@') and len(word) > 1:
        return '@user'
    if word.startswith('http'):
        return "http"
    return word


# One timestamp and one list of masked words per scraped tweet, in scrape order.
timestamps = [tweet['date'] for tweet in tweets['tweets']]
temp = [
    [_mask_token(word) for word in tweet['text'].split(' ')]
    for tweet in tweets['tweets']
]

# Re-assemble each word list into a single space-separated string.
final_tweets = [" ".join(words) for words in temp]

# Flip both lists together so texts and timestamps stay aligned in the
# opposite order from the one the scraper returned.
final_tweets.reverse()
timestamps.reverse()

print(final_tweets)
print(timestamps)

['My heart is with those who lost their lives in today’s senseless shooting in Prague, those injured, and the Czech people. Our authorities are in touch with Czech law enforcement, and we stand ready to offer additional support if needed.', 'America was founded on the principle of equal justice under law — our criminal justice system can and should reflect this core value that makes our communities safer and stronger.   That’s why I’m announcing additional steps to make the promise of equal justice a reality.', 'First, I’m commuting the sentences of 11 people who are serving disproportionately long sentences for non-violent drug offenses.   All of them would have been eligible to receive significantly lower sentences if they were charged with the same offense today.', 'Second, I’m issuing a Proclamation that will pardon additional offenses of simple possession and use of marijuana under federal and D.C. law.   Too many lives have been upended because of our failed approach.   It’s time

In [6]:
# Hugging Face Hub identifier of the sentiment checkpoint.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

# Class names in the same index order the scoring cell reads the scores
# (0 = negative, 1 = neutral, 2 = positive).
labels = ['Negative', 'Neutral', 'Positive']

# Fetch the classifier weights and the matching tokenizer for that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

# Calculate the scores for each tweet

In [11]:
# Score every preprocessed tweet with the sentiment model and collect one
# record per tweet: its timestamp, its text, and the softmax probability of
# each class (index 0 = negative, 1 = neutral, 2 = positive).
tweets_data = []

# Inference only: disable autograd so no gradient graph is built per tweet.
with torch.no_grad():
    # zip keeps each tweet paired with its timestamp without a manual counter.
    for timestamp, tweet in zip(timestamps, final_tweets):
        # Truncate to 512 tokens so an unusually long tweet cannot exceed the
        # model's maximum input length and crash the forward pass.
        encoded_tweet = tokenizer(
            tweet,
            return_tensors='pt',
            truncation=True,
            max_length=512,
        )
        output = model(**encoded_tweet)

        # output[0] holds the logits for the batch; [0] selects the single tweet.
        scores = softmax(output[0][0].numpy())

        tweets_data.append({
            "timestamp": timestamp,
            "tweet": tweet,
            "negative": scores[0],
            "neutral": scores[1],
            "positive": scores[2],
        })

In [15]:
# Build a frame of per-tweet scores indexed by the tweet's US/Eastern timestamp.
eastern = pytz.timezone('US/Eastern')
df = pd.DataFrame(tweets_data)
parsed_times = pd.to_datetime(df['timestamp'], format='%b %d, %Y · %I:%M %p %Z')
df = df.assign(timestamp=parsed_times.dt.tz_convert(eastern)).set_index('timestamp')

max_length = 100  # hover text is cut to this many characters (tweets can be up to 280)
window_size = 10  # number of consecutive tweets averaged into each plotted point


def _shorten(text):
    """Return text cut to max_length characters with '...' appended when it was longer."""
    if len(text) > max_length:
        return text[:max_length] + '...'
    return text


# Rolling mean of the three class scores, plus the shortened tweet text for hover labels.
score_columns = ['negative', 'neutral', 'positive']
df_smoothed = df[score_columns].rolling(window=window_size).mean()
df_smoothed['tweet'] = df['tweet'].apply(_shorten)

fig = px.line(
    df_smoothed,
    x=df_smoothed.index,
    y='positive',
    title='Tweet Sentiment Scores Over Time',
    labels={'x': 'Timestamp', 'positive': 'Sentiment Score'},
    hover_data={'tweet': True},
    markers=True,
)

# Optional: uncomment to overlay the other two sentiment classes.
# fig.add_scatter(x=df_smoothed.index, y=df_smoothed['negative'], mode='lines', name='Negative')
# fig.add_scatter(x=df_smoothed.index, y=df_smoothed['neutral'], mode='lines', name='Neutral')

fig.update_layout(legend_title_text='Sentiment Type')
fig.update_yaxes(title_text='Sentiment Score')
fig.update_xaxes(title_text='Timestamp')
fig.show()