<a href="https://colab.research.google.com/github/tmdang1101/twitter_sentiment_analysis/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries #

In [None]:
# Install transformers module to use transformer models from HuggingFace
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.0 tokenizers-0.13.2 transformers-4.26.1


In [None]:
# Attach Google Drive to the Colab runtime, then switch the working directory
# to the project folder on the drive.
import os
from google.colab import drive

pathname = "/content/drive/My Drive/Twitter Sentiment Analysis/"

drive.mount('/content/drive', force_remount=True)
os.chdir(pathname)

Mounted at /content/drive


In [None]:
# Import libraries
import tweepy
from tweepy import API 
from tweepy import OAuthHandler
 
import twitter_credentials

import numpy as np
import pandas as pd
import re

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# Tweepy and Data Pre-Processing #

In [None]:
# Twitter Authenticator #
class TwitterAuthenticator():
    """Builds the OAuth handler used to sign Twitter API requests."""

    def authenticate_twitter_app(self):
        """Return an ``OAuthHandler`` populated with the keys from ``twitter_credentials``."""
        creds = twitter_credentials
        handler = OAuthHandler(creds.CONSUMER_KEY, creds.CONSUMER_SECRET)
        handler.set_access_token(creds.ACCESS_TOKEN, creds.ACCESS_TOKEN_SECRET)
        return handler
        
# Twitter Client #
class TwitterClient():
    """Holds an authenticated tweepy ``API`` object, optionally tagged with a user."""

    def __init__(self, twitter_user=None):
        """Authenticate and create the API client.

        twitter_user: optional handle stored on the instance; nothing in this
            class reads it back.
        """
        authenticator = TwitterAuthenticator()
        self.auth = authenticator.authenticate_twitter_app()
        self.twitter_client = API(self.auth)
        self.twitter_user = twitter_user

    def get_twitter_client_api(self):
        """Expose the underlying API object created in ``__init__``."""
        return self.twitter_client

In [None]:
# Tweet Analyzer #
class TweetAnalyzer():
    """Cleans tweets, tabulates them, and labels each one with a pretrained classifier."""

    # HuggingFace checkpoint used by transformer_model.
    MODEL_NAME = 'm-newhauser/distilbert-political-tweets'

    # Class-level cache: the checkpoint is loaded once and shared by every
    # instance and every call, instead of being re-loaded for each tweet.
    _model = None
    _tokenizer = None

    @classmethod
    def _load_model(cls):
        """Return ``(model, tokenizer)``, loading each from the hub on first use only."""
        if cls._model is None:
            cls._model = AutoModelForSequenceClassification.from_pretrained(cls.MODEL_NAME)
        if cls._tokenizer is None:
            cls._tokenizer = AutoTokenizer.from_pretrained(cls.MODEL_NAME)
        return cls._model, cls._tokenizer

    def clean_tweet(self, tweet):
        """Remove @mentions, links and non-alphanumeric characters, collapsing whitespace.

        Returns the remaining words joined by single spaces.
        """
        # Raw string so \w and \S reach the regex engine without invalid-escape warnings.
        pattern = r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)'
        return ' '.join(re.sub(pattern, ' ', tweet).split())

    def tweets_to_data_frame(self, tweets):
        """Build a DataFrame with one row per tweet.

        tweets: iterable of objects exposing ``full_text``, ``created_at``,
            ``favorite_count`` and ``retweet_count``.

        Returns a frame with columns ``tweets``, ``len``, ``date``, ``likes``
        and ``retweets``.
        """
        # Materialize once: the input is traversed several times below, which
        # would silently exhaust a generator or other one-shot iterable.
        tweets = list(tweets)

        df = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=['tweets'])

        df['len'] = np.array([len(tweet.full_text) for tweet in tweets])
        df['date'] = np.array([tweet.created_at for tweet in tweets])
        df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
        df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])

        return df

    def transformer_model(self, tweet):
        """Label a single tweet as 1, -1 or 0.

        Mentions are masked as ``@user`` and links as ``http`` before the text
        is tokenized and classified.

        Returns 1 when the classifier's highest-scoring class is index 0,
        -1 for any other class, and 0 (without running the model) when the
        first word of the tweet is a link.
        """
        # preprocess tweet
        tweet_words = []
        for word in tweet.split(' '):
            if word.startswith('@') and len(word) > 1:
                word = '@user'
            elif word.startswith('http'):
                # A tweet that opens with a link is given the neutral label.
                if not tweet_words:
                    return 0
                word = 'http'
            tweet_words.append(word)

        processed_tweet = ' '.join(tweet_words)

        # cached model and tokenizer
        model, tokenizer = self._load_model()

        # truncation=True keeps over-long inputs within the model's maximum length
        encoded_tweet = tokenizer(processed_tweet, return_tensors='pt', truncation=True)
        output = model(**encoded_tweet)

        scores = softmax(output[0][0].detach().numpy())

        # np.argmax picks the first maximum, matching a strict '<' scan from index 0.
        return 1 if int(np.argmax(scores)) == 0 else -1

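# Political Orientation Score #

The score is the mean of the per-tweet labels (+1 or −1 from the classifier, 0 for tweets that begin with a link) multiplied by 10, so it ranges from −10 to +10.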

In [None]:
def classify_user(user_handle):
    """Score a Twitter account on the political orientation scale.

    Requests up to 200 tweets from the user's timeline (retweets excluded,
    extended text), prints a preview of the resulting table, labels every
    tweet with ``TweetAnalyzer.transformer_model`` and returns the mean label
    multiplied by 10, rounded to two decimals.
    """
    client = TwitterClient()
    analyzer = TweetAnalyzer()
    api = client.get_twitter_client_api()

    timeline = api.user_timeline(
        screen_name=user_handle,
        count=200,
        include_rts=False,
        tweet_mode='extended',
    )
    df = analyzer.tweets_to_data_frame(timeline)

    # Preview the first rows without wrapping the frame across lines
    with pd.option_context('expand_frame_repr', False):
        print(df.head())

    labels = list(map(analyzer.transformer_model, df['tweets']))
    df['sentiment'] = np.array(labels)

    mean_label = df['sentiment'].mean()
    return round(mean_label * 10, 2)

In [None]:
# Ask for a handle interactively, score that account with classify_user, and report the result.
user_handle = input("Give me a Twitter user handle: ")
score = classify_user(user_handle)
print(f"\nI predict this user to have a score of {score} on the political orientation scale.")

Give me a Twitter user handle: 
@forsberg370
                                              tweets  len                date  likes  retweets
0  Learn with me on Duolingo! I’m moving up the l...  180 2023-01-16 15:18:17      0         0
1  Learn a language with me for free! Duolingo is...  142 2023-01-11 16:42:29      0         0
2  Look how much I learned on Duolingo in 2022! H...   97 2022-12-07 03:16:26      0         0
3  Look how much I learned on Duolingo in 2022! H...   97 2022-12-07 03:16:15      0         0
4  Learn with me on Duolingo! I’m moving up the l...  180 2022-12-05 03:00:53      1         0

I predict this user to have a score of 1.33 on the political orientation scale.
