In [1]:
import os
import pandas as pd
import numpy as np
import tweepy
from IPython.core.display import display, HTML
from dotenv import load_dotenv
from transformers import AutoTokenizer
from transformers import pipeline
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import torch
# Pick the compute device: the GPU when PyTorch reports CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device}')

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Plot colour used for each sentiment label.
S = {
    'negative': 'red',
    'neutral': 'yellow',
    'positive': 'green',
}

def view(df_):
    """Display ``df_`` as an HTML table with pandas' row, column and width limits lifted.

    Args:
        df_: Object exposing ``to_html()`` (e.g. a ``pandas.DataFrame``).
    """
    unlimited_display = (
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', None,
    )
    # The options are only changed for the duration of the ``with`` block.
    with pd.option_context(*unlimited_display):
        html_table = df_.to_html()
        display(HTML(html_table))


# Twitter credentials are read from the environment; each name is None when its variable is unset.
API_KEY, API_KEY_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET = (
    os.getenv(variable_name)
    for variable_name in (
        'API_KEY',
        'API_KEY_SECRET',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

2022-01-17 20:13:29.287500: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-17 20:13:29.287523: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Using cpu


In [2]:
# Build the tweepy API client: the API key/secret create the auth handler, the access
# token/secret are then attached to it, and the handler is wrapped in `tweepy.API`.
auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
# Debugging aid (disabled): show the account's current rate-limit status as a transposed table.
# view(pd.json_normalize(api.rate_limit_status()).T)

In [3]:
# Sentiment classifier: a RoBERTa checkpoint from the Hugging Face hub, loaded with its
# matching tokenizer.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
clf = pipeline(
    task='sentiment-analysis',
    model=model_name,
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    # Run on the device selected in the setup cell; previously `device` was computed and
    # printed but never handed to the pipeline, so inference always ran on the CPU.
    # Integer form (0 = first GPU, -1 = CPU) is accepted by old and new transformers releases.
    device=0 if device == 'cuda' else -1,
)

### Get data by search

In [30]:
limit = 500
q = "trudeau"

# Iterate the search results for `q`, stopping after at most `limit` items; tqdm shows progress.
search_results = tweepy.Cursor(api.search_tweets, q=q, tweet_mode='extended').items(limit)
data = list(tqdm(search_results))

# Flatten each result into the handful of fields used by the rest of the notebook.
tweets = []
for status in data:
    tweets.append({
        'tweet': status.full_text,
        'date': status.created_at,
        'search_term': q,
        'poster': status.user.screen_name,
    })

500it [00:09, 53.88it/s]


### Get data by screen name

In [24]:
# Alternative data source (disabled): uncomment to collect up to `limit` tweets from a single
# account's timeline instead of running the keyword search. Note that these records carry a
# 'screen_name' key rather than the 'search_term' / 'poster' keys produced by the search cell.
# limit = 10
# screen_name = 'ringostarrmusic'
# data = [t for t in tweepy.Cursor(api.user_timeline,screen_name=screen_name, tweet_mode='extended').items(limit)]
# tweets = [{'tweet':d.full_text,'date':d.created_at,'screen_name':d.user.screen_name}for d in data]

### Inference

In [None]:
# Run the classifier over the text of every collected tweet, in collection order.
tweet_texts = [record['tweet'] for record in tweets]
out = clf(tweet_texts)

In [None]:
# Readable names for the classifier's raw class ids.
LABEL_NAMES = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}

# Put each tweet next to its prediction (joined by position), order chronologically and
# give the prediction column a descriptive name.
df = (
    pd.concat([pd.DataFrame(tweets), pd.DataFrame(out)], axis=1)
    .sort_values('date')
    .rename(columns={'label': 'sentiment'})
)
# `replace` (rather than `map`) leaves labels that are not raw ids untouched instead of
# turning them into NaN, which previously made the styling step below raise a TypeError.
df['sentiment'] = df['sentiment'].replace(LABEL_NAMES)

# Shade the sentiment cell with the shared colour map `S`; anything unrecognised falls back
# to yellow, matching the original neutral/default colour.
df.style.apply(
    lambda row: [f"background:{S.get(value, 'yellow')}" for value in row],
    axis=1,
    subset='sentiment',
)

### View tweet sentiments over time

In [None]:
# Histogram of tweet timestamps, split and coloured by sentiment, with bar counts shown as text.
histogram_options = dict(
    x="date",
    color="sentiment",
    text_auto=True,
    color_discrete_map=S,
    nbins=100,
)
fig = px.histogram(df, **histogram_options)
fig.show()

### View cumulative sentiment over time

In [None]:
## encode sentiment labels for plotting
# One 0/1 indicator column per sentiment present in the data. The dtype is pinned to int so
# the running totals below are plain integer counts regardless of the pandas default.
sentiment_dummies = pd.get_dummies(df['sentiment'], dtype=int)

# Drop indicator / cumulative columns left behind by an earlier run of this cell. Without
# this, re-running the cell duplicates the indicator columns and the cumsum assignment fails.
cumulative_names = {f'{label}_c' for label in S}
stale_columns = [
    col for col in df.columns
    if col in sentiment_dummies.columns or col in cumulative_names
]
df = pd.concat([sentiment_dummies, df.drop(columns=stale_columns)], axis=1)

# Running count of each sentiment; rows are already in chronological order.
# (A named loop variable is used instead of `_`, which IPython reserves for the last output.)
for label in S:
    if label in df:
        df[f'{label}_c'] = df[label].cumsum()

In [None]:
fig = go.Figure()

# Add one line per sentiment, skipping any label that never occurs in the data
# (no cumulative column exists for it).
observed_sentiments = set(df['sentiment'])
for sentiment, colour in S.items():
    if sentiment not in observed_sentiments:
        continue
    cumulative_line = go.Scatter(
        x=df['date'],
        y=df[f'{sentiment}_c'],
        mode='lines',
        name=sentiment,
        line_color=colour,
    )
    fig.add_trace(cumulative_line)

# Centred title, axis/legend captions and a uniform font.
layout_options = dict(
    title=dict(
        text='Cumulative Sentiment',
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title='Time',
    yaxis_title='Count',
    legend_title='Sentiment',
    font=dict(family='Arial', size=12, color='Black'),
)
fig.update_layout(**layout_options)
fig.show()