In [3]:

import os
from pyarrow import feather
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import torch
import pandas as pd

In [4]:
# Hugging Face Hub id of the checkpoint used for sentiment scoring.
# (Plain string: there is nothing to interpolate, so no f-prefix is needed.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PyTorch variant of the classifier; tokenizer, config and model all come
# from the same checkpoint id so they stay in sync.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def write_to_feather(df, name):
    """Write ``df`` as a Feather file into the ``twitter_res`` directory.

    The target directory is resolved against the current working directory
    (which is what the previous ``dirname(abspath(<notebook name>))``
    construction evaluated to) and is created if it does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist.
    name : str
        File name (including extension) to create inside ``twitter_res``.
    """
    out_dir = os.path.abspath('twitter_res')
    # Without this the write fails on a fresh checkout where the results
    # directory has not been created yet.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, name)
    feather.write_feather(df, path)

In [6]:
# Load both pre-processed tweet tables (older and newer collection) in one go.
df_twitter_old, df_twitter_new = (
    pd.read_feather("twitter_res/twitter_old.ftr"),
    pd.read_feather("twitter_res/twitter_new.ftr"),
)

In [7]:
# Peek at the first tweet. `.iloc[0]` selects by position, so this keeps
# working even when the frame's index does not contain the label 0.
print(df_twitter_old['tweet'].iloc[0])

 climate change interesting hustle global warming planet stopped warming 15 yes suv boom 


In [9]:
# Score every tweet in the older collection with the sentiment model.
# After softmax each entry is [negative, neutral, positive] -- always in this order.
# NOTE(review): inputs are tokenized without truncation; a text longer than the
# model's maximum sequence length would presumably fail -- confirm whether
# truncation should be enabled.
allScores = []

# Inference only: torch.no_grad() disables autograd tracking, so no graph is
# built per forward pass and the logits can be turned into NumPy directly.
with torch.no_grad():
    for idx, tweet in enumerate(df_twitter_old['tweet']):
        encoded_input = tokenizer(tweet, return_tensors='pt')
        output = model(**encoded_input)
        # One text per call, so take the single row of the logits batch.
        scores = softmax(output.logits[0].numpy())
        # Periodic spot check that the data is modeled correctly.
        if idx % 3000 == 0:
            print(f'{idx}: {scores}')
        allScores.append(scores)

# One score vector per row, assigned positionally in iteration order.
df_twitter_old['sentimentScore'] = allScores
df_twitter_old

0: [0.01343372 0.56145763 0.42510867]
3000: [0.11462115 0.8682781  0.01710073]
6000: [0.03462958 0.88376236 0.08160809]
9000: [0.83620507 0.15550041 0.00829458]
12000: [0.01532613 0.8692461  0.11542763]
15000: [0.02356858 0.7689509  0.20748055]
18000: [0.03332511 0.87329644 0.09337848]
21000: [0.06901434 0.69873685 0.23224889]
24000: [0.5294029  0.38407466 0.08652245]
27000: [0.18921433 0.773444   0.03734165]
30000: [0.0109002  0.9530916  0.03600808]
33000: [0.3270773  0.5920179  0.08090478]


Unnamed: 0,tweet,tweet_ts,hashtags,sentimentScore
0,climate change interesting hustle global warm...,31-10-2016,[],"[0.013433724, 0.56145763, 0.42510867]"
1,"watch #beforetheflood right here, travels wo...",31-10-2016,[#beforetheflood],"[0.019552002, 0.795175, 0.18527298]"
2,fabulous! leonardo #dicaprio's film #climate c...,31-10-2016,"[#dicaprio, #climate]","[0.0022631218, 0.01210344, 0.98563343]"
3,watched amazing documentary leonardodicaprio ...,31-10-2016,[],"[0.0034224496, 0.031758707, 0.9648187]"
4,"pranita biswasi, lutheran odisha, gives testi...",31-10-2016,[],"[0.021988245, 0.9171695, 0.06084218]"
...,...,...,...,...
43934,#awareness walls aren$q$t answer people fleein...,26-10-2016,[#awareness],"[0.62899506, 0.3573929, 0.013611956]"
43935,americans scared clowns climate change.,26-10-2016,[],"[0.7736483, 0.20565642, 0.020695252]"
43939,respective parties prevent climate change glob...,26-10-2016,[#zpndebate],"[0.1894237, 0.77215046, 0.038425878]"
43941,still can$q$t believe gif taehyung saved huma...,26-10-2016,[],"[0.028441388, 0.19685273, 0.77470595]"


In [10]:
# Persist the scored older-collection tweets as a Feather file.
write_to_feather(df=df_twitter_old, name='twitter_old_analysis.ftr')

In [11]:
# Optional sanity check (disabled): reload the analysis file saved above to
# confirm it reads back. Uncomment both lines to run it.
#df_twitter_test = pd.read_feather("twitter_res/twitter_old_analysis.ftr");
#df_twitter_test

In [12]:
# Negative-class probability of the first tweet (scores are stored as
# [negative, neutral, positive]). `.iloc[0]` picks the first row by position
# rather than relying on an index label 0 being present.
df_twitter_old['sentimentScore'].iloc[0][0]

0.013433724

In [None]:
# Score every tweet in the newer collection with the sentiment model.
# After softmax each entry is [negative, neutral, positive].
# NOTE(review): inputs are tokenized without truncation; a text longer than the
# model's maximum sequence length would presumably fail -- confirm whether
# truncation should be enabled.
allScores = []

# Inference only: torch.no_grad() disables autograd tracking, so no graph is
# built per forward pass and the logits can be turned into NumPy directly.
with torch.no_grad():
    for idx, tweet in enumerate(df_twitter_new['tweet']):
        encoded_input = tokenizer(tweet, return_tensors='pt')
        output = model(**encoded_input)
        # One text per call, so take the single row of the logits batch.
        scores = softmax(output.logits[0].numpy())
        # Periodic progress / sanity output.
        if idx % 10000 == 0:
            print(f'{idx}: {scores}')
        allScores.append(scores)

# One score vector per row, assigned positionally in iteration order.
df_twitter_new['sentimentScore'] = allScores
df_twitter_new

0: [0.7790241  0.21138299 0.00959283]
10000: [0.00229184 0.16949917 0.82820904]


In [15]:
# Persist the scored newer-collection tweets as a Feather file.
write_to_feather(df=df_twitter_new, name='twitter_new_analysis.ftr')