imports libraries

In [None]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import torch
from deep_translator import GoogleTranslator
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

collects tweets with snscrape's Twitter search scraper, keeping only users whose profile location mentions "Brasil"

In [None]:
# Search string: Portuguese-language tweets mentioning "covid" or "pandemia"
# inside the since/until date window.
query = "covid OR pandemia lang:pt until:2021-05-11 since:2021-03-16"

# Parallel lists: index i of each list describes the same tweet.
dates = []
locations = []
tweets_pt = []

for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    # The profile location is optional free text. Coalesce a missing value to
    # '' so the substring test below cannot raise AttributeError on None.
    location = tweet.user.location or ''
    if 'Brasil' in location:
        # str(datetime) ends with a '+HH:MM' offset; keep only the part before it.
        date = str(tweet.date).split('+')[0]
        dates.append(date)
        locations.append(location)
        tweets_pt.append(tweet.rawContent)
        # '\r' rewrites the same output line to act as a live progress counter.
        print(f'{date} => {len(tweets_pt)}', end='\r')
print(f'\nDONE: {len(tweets_pt)} tweets collected')

preprocesses tweets: masks user mentions as `@user` and links as `http`

In [None]:
# Normalise every collected tweet: user mentions become '@user' and links
# become 'http'; all other whitespace-separated tokens are kept unchanged.
tweets_proc = []
total = len(tweets_pt)
for raw_text in tweets_pt:
    masked_tokens = [
        '@user' if (token.startswith('@') and len(token) > 1)  # a lone '@' is not a mention
        else 'http' if token.startswith('http')
        else token
        for token in raw_text.split()
    ]
    tweets_proc.append(" ".join(masked_tokens))
    # '\r' keeps the progress counter on a single output line.
    print(f'{len(tweets_proc)} / {total}', end='\r')
print(f'\nDONE: {len(tweets_proc)} tweets preprocessed')

In [None]:
# Assemble the preprocessed Portuguese tweets with their metadata
# (one row per tweet; the three lists are index-aligned).
df_pt = pd.DataFrame({
    'date': dates,
    'location': locations,
    'tweet': tweets_proc
})

# Persist the intermediate result so the scrape does not have to be repeated.
df_pt.to_csv('second_wave.csv')

# Jupyter only renders a bare expression when it is the LAST statement of the
# cell, so the preview goes after the save. head() keeps the output short.
df_pt.head()

translates tweets to English

In [7]:
# Translate the preprocessed tweets from Portuguese to English in one batch
# call; the result is used index-aligned with tweets_proc in later cells.
translator = GoogleTranslator(source='pt', target='en')
tweets_en = translator.translate_batch(tweets_proc)
print(f'DONE: {len(tweets_en)} tweets translated')

In [None]:
# Pair every preprocessed Portuguese tweet with its English translation
# (one row per tweet; all four lists are index-aligned).
df_en = pd.DataFrame({
    'date': dates,
    'location': locations,
    'tweet_pt': tweets_proc,
    'tweet_en': tweets_en
})

# Persist the translations so the translation step does not have to be re-run.
df_en.to_csv('df_en.csv')

# Jupyter only renders a bare expression when it is the LAST statement of the
# cell, so the preview goes after the save. head() keeps the output short.
df_en.head()

loads model and tokenizer

In [None]:
# Human-readable class names; position i is used as the name of class index i
# when scores are labelled in later cells.
labels = ['Negative', 'Neutral', 'Positive']

# Hugging Face Hub identifier of the pretrained sentiment checkpoint.
roberta = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Load the classifier and its matching tokenizer from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

analyzes sentiment of tweets

In [None]:
# Score every translated tweet. Each entry of `scores` is a length-3 NumPy
# array of class probabilities, in the order given by `labels`.
scores = []

# Inference only: with autograd disabled no gradient graph is built per tweet,
# which saves memory and time and makes .detach() unnecessary.
with torch.no_grad():
    for tweet in tweets_en:
        # Truncate to 512 tokens (the RoBERTa-base input limit) so an unusually
        # long text cannot crash the model. Shorter inputs are unaffected.
        encoded_tweet = tokenizer(
            tweet,
            return_tensors='pt',
            truncation=True,
            max_length=512,
        )
        output = model(**encoded_tweet)
        # logits holds one row per input sequence; take the single row.
        output_score = output.logits[0].numpy()
        # softmax turns the raw logits into probabilities that sum to 1.
        scores.append(softmax(output_score))
        # '\r' keeps the progress counter on a single output line.
        print(f'{len(scores)} / {len(tweets_en)}', end='\r')
print(f'\nDONE: {len(scores)} tweets scored')

creates a dataframe with the results

In [None]:
# Split each probability vector into per-class columns and record the name of
# the highest-scoring class for every tweet.
negative, neutral, positive, label = [], [], [], []
for probs in scores:
    negative.append(probs[0])
    neutral.append(probs[1])
    positive.append(probs[2])
    label.append(labels[probs.argmax()])

# Final table: tweet metadata, both text versions, class probabilities and
# the predicted label (one row per tweet).
columns = {
    'date': dates,
    'location': locations,
    'tweets_pt': tweets_proc,
    'tweets_en': tweets_en,
    'negative': negative,
    'neutral': neutral,
    'positive': positive,
    'label': label,
}
df = pd.DataFrame(columns)

df.to_csv('database.csv')

calculates the average sentiment of tweets

In [None]:
# Mean probability per sentiment class over all scored tweets, stored in the
# same order as `labels`.
n_scored = len(scores)
if n_scored:
    result = [sum(column) / n_scored for column in (negative, neutral, positive)]
else:
    # No tweets were scored, so the average is undefined. Report NaN instead
    # of failing with ZeroDivisionError.
    result = [float('nan')] * len(labels)

# Print one "<label> <mean probability>" line per class.
for sentiment, mean_prob in zip(labels, result):
    print(sentiment, mean_prob)