In [7]:
import numpy as np
import pandas as pd
from scipy.special import softmax
import csv
import urllib.request
import ssl
import certifi

In [10]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
# TweetEval tasks that can be substituted into the checkpoint name below:
#   emoji, emotion, hate, irony, offensive, sentiment,
#   stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# One-time bootstrap, kept disabled: fetch the checkpoint by name and store a
# copy under ../models/ so that the loading code below can run from local files.
#tokenizer = AutoTokenizer.from_pretrained(MODEL)
#model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(save_directory='../models/' + MODEL)
#tokenizer.save_pretrained(save_directory='../models/' + MODEL)

# Both the tokenizer and the model are read from the same saved directory;
# local_files_only=True restricts loading to files already on disk.
local_checkpoint = '../models/' + MODEL
tokenizer = AutoTokenizer.from_pretrained(local_checkpoint, local_files_only=True)
model = TFAutoModelForSequenceClassification.from_pretrained(local_checkpoint, local_files_only=True)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ../models/cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [11]:
# Download the label mapping for the chosen task: a tab-separated text file
# from which the second column of every row is kept as the label name.
task = 'sentiment'
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

# Validate the HTTPS connection against the CA bundle shipped with certifi.
tls_context = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen(mapping_link, context=tls_context) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

labels = []
for row in csv.reader(mapping_lines, delimiter='\t'):
    # Rows with fewer than two fields (e.g. the empty trailing line) are skipped.
    if len(row) > 1:
        labels.append(row[1])

In [12]:
# # TF Examples
# Sample inputs for trying out the classifier. The three commented-out lines
# below are alternative example texts that are not assigned to any variable.
#Yeah exactly, dxy is pumping and every thing is still green.. SPX and DJI and BTC.. struggling yeah, but not dumping though, given that bulls and bears are calling for the 38k.. 💁🏻‍♂️ So where do you see us go from here? In the next weeks and until the end of year?
#"I think bitcoin is really on the verge of getting broad acceptance by conventional finance people"
#"WHY CHINA’S RENEWED HARSH CRACKDOWN ON CRYPTO ISN’T STIFLING BITCOIN — AT LEAST NOT YET #bitcoin #btc"
# Example that is passed to predict() at the end of this cell.
text1 = "If Bitcoin was able to capture a $1T market cap, it's not even impossible at this point. It might take time. But there is a chance."
# Second example; it is defined here but not passed to predict() in this cell.
text = "If SEC approves the one of the ETF’s , this will allow people and instutitional investors to buy bitcoin easier."
def predict(text):
    """Print every label for ``text``, ordered from highest to lowest score.

    The text is tokenized, passed through the model, and the first row of the
    model's first output is turned into probabilities with a softmax. One line
    is printed per label in the form ``"<rank>) <label> <probability>"``, with
    the probability rounded to four decimals. Nothing is returned.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``labels``.
    """
    raw_scores = model(tokenizer(text, return_tensors='tf'))[0][0].numpy()
    probabilities = softmax(raw_scores)
    # argsort is ascending, so reverse it to walk from the best label down.
    descending = np.argsort(probabilities)[::-1]
    for position, label_idx in enumerate(descending, start=1):
        label = labels[label_idx]
        probability = probabilities[label_idx]
        print(f"{position}) {label} {np.round(float(probability), 4)}")


predict(text1)

1) positive 0.6832
2) neutral 0.3055
3) negative 0.0113


In [13]:
# Read the preprocessed tweet sample whose 'text' column is classified further
# down, then show (rows, columns) as a quick sanity check.
data = pd.read_csv(filepath_or_buffer='../data/interim/sample_bitcoin_tweets_ten_each_day.csv')
data.shape

(310, 15)

In [15]:
# Top-1 prediction helper used to label every row of the dataset.
def predict(text, max_length=512):
    """Classify one text and return its most probable label and probability.

    Note: this redefines the ``predict`` helper from the earlier example cell,
    which printed the full ranking and returned nothing.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``labels``.

    Parameters
    ----------
    text : str
        Text to classify.
    max_length : int, optional
        Upper bound on the number of tokens fed to the model; longer inputs
        are truncated instead of failing inside the model. The default of 512
        assumes the checkpoint keeps the usual RoBERTa-base input limit —
        TODO confirm against the saved model config.

    Returns
    -------
    pandas.Series
        Two positional entries: the entry of ``labels`` with the highest
        softmax probability, and that probability rounded to four decimals.
    """
    encoded_input = tokenizer(
        text,
        return_tensors='tf',
        truncation=True,
        max_length=max_length,
    )
    output = model(encoded_input)
    scores = softmax(output[0][0].numpy())
    # Only the winner is needed, so take the argmax instead of sorting all scores.
    best = int(np.argmax(scores))
    confidence = np.round(float(scores[best]), 4)
    return pd.Series([labels[best], confidence])

# Classify every tweet: each Series returned by predict() becomes one row of
# the two new columns.
data[['prediction', 'confidence']] = data['text'].apply(predict)

In [16]:
# Preview the first rows, which now carry the 'prediction' and 'confidence' columns.
data.head()

Unnamed: 0,dateTime,Unnamed: 1,id,date,text,hashtags,replyCount,retweetCount,likeCount,userName,userFollowerCount,userFavCount,userFriendCount,dateTime.1,textLength,prediction,confidence
0,1,877320,1.432983e+18,2021-09-01 08:24:57+00:00,"keep putting money into your pension, everythi...","['Bitcoin', 'FoxBusiness']",0.0,0.0,1.0,Alberto62316949,55.0,819.0,225.0,2021-09-01 08:24:57+00:00,161,neutral,0.4586
1,1,1620664,1.421888e+18,2021-08-01 17:37:21+00:00,the divine line “bitcoin/cryptocurrency is for...,"['crypto', 'btc', 'bitcoin', 'bnb', 'bsc', 'cr...",0.0,0.0,1.0,Bbcryptonews,563.0,422.0,1357.0,2021-08-01 17:37:21+00:00,149,neutral,0.5688
2,1,863024,1.433146e+18,2021-09-01 19:15:14+00:00,shop &amp;,"['linkedin', 'twitter', 'facebook', 'instagram...",0.0,1.0,0.0,bmurphypointman,71127.0,24.0,12210.0,2021-09-01 19:15:14+00:00,10,neutral,0.7449
3,1,872346,1.433054e+18,2021-09-01 13:08:46+00:00,gm fam. breaking $50k today. 👊 send it,['bitcoin'],12.0,6.0,127.0,kailybuemi,5966.0,9010.0,1128.0,2021-09-01 13:08:46+00:00,38,positive,0.6983
4,1,877074,1.432987e+18,2021-09-01 08:42:12+00:00,delegates from el salvador's finance commissio...,"['Bitcoin', 'ElSalvador']",0.0,0.0,2.0,CryptoManiacs10,521.0,1045.0,108.0,2021-09-01 08:42:12+00:00,219,positive,0.7327


In [17]:
# Persist only the tweet text together with its predicted label and confidence.
# The DataFrame index is written as well (pandas default).
(
    data
    .loc[:, ['text', 'prediction', 'confidence']]
    .to_csv(path_or_buf='../data/interim/sample_bitcoin_tweets_ten_each_day_prediction.csv')
)