In [1]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer
from bertModel import BertClassification
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [7]:
# Class index -> tone name; used to decode the classifier's argmax into a label.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
num_labels = len(labels)
vocab = "finance-uncased"
vocab_path = 'analyst_tone/vocab'
pretrained_weights_path = "analyst_tone/pretrained_weights"  # this is pre-trained FinBERT weights
fine_tuned_weight_path = "analyst_tone/fine_tuned.pth"       # this is fine-tuned FinBERT weights
# Every input is truncated / zero-padded to exactly this many tokens.
max_seq_length = 512
# Prefer the GPU, but fall back to CPU so the notebook still runs on a machine
# without CUDA instead of failing when the model is moved to the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=True, do_basic_tokenize=True)

In [9]:
# Build the classifier on top of the pre-trained weights, then overwrite its
# parameters with the fine-tuned checkpoint.
model = BertClassification(
    weight_path=pretrained_weights_path,
    num_labels=num_labels,
    vocab=vocab,
)
# NOTE(review): torch.load unpickles the checkpoint file -- only point this at
# checkpoints from a trusted source.
model.load_state_dict(
    torch.load(fine_tuned_weight_path, map_location=torch.device(device))
)
# Move to the target device and switch to evaluation mode; the trailing ';'
# keeps the notebook from echoing the module repr.
model.to(device).eval();

In [18]:
# Load the raw tweets, add an empty 'sentiment' column to be filled by the
# classifier, and keep only the rows that actually have text.
df = pd.read_csv('../tweets_influencers.csv')
df['sentiment'] = ''
# .copy() makes the filtered frame an independent object, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['text'].notna()].copy()
print(len(df))
df.head()

17263


Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,sentiment
0,1.220894e+16,gavinandresen,Gavin Andresen,,2010-12-07 18:16:53+00:00,0.0,0.0,4.0,Launched my second #bitcoin project today: htt...,
1,7.123706e+16,gavinandresen,Gavin Andresen,,2011-05-19 13:33:33+00:00,0.0,5.0,6.0,"Yesterday it was #bitcoin in Slate, today Gizm...",
2,2.035533e+17,ErikVoorhees,Erik Voorhees,,2012-05-18 16:30:57+00:00,0.0,5.0,9.0,@zerohedge Bitcoin wasn't hacked (and has neve...,
3,2.078903e+17,rogerkver,Roger Ver,,2012-05-30 17:44:50+00:00,4.0,3.0,3.0,Any suggestions for which bitcoin logo to put ...,
4,2.224597e+17,ErikVoorhees,Erik Voorhees,,2012-07-09 22:38:14+00:00,76.0,1382.0,362.0,#Bitcoin just broke $7.00... now up 40% in the...,


In [19]:
def predict_sentiment(text):
    """Return the tone label the model assigns to a single text.

    The text is tokenized, truncated to ``max_seq_length`` tokens, mapped to
    vocabulary ids and right-padded with zeros to exactly ``max_seq_length``.
    The attention mask is 1 on real tokens and 0 on padding; all segment ids
    are 0.

    NOTE(review): no [CLS]/[SEP] tokens are added here -- confirm this matches
    how the fine-tuned checkpoint was trained.

    Args:
        text: Raw string to classify.

    Returns:
        The value in ``labels`` whose key is the argmax of the model output.
    """
    tokens = tokenizer.tokenize(text)[:max_seq_length]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad = [0] * (max_seq_length - len(token_ids))

    # The outer list adds a batch dimension of 1: one text per forward pass.
    input_ids = torch.tensor([token_ids + pad], device=device)
    attention_mask = torch.tensor([[1] * len(token_ids) + pad], device=device)
    token_type_ids = torch.zeros_like(input_ids)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids, attention_mask)
        outputs = F.softmax(outputs, dim=1)
    return labels[torch.argmax(outputs).item()]


# Collect all predictions first and write the column in one assignment,
# rather than mutating `df` row by row while iterating over it.
df['sentiment'] = [predict_sentiment(text) for text in tqdm(df['text'], total=len(df))]

  0%|          | 0/17263 [00:00<?, ?it/s]

In [20]:
# Spot-check the first 100 rows now that 'sentiment' has been filled in.
df.head(n=100)

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,sentiment
0,1.220894e+16,gavinandresen,Gavin Andresen,,2010-12-07 18:16:53+00:00,0.0,0.0,4.0,Launched my second #bitcoin project today: htt...,neutral
1,7.123706e+16,gavinandresen,Gavin Andresen,,2011-05-19 13:33:33+00:00,0.0,5.0,6.0,"Yesterday it was #bitcoin in Slate, today Gizm...",neutral
2,2.035533e+17,ErikVoorhees,Erik Voorhees,,2012-05-18 16:30:57+00:00,0.0,5.0,9.0,@zerohedge Bitcoin wasn't hacked (and has neve...,neutral
3,2.078903e+17,rogerkver,Roger Ver,,2012-05-30 17:44:50+00:00,4.0,3.0,3.0,Any suggestions for which bitcoin logo to put ...,neutral
4,2.224597e+17,ErikVoorhees,Erik Voorhees,,2012-07-09 22:38:14+00:00,76.0,1382.0,362.0,#Bitcoin just broke $7.00... now up 40% in the...,positive
...,...,...,...,...,...,...,...,...,...,...
95,5.646002e+17,gavinandresen,Gavin Andresen,,2015-02-09 01:42:37+00:00,7.0,22.0,45.0,Robin Hanson on #bitcoin ; my only quibble is ...,neutral
96,5.655334e+17,ToneVays,Tone Vays [RIP Tyler Jenks],,2015-02-11 15:30:55+00:00,2.0,2.0,1.0,Stuck at $223 (fib) again but #bitcoin is coil...,neutral
97,5.676711e+17,notsofast,notsofast,,2015-02-17 13:05:05+00:00,0.0,2.0,1.0,"""Smart Contracts..."" http://t.co/qjtctVLmBT vi...",neutral
98,5.731372e+17,ChrisDunnTV,Chris Dunn,,2015-03-04 15:05:21+00:00,10.0,24.0,17.0,"Nice bull trend in #bitcoin, watching $300's h...",neutral


In [21]:
# Persist the per-tweet labels; the row index is not written to the file.
df.to_csv(path_or_buf='../tweets_influencers_processed.csv', index=False)

In [39]:
# One 0/1 indicator column per tone, named after the tone itself, so the
# columns can be summed per day later on.
for tone in ('neutral', 'positive', 'negative'):
    df[tone] = df['sentiment'].eq(tone).astype(int)

In [40]:
# Overall class balance of the predicted tones.
df.loc[:, 'sentiment'].value_counts()

neutral     14868
positive     1672
negative      723
Name: sentiment, dtype: int64

In [41]:
# Parse the timestamp strings and keep only the calendar date, which is the
# key used for the daily aggregation.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['date'] = parsed_timestamps.dt.date

In [45]:
# Daily tweet counts per tone: sum the 0/1 indicator columns within each date
# and turn the date index back into an ordinary column.
tone_columns = ['neutral', 'positive', 'negative']
final = (
    df[['date'] + tone_columns]
    .groupby('date')
    .sum()
    .reset_index()
)
final

Unnamed: 0,date,neutral,positive,negative
0,2010-12-07,1,0,0
1,2011-05-19,1,0,0
2,2012-05-18,1,0,0
3,2012-05-30,1,0,0
4,2012-07-09,0,1,0
...,...,...,...,...
838,2019-11-19,45,3,2
839,2019-11-20,54,11,0
840,2019-11-21,70,6,6
841,2019-11-22,63,8,7


In [46]:
# Persist the daily per-tone counts; the row index is not written to the file.
final.to_csv(path_or_buf='../sentiment_count_influencers.csv', index=False)