In [1]:
import pandas as pd
import numpy as np
import fasttext
from tqdm.auto import tqdm
import datetime

# Step 1: Filter top 100 most liked tweets per day

In [2]:
%%time
data = pd.read_csv('../tweets.csv', sep=';', parse_dates=['timestamp'])
# remove blank tweets
data = data[~data['text'].isna()]
print(len(data))
data.head()



18809800
CPU times: user 1min 56s, sys: 1min 10s, total: 3min 7s
Wall time: 3min 50s


Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00:00,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00:00,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co...
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00:00,0.0,2.0,1.0,Another Test tweet that wasn't caught in the s...
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00:00,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00:00,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [3]:
df = data.copy()
df['date'] = df['timestamp'].dt.date
df['date'].nunique()

3005

In [5]:
df = df.sort_values(by=['date', 'likes'], ascending=False)

In [18]:
%%time
max_tweets_by_date = 100
dfs = []
for date, group in tqdm(df.groupby('date')):
    dfs.append(group.head(max_tweets_by_date))
    
dfs = pd.concat(dfs)
print(len(dfs))
# dfs[['id', 'date']].groupby('date').count().to_csv('summary.csv')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3005.0), HTML(value='')))


266635
CPU times: user 10.6 s, sys: 32.3 s, total: 42.9 s
Wall time: 1min 18s


# Step 2: Detect language of each tweet

In [2]:
path_to_pretrained_model = '../lid.176.bin'
fmodel = fasttext.load_model(path_to_pretrained_model)
lang, prob = fmodel.predict('testua bla bla eu')
lang, prob



(('__label__pt',), array([0.55595934]))

In [24]:
dfs['language'] = ''
dfs['probability'] = 0

for index, row in tqdm(dfs.iterrows(), total=len(dfs)):
    lang, prob = fmodel.predict(row['text'].replace('\n',''))
    dfs.loc[index, 'language'] = lang[0]
    dfs.loc[index, 'probability'] = prob[0]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=266635.0), HTML(value='')))




In [25]:
dfs.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,date,language,probability
14404079,32867411,chrispychong,chrispy,,2007-04-19 07:14:38+00:00,0.0,0.0,2.0,is happily mugging at BTC where she will hook ...,2007-04-19,__label__en,0.982041
21,1110302988,halfin,halfin,,2009-01-11 03:33:52+00:00,790.0,14470.0,5542.0,Running bitcoin,2009-01-11,__label__en,0.27637
7563298,1136749815,halfin,halfin,,2009-01-21 17:29:40+00:00,55.0,1544.0,392.0,Looking at ways to add more anonymity to bitcoin,2009-01-21,__label__en,0.763378
7609229,1153096538,halfin,halfin,,2009-01-27 20:14:10+00:00,44.0,1042.0,277.0,Thinking about how to reduce CO2 emissions fro...,2009-01-27,__label__en,0.88
7668664,1158416742,fafcffacfff,GoldLover,,2009-01-29 13:37:53+00:00,0.0,28.0,16.0,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,2009-01-29,__label__en,0.615885


In [26]:
dfs['language'].value_counts()

__label__en     237582
__label__ja      11886
__label__pt       2848
__label__es       2840
__label__tr       2820
                 ...  
__label__ro          1
__label__nn          1
__label__vec         1
__label__ckb         1
__label__ilo         1
Name: language, Length: 61, dtype: int64

In [27]:
dfs[dfs['language'] == '__label__en'].to_csv('../tweets_top100perday_eng.csv', index=False)
df2 = dfs[dfs['language'] == '__label__en']

In [3]:
df2 = pd.read_csv('../tweets_top100perday_eng.csv')
df2

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,date,language,probability
0,3.286741e+07,chrispychong,chrispy,,2007-04-19 07:14:38+00:00,0.0,0.0,2.0,is happily mugging at BTC where she will hook ...,2007-04-19,__label__en,0.982041
1,1.110303e+09,halfin,halfin,,2009-01-11 03:33:52+00:00,790.0,14470.0,5542.0,Running bitcoin,2009-01-11,__label__en,0.276370
2,1.136750e+09,halfin,halfin,,2009-01-21 17:29:40+00:00,55.0,1544.0,392.0,Looking at ways to add more anonymity to bitcoin,2009-01-21,__label__en,0.763378
3,1.153097e+09,halfin,halfin,,2009-01-27 20:14:10+00:00,44.0,1042.0,277.0,Thinking about how to reduce CO2 emissions fro...,2009-01-27,__label__en,0.880000
4,1.158417e+09,fafcffacfff,GoldLover,,2009-01-29 13:37:53+00:00,0.0,28.0,16.0,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,2009-01-29,__label__en,0.615885
...,...,...,...,...,...,...,...,...,...,...,...,...
237577,1.198042e+18,girlgone_crypto,Girl Gone Crypto,,2019-11-23 00:54:38+00:00,30.0,60.0,5.0,Time for another round of #bitcoin limbo! How ...,2019-11-23,__label__en,0.677420
237578,1.198198e+18,nacinorocco,Rocco Nacino,,2019-11-23 11:15:37+00:00,2.0,60.0,2.0,So this happened today. Oathtaking and Officia...,2019-11-23,__label__en,0.836487
237579,1.198080e+18,Mashinsky,Alex Mashinsky,,2019-11-23 03:24:18+00:00,0.0,57.0,13.0,While $BTC is crashing one of its shining star...,2019-11-23,__label__en,0.888772
237580,1.198098e+18,HotepJesus,Hotep Jesus,,2019-11-23 04:35:57+00:00,14.0,57.0,3.0,Blocks can be lifted with a $107 payment in bi...,2019-11-23,__label__en,0.852379


# Step 3: Generate sentiment scores

In [4]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer
from bertModel import BertClassification
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [5]:
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab = "finance-uncased"
vocab_path = 'analyst_tone/vocab'
pretrained_weights_path = "analyst_tone/pretrained_weights" # this is pre-trained FinBERT weights
fine_tuned_weight_path = "analyst_tone/fine_tuned.pth"      # this is fine-tuned FinBERT weights
max_seq_length=512
device='cuda'
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, do_basic_tokenize = True)

In [6]:
model = BertClassification(weight_path= pretrained_weights_path, num_labels=num_labels, vocab=vocab)
model.load_state_dict(torch.load(fine_tuned_weight_path, map_location=torch.device(device)))
model.to(device);
model.eval();

  nn.init.xavier_normal(self.classifier.weight)


In [19]:
for index, row in tqdm(df2.iterrows(), total=len(df2)):
    sent = row['text']
    tokenized_sent = tokenizer.tokenize(sent)
    if len(tokenized_sent) > max_seq_length:
        tokenized_sent = tokenized_sent[:max_seq_length]
    
    ids_review  = tokenizer.convert_tokens_to_ids(tokenized_sent)
    mask_input = [1]*len(ids_review)        
    padding = [0] * (max_seq_length - len(ids_review))
    ids_review += padding
    mask_input += padding
    input_type = [0]*max_seq_length
    
    input_ids = torch.tensor(ids_review).to(device).reshape(-1, max_seq_length)
    attention_mask =  torch.tensor(mask_input).to(device).reshape(-1, max_seq_length)
    token_type_ids = torch.tensor(input_type).to(device).reshape(-1, max_seq_length)
    
    with torch.set_grad_enabled(False):
        outputs = model(input_ids, token_type_ids, attention_mask)
        outputs = F.softmax(outputs,dim=1)
        df2.loc[index, 'sentiment'] = labels[torch.argmax(outputs).item()]
        df2.loc[index, 'neutral'] = outputs[0, 0].item()
        df2.loc[index, 'positive'] = outputs[0, 1].item()
        df2.loc[index, 'negative'] = outputs[0, 2].item()

  0%|          | 0/237582 [00:00<?, ?it/s]

In [20]:
df2.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,date,language,probability,neutral,positive,negative,sentiment
0,32867410.0,chrispychong,chrispy,,2007-04-19 07:14:38+00:00,0.0,0.0,2.0,is happily mugging at BTC where she will hook ...,2007-04-19,__label__en,0.982041,0.99999,8.654533e-06,1.342184e-06,neutral
1,1110303000.0,halfin,halfin,,2009-01-11 03:33:52+00:00,790.0,14470.0,5542.0,Running bitcoin,2009-01-11,__label__en,0.27637,0.999987,1.657014e-06,1.18229e-05,neutral
2,1136750000.0,halfin,halfin,,2009-01-21 17:29:40+00:00,55.0,1544.0,392.0,Looking at ways to add more anonymity to bitcoin,2009-01-21,__label__en,0.763378,0.999997,1.632836e-06,1.13627e-06,neutral
3,1153097000.0,halfin,halfin,,2009-01-27 20:14:10+00:00,44.0,1042.0,277.0,Thinking about how to reduce CO2 emissions fro...,2009-01-27,__label__en,0.88,0.999642,0.0002439857,0.0001142936,neutral
4,1158417000.0,fafcffacfff,GoldLover,,2009-01-29 13:37:53+00:00,0.0,28.0,16.0,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,2009-01-29,__label__en,0.615885,1.0,2.394339e-08,2.423017e-07,neutral


In [21]:
df2.to_csv('../tweets_top100perday_eng_processed.csv', index=False)

# Step 4: Group scores by day

In [24]:
df2['count'] = 1
final = df2[['date', 'count', 'neutral', 'positive', 'negative']].groupby('date').sum().reset_index()
final

Unnamed: 0,date,count,neutral,positive,negative
0,2007-04-19,1,0.999990,8.654533e-06,1.342184e-06
1,2009-01-11,1,0.999987,1.657014e-06,1.182290e-05
2,2009-01-21,1,0.999997,1.632836e-06,1.136270e-06
3,2009-01-27,1,0.999642,2.439857e-04,1.142936e-04
4,2009-01-29,1,1.000000,2.394339e-08,2.423017e-07
...,...,...,...,...,...
2994,2019-11-19,88,75.583519,8.905116e+00,3.511364e+00
2995,2019-11-20,96,83.759390,1.060227e+01,1.638338e+00
2996,2019-11-21,97,86.064436,5.207286e+00,5.728278e+00
2997,2019-11-22,89,79.063211,4.076485e+00,5.860303e+00


In [25]:
final.to_csv('../sentiment_count_top100perday_eng.csv', index=False)