In [1]:
import pandas as pd
import numpy as np
import fasttext
from tqdm.auto import tqdm
import datetime

# Step 1: Filter top 100 most liked tweets per day

In [2]:
%%time
data = pd.read_csv('../tweets.csv', sep=';', parse_dates=['timestamp'])
# remove blank tweets
data = data[~data['text'].isna()]
print(len(data))
data.head()



18809800
CPU times: user 1min 56s, sys: 1min 10s, total: 3min 7s
Wall time: 3min 50s


Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00:00,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00:00,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co...
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00:00,0.0,2.0,1.0,Another Test tweet that wasn't caught in the s...
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00:00,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00:00,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [3]:
# Work on a copy of `data` that carries an extra 'date' column holding the
# calendar day of each tweet's timestamp.
df = data.assign(date=data['timestamp'].dt.date)
# Number of distinct days present in the data.
df['date'].nunique()

3005

In [5]:
# Order rows by day and then by like count, both descending, so that within
# each day the most liked tweets come first (the per-day head() below relies on this).
df = df.sort_values(by=['date', 'likes'], ascending=[False, False])

In [18]:
%%time
max_tweets_by_date = 100
dfs = []
for date, group in tqdm(df.groupby('date')):
    dfs.append(group.head(max_tweets_by_date))
    
dfs = pd.concat(dfs)
print(len(dfs))
# dfs[['id', 'date']].groupby('date').count().to_csv('summary.csv')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3005.0), HTML(value='')))


266635
CPU times: user 10.6 s, sys: 32.3 s, total: 42.9 s
Wall time: 1min 18s


# Step 2: Detect language of each tweet

In [22]:
# Load the fastText model stored at `path_to_pretrained_model` and run one
# sample prediction as a smoke test; predict() returns (labels, probabilities).
path_to_pretrained_model = '../lid.176.bin'
fmodel = fasttext.load_model(path_to_pretrained_model)
sample_prediction = fmodel.predict('testua bla bla eu')
lang, prob = sample_prediction
sample_prediction



(('__label__pt',), array([0.55595934]))

In [24]:
# Detect the language of every selected tweet with the fastText model and
# store the top label and its probability in 'language' / 'probability'.
#
# fastText's predict() rejects text containing newlines. They are replaced by
# a space (rather than removed) so the words on either side of a line break
# are not glued together before detection.
tweet_texts = [text.replace('\n', ' ') for text in dfs['text']]

# A single batched predict() call replaces the per-row Python loop and the
# row-by-row dfs.loc[...] writes; it returns one entry per input text.
predicted_labels, predicted_probs = fmodel.predict(tweet_texts)

# Whole-column assignment gives 'probability' a float dtype from the start
# instead of filling an integer column with floats. An entry for which no
# label is returned falls back to '' / 0.0 instead of raising IndexError.
dfs['language'] = [
    top_labels[0] if len(top_labels) else ''
    for top_labels in predicted_labels
]
dfs['probability'] = [
    float(top_probs[0]) if len(top_probs) else 0.0
    for top_probs in predicted_probs
]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=266635.0), HTML(value='')))




In [25]:
# Preview the first rows, now including the 'language' and 'probability' columns.
dfs.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,date,language,probability
14404079,32867411,chrispychong,chrispy,,2007-04-19 07:14:38+00:00,0.0,0.0,2.0,is happily mugging at BTC where she will hook ...,2007-04-19,__label__en,0.982041
21,1110302988,halfin,halfin,,2009-01-11 03:33:52+00:00,790.0,14470.0,5542.0,Running bitcoin,2009-01-11,__label__en,0.27637
7563298,1136749815,halfin,halfin,,2009-01-21 17:29:40+00:00,55.0,1544.0,392.0,Looking at ways to add more anonymity to bitcoin,2009-01-21,__label__en,0.763378
7609229,1153096538,halfin,halfin,,2009-01-27 20:14:10+00:00,44.0,1042.0,277.0,Thinking about how to reduce CO2 emissions fro...,2009-01-27,__label__en,0.88
7668664,1158416742,fafcffacfff,GoldLover,,2009-01-29 13:37:53+00:00,0.0,28.0,16.0,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,2009-01-29,__label__en,0.615885


In [26]:
# Count how many of the selected tweets were assigned each language label.
dfs['language'].value_counts()

__label__en     237582
__label__ja      11886
__label__pt       2848
__label__es       2840
__label__tr       2820
                 ...  
__label__ro          1
__label__nn          1
__label__vec         1
__label__ckb         1
__label__ilo         1
Name: language, Length: 61, dtype: int64

In [27]:
# Keep only the tweets labelled as English. The filter is evaluated once, and
# the explicit .copy() makes df2 an independent frame, so columns can later be
# added to it without pandas' SettingWithCopyWarning.
df2 = dfs[dfs['language'] == '__label__en'].copy()
# Persist the English subset (index column omitted).
df2.to_csv('../tweets_top100perday_eng.csv', index=False)

# Step 3: Generate sentiment scores

In [None]:
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import BertTokenizer
from bertModel import BertClassification
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [None]:
# Maps the index of the highest-scoring classifier output to a sentiment name.
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab = "finance-uncased"
vocab_path = 'analyst_tone/vocab'
pretrained_weights_path = "analyst_tone/pretrained_weights" # this is pre-trained FinBERT weights
fine_tuned_weight_path = "analyst_tone/fine_tuned.pth"      # this is fine-tuned FinBERT weights
# Every tweet is truncated / padded to this many tokens before inference.
max_seq_length=512
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, do_basic_tokenize = True)

In [None]:
# Build the classifier from the pre-trained weights, then load the fine-tuned
# checkpoint into it (tensors are mapped onto `device` while loading).
model = BertClassification(
    weight_path=pretrained_weights_path,
    num_labels=num_labels,
    vocab=vocab,
)
model.load_state_dict(
    torch.load(fine_tuned_weight_path, map_location=torch.device(device))
)
model.to(device)
# The model is only used for inference below; the trailing ';' hides the cell output.
model.eval();

In [None]:
# Classify every tweet in df2 and store the predicted label in 'sentiment'.
#
# Labels are collected in a list and attached in one step at the end: writing
# df2.loc[index, 'sentiment'] row by row is slow and, because df2 is a filtered
# selection of another frame, can raise pandas' SettingWithCopyWarning.

# The token type ids are all zeros for every tweet, so the tensor is built once.
token_type_ids = torch.tensor([[0] * max_seq_length], device=device)

sentiments = []
# Inference only: no gradients are needed.
with torch.no_grad():
    for sent in tqdm(df2['text'], total=len(df2)):
        # Tokenize and truncate to the fixed sequence length.
        tokenized_sent = tokenizer.tokenize(sent)[:max_seq_length]
        ids_review = tokenizer.convert_tokens_to_ids(tokenized_sent)

        # Right-pad with zeros up to max_seq_length; the attention mask is 1
        # on real tokens and 0 on padding.
        n_padding = max_seq_length - len(ids_review)
        mask_input = [1] * len(ids_review) + [0] * n_padding
        ids_review = ids_review + [0] * n_padding

        # Shape (1, max_seq_length): a batch holding a single tweet.
        input_ids = torch.tensor([ids_review], device=device)
        attention_mask = torch.tensor([mask_input], device=device)

        # TODO save raw scores instead of labels
        outputs = model(input_ids, token_type_ids, attention_mask)
        outputs = F.softmax(outputs, dim=1)
        sentiments.append(labels[torch.argmax(outputs).item()])

# assign() returns a new frame, so this works whether or not df2 is a copy.
df2 = df2.assign(sentiment=sentiments)

In [None]:
# Preview the first rows of df2 after the sentiment pass.
df2.head()

In [None]:
# Write df2 to CSV without the index column.
# NOTE(review): the file name says '20likes', while the earlier export of this
# selection is named 'tweets_top100perday_eng.csv' — confirm the intended name.
df2.to_csv('../tweets_20likes_eng_processed.csv', index=False)

# Step 4: 