In [4]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

In [5]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

In [6]:
sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]
results = nlp(sentences)
print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

[{'label': 'Negative', 'score': 0.9966173768043518}, {'label': 'Positive', 'score': 1.0}, {'label': 'Negative', 'score': 0.9999710321426392}, {'label': 'Neutral', 'score': 0.9889441728591919}]


In [7]:
sampled_df = pd.read_csv('../data/cleaned/sampled.csv', index_col=0)

In [8]:
sentiments = []
for message in tqdm(sampled_df.body):
    sentiments.append(nlp(message))

  0%|          | 0/36500 [00:00<?, ?it/s]

In [10]:
sentiments_ = [ 0 if e[0]['label'] == 'Neutral' else e[0]['score'] if e[0]['label'] == 'Positive' else -1 * e[0]['score'] for e in sentiments]
sampled_df['sentiment'] = sentiments_

In [16]:
sampled_df['d'] = pd.to_datetime(pd.to_datetime(sampled_df['Date']).dt.date)

In [21]:
avg_sentiment = sampled_df.groupby('d').agg({'sentiment': 'mean'}).reset_index()

In [23]:
avg_sentiment

Unnamed: 0,d,sentiment
0,2019-01-01,0.104639
1,2019-01-02,-0.092262
2,2019-01-03,-0.034344
3,2019-01-04,0.200733
4,2019-01-05,0.029053
...,...,...
360,2019-12-27,0.081534
361,2019-12-28,0.029389
362,2019-12-29,0.088099
363,2019-12-30,0.147955


In [24]:
df = pd.read_csv("../data/cleaned/full.csv", index_col = 0)

In [25]:
df['d'] = pd.to_datetime(pd.to_datetime(df['Date_']).dt.date)

In [26]:
new_df = df.merge(avg_sentiment[['d', 'sentiment']], on='d', how='left')

In [29]:
new_df.drop(columns = ['sentiment_x'], inplace=True)

In [34]:
new_df.rename({'sentiment_y': 'sentiment'}, inplace=True)

In [35]:
new_df = new_df.drop_duplicates(subset='Date_', keep='first')

In [36]:
new_df

Unnamed: 0,Date_,s_score,a_score,Open,High,Low,Close,Adj Close,Volume,d_score,sentiment_y
0,2019-01-02,0.110075,-0.023881,2476.959961,2519.489990,2467.469971,2510.030029,2510.030029,3733160000,0.000000,-0.092262
100,2019-01-03,0.336328,-0.085938,2491.919922,2493.139893,2443.959961,2447.889893,2447.889893,3858830000,-62.140137,-0.034344
200,2019-01-04,0.516071,-0.126786,2474.330078,2538.070068,2474.330078,2531.939941,2531.939941,4234140000,84.050049,0.200733
300,2019-01-07,0.485776,-0.101724,2535.610107,2566.159912,2524.560059,2549.689941,2549.689941,4133120000,17.750000,0.019103
400,2019-01-08,0.398810,-0.068254,2568.110107,2579.820068,2547.560059,2574.409912,2574.409912,4120060000,24.719971,0.060482
...,...,...,...,...,...,...,...,...,...,...,...
24700,2019-12-24,0.575968,-0.114516,3225.449951,3226.429932,3220.510010,3223.379883,3223.379883,1296530000,-0.630127,0.057784
24800,2019-12-26,0.642647,-0.122059,3227.199951,3240.080078,3227.199951,3239.909912,3239.909912,2164540000,16.530029,0.162792
24900,2019-12-27,0.365909,-0.063636,3247.229980,3247.929932,3234.370117,3240.020020,3240.020020,2429150000,0.110107,0.081534
25000,2019-12-30,0.632090,-0.093284,3240.090088,3240.919922,3216.570068,3221.290039,3221.290039,3021720000,-18.729980,0.147955


In [37]:
new_df.to_csv("../data/cleaned/full.csv")