# FinBERT With Headlines

In [3]:
import pandas

headline_df = pandas.read_csv("dataset/headline_train.csv")
headline_df.head(5)

Unnamed: 0,id,sentence,snippets,target,sentiment_score,aspects
0,1,Royal Mail chairman Donald Brydon set to step ...,['set to step down'],Royal Mail,-0.374,['Corporate/Appointment']
1,2,Stakes High for AstraZeneca Heart Drug Facing ...,['Facing Tough Competition'],AstraZeneca,-0.24,['Corporate/Risks']
2,3,UPDATE 1-Dairy Crest loses a third of Morrison...,['Crest loses a third of Morrisons milk contra...,Morrisons,-0.161,['Corporate/Sales/Failed Contract Discussion']
3,4,Insight hires Aviva's David Hillier for multi-...,['hires Aviva's David Hillier for multi-asset ...,Insight,0.137,['Corporate/Appointment/Executive Appointment']
4,5,Primark racks up a happy Christmas after stron...,['after strong sales'],Primark,0.704,['Corporate/Sales']


In [6]:
import numpy as np

headlines_array = np.array(headline_df)
np.random.shuffle(headlines_array)
headlines_list = list(headlines_array[:,1])

print(headlines_list)

['Irish housebuilder Cairn Homes plans London listing', 'L&G still paying price for dividend cut during crisis, chief says', 'Petrofac books further Â£30m cost for Shetland gas terminal delays', 'Industry NewsWolseley confident in reslilience amid mixed markets', 'Petrofac share price rises despite Â£30m costs on North Sea project', 'Keith Skeoch to step up as David Nish quits as chief executive of Standard Life', 'Oil majors like Royal Dutch Shell, Chevron, BP fail to find reserves to counter ...', 'AB InBev looks to win over SABMiller investors', "Britain's FTSE forges ahead as Shire surges", 'Builder Persimmon hails 6% rise in house sales', "Intertek swings to Â£347 mln loss on oil's slump", 'Diageo receives reports from United Spirits on financial irregularities involving ...', "Why I'd Buy ARM Holdings plc And BHP Billiton plc Today", 'Travis Perkins Hikes Dividend 20% As Profit And Revenue Rise', 'AstraZeneca profit down as sales of stalwarts fade', "Spain's CaixaBank Expects To 

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [8]:
#That's where the headlines get tokenized to be inputted into model

inputs = tokenizer(headlines_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  3493,  2160,  ...,     0,     0,     0],
        [  101,  1048,  1004,  ...,     0,     0,     0],
        [  101,  9004,  3217,  ...,     0,     0,     0],
        ...,
        [  101, 13371,  1011,  ...,     0,     0,     0],
        [  101,  5467,  4125,  ...,     0,     0,     0],
        [  101, 26236,  9818,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [9]:
#inference

outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([436, 3])


In [10]:
#Postprocessing with softmax

import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0480, 0.0192, 0.9328],
        [0.5866, 0.1432, 0.2701],
        [0.1999, 0.0472, 0.7528],
        ...,
        [0.1093, 0.0158, 0.8750],
        [0.7944, 0.0202, 0.1853],
        [0.0259, 0.0226, 0.9514]], grad_fn=<SoftmaxBackward0>)


In [11]:
model.config.id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [14]:
#Formatting the results as a pandas data frame

import pandas as pd

#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':headlines_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

df.head(10)

Unnamed: 0,Headline,Positive,Negative,Neutral
0,Irish housebuilder Cairn Homes plans London li...,0.048033,0.019163,0.932804
1,L&G still paying price for dividend cut during...,0.586642,0.143236,0.270122
2,Petrofac books further Â£30m cost for Shetland...,0.199948,0.047227,0.752825
3,Industry NewsWolseley confident in reslilience...,0.943798,0.023129,0.033074
4,Petrofac share price rises despite Â£30m costs...,0.909451,0.064922,0.025627
5,Keith Skeoch to step up as David Nish quits as...,0.02157,0.275133,0.703297
6,"Oil majors like Royal Dutch Shell, Chevron, BP...",0.058272,0.505076,0.436652
7,AB InBev looks to win over SABMiller investors,0.941902,0.013037,0.04506
8,Britain's FTSE forges ahead as Shire surges,0.572216,0.174003,0.253781
9,Builder Persimmon hails 6% rise in house sales,0.928966,0.026421,0.044613


In [19]:
def find_label(value):
    if -0.1 <= value <= 0.1:
        return 0
    elif value < -0.1:
        return -1
    else:
        return 1

index = 0
correct = 0.0
for x in headlines_array:
    true_label = find_label(x[4])
    max_v = max(predictions[index, 0], predictions[index, 1], predictions[index, 2])
    if max_v == predictions[index, 0]:
        prediction_label = 1
    elif max_v == predictions[index, 1]:
        prediction_label = -1
    else:
        prediction_label = 0
    
    if prediction_label == true_label:
        correct += 1
    index += 1
print("Headline Prediction Rate: ", correct / (index + 1) * 100, "%")
    

Headline Prediction Rate:  59.95423340961098 %


# FinBERT With Post

In [2]:
import pandas

post_df = pandas.read_csv("dataset/post_train.csv")
post_df.head(5)   

Unnamed: 0,id,sentence,snippets,target,sentiment_score,aspects
0,1,Slowly adding some $FIO here but gotta be care...,['Slowly adding some $FIO here but gotta be ca...,FIO,0.459,['Stock/Price Action/Bullish/Bull Position']
1,2,$TRX http://stks.co/1KkK Long setup. MACD cross.,['Long setup. MACD cross.'],TRX,0.438,['Stock/Technical Analysis']
2,3,I am not optimistic about $amzn both fundement...,['both fundementals and charts look like poopo...,AMZN,-0.506,['Stock/Price Action/Bearish']
3,4,$GRPN might be selling off ahead of $P earning...,['might be selling off ahead'],P,-0.202,['Stock/Price Action/Bearish/Bearish Behavior']
4,5,$IACI http://stks.co/tJU Looks good on the wee...,['Looks good on the weekly chart.'],IACI,0.379,['Stock/Technical Analysis']


In [3]:
import numpy as np

post_array = np.array(post_df)
np.random.shuffle(post_array)
post_list = list(post_array[:,1])

print(post_list)



In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
#That's where the headlines get tokenized to be inputted into model

inputs = tokenizer(post_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  5573, 27451,  ...,     0,     0,     0],
        [  101,  1002, 24529,  ...,     0,     0,     0],
        [  101,  1002, 23564,  ...,     0,     0,     0],
        ...,
        [  101,  3145,  2504,  ...,     0,     0,     0],
        [  101,  1002, 24829,  ...,     0,     0,     0],
        [  101,  1002, 14161,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [6]:
#inference

outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([675, 3])


In [7]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.9301, 0.0403, 0.0296],
        [0.0269, 0.1169, 0.8562],
        [0.9223, 0.0232, 0.0545],
        ...,
        [0.1357, 0.0154, 0.8489],
        [0.1248, 0.7910, 0.0841],
        [0.9058, 0.0305, 0.0637]], grad_fn=<SoftmaxBackward0>)


In [8]:
#Formatting the results as a pandas data frame

import pandas as pd

#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':post_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

df.head(10)

Unnamed: 0,Headline,Positive,Negative,Neutral
0,"Sudden optimism about iPhone sales (i.e., not ...",0.930094,0.040275,0.029631
1,$TSLA announces a recall and the stock doesn't...,0.026863,0.11693,0.856207
2,$ZAGG getting ready...Target $15 plus on a clo...,0.922294,0.023221,0.054485
3,$AAPL AAPL finally breaking under the 49 M.A.....,0.30371,0.010989,0.685302
4,Covered my small $MWW short @ 7.99 for a .16 l...,0.108848,0.648447,0.242705
5,$NFLX and $TSLA making all kinds of gains 📈,0.722883,0.01234,0.264777
6,$DARA breaking out,0.138333,0.425106,0.436561
7,$SKH http://stks.co/163e Long setup. Watch for...,0.1763,0.025571,0.798129
8,Long $GPRO for a brealout trade,0.12434,0.031759,0.843901
9,$EBAY e-bay bay! working on a couple daily d...,0.028595,0.916665,0.05474


In [11]:
def find_label(value):
    if -0.1 <= value <= 0.1:
        return 0
    elif value < -0.1:
        return -1
    else:
        return 1

index = 0
correct = 0.0
for x in post_array:
    true_label = find_label(x[4])
    max_v = max(predictions[index, 0], predictions[index, 1], predictions[index, 2])
    if max_v == predictions[index, 0]:
        prediction_label = 1
    elif max_v == predictions[index, 1]:
        prediction_label = -1
    else:
        prediction_label = 0
    
    if prediction_label == true_label:
        correct += 1
    index += 1
print("Post Prediction Rate: ", correct / (index + 1) * 100, "%")
    

Post Prediction Rate:  43.047337278106504 %
