# FinBERT With Headlines

In [1]:
import pandas

headline_df = pandas.read_csv("dataset/headline_train.csv")
headline_df.head(5)

Unnamed: 0,id,sentence,snippets,target,sentiment_score,aspects
0,1,Royal Mail chairman Donald Brydon set to step ...,['set to step down'],Royal Mail,-0.374,['Corporate/Appointment']
1,2,Stakes High for AstraZeneca Heart Drug Facing ...,['Facing Tough Competition'],AstraZeneca,-0.24,['Corporate/Risks']
2,3,UPDATE 1-Dairy Crest loses a third of Morrison...,['Crest loses a third of Morrisons milk contra...,Morrisons,-0.161,['Corporate/Sales/Failed Contract Discussion']
3,4,Insight hires Aviva's David Hillier for multi-...,['hires Aviva's David Hillier for multi-asset ...,Insight,0.137,['Corporate/Appointment/Executive Appointment']
4,5,Primark racks up a happy Christmas after stron...,['after strong sales'],Primark,0.704,['Corporate/Sales']


In [2]:
import numpy as np

headlines_array = np.array(headline_df)
np.random.shuffle(headlines_array)
headlines_list = list(headlines_array[:,1])

print(headlines_list)

["Royal Mail 'breached competition law' over delivery service changes, Ofcom claims", "Despite sales growth, UK's Tesco cautions recovery to be bumpy", 'U.S. Debt Lures Schroders as ECB Depresses Rates', "Britain's FTSE falls back with Ashtead, commodities under pressure", 'ARM Holdings plc Partners With International Business Machines Corp. To Drive ...', "GE to Sell Majority Stake in Bank BPH's Core Bank to Alior Bank", "BG Group Still Happy With Shell's $70 Billion Offer", 'RPT-Old Mutual Q1 gross sales beat forecasts, up 18 pct', 'Royal Mail turnaround proving expensive in tough UK market', "UPDATE 3-Auto Trader shares leap in UK's biggest private equity-backed listing", 'Rolls-Royce Wins $9.2 Billion Order From Emirates Airline', 'HSBC appoints business leaders to board', 'CompaniesTravis Perkins lifts dividend, earnings rise 15%', 'InterContinental Hotels Group share price climbs on $1.5bn special dividend', 'ARM Royalties Accelerate as Smartphone Market Regains Strength', 'BHP B

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [4]:
#That's where the headlines get tokenized to be inputted into model

inputs = tokenizer(headlines_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  2548,  5653,  ...,     0,     0,     0],
        [  101,  2750,  4341,  ...,     0,     0,     0],
        [  101,  1057,  1012,  ...,     0,     0,     0],
        ...,
        [  101,  8904, 17345,  ...,     0,     0,     0],
        [  101,  4035,  2078,  ...,     0,     0,     0],
        [  101,  2129, 26680,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [5]:
#inference

outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([436, 3])


In [6]:
#Postprocessing with softmax

import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0234, 0.9057, 0.0709],
        [0.8056, 0.1704, 0.0240],
        [0.0591, 0.8498, 0.0911],
        ...,
        [0.0687, 0.0094, 0.9219],
        [0.0720, 0.0149, 0.9131],
        [0.0288, 0.0282, 0.9430]], grad_fn=<SoftmaxBackward0>)


In [7]:
model.config.id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [8]:
#Formatting the results as a pandas data frame

import pandas as pd

#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':headlines_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

df.head(10)

Unnamed: 0,Headline,Positive,Negative,Neutral
0,Royal Mail 'breached competition law' over del...,0.023391,0.905711,0.070898
1,"Despite sales growth, UK's Tesco cautions reco...",0.805642,0.170371,0.023987
2,U.S. Debt Lures Schroders as ECB Depresses Rates,0.059083,0.849842,0.091075
3,"Britain's FTSE falls back with Ashtead, commod...",0.01196,0.964339,0.023701
4,ARM Holdings plc Partners With International B...,0.873552,0.007633,0.118814
5,GE to Sell Majority Stake in Bank BPH's Core B...,0.035929,0.016357,0.947715
6,BG Group Still Happy With Shell's $70 Billion ...,0.877383,0.014796,0.107821
7,"RPT-Old Mutual Q1 gross sales beat forecasts, ...",0.95061,0.028745,0.020645
8,Royal Mail turnaround proving expensive in tou...,0.034805,0.932253,0.032941
9,UPDATE 3-Auto Trader shares leap in UK's bigge...,0.528794,0.327484,0.143721


In [9]:
def find_label(value):
    if -0.1 <= value <= 0.1:
        return 0
    elif value < -0.1:
        return -1
    else:
        return 1

correct = 0.0
for idx, x in enumerate(headlines_array):
    true_label = find_label(x[4])
    pred_list = [float(predictions[idx][0]), float(predictions[idx][1]), float(predictions[idx][2])]
    max_v = max(pred_list)
    if max_v == pred_list[0]:
        prediction_label = 1
    elif max_v == pred_list[1]:
        prediction_label = -1
    else:
        prediction_label = 0
    
    if prediction_label == true_label:
        correct += 1
print("Headline Accuracy: ", correct / len(headlines_array) * 100, "%")
    

Headline Accuracy:  60.09174311926605 %


# FinBERT With Post

In [10]:
import pandas

post_df = pandas.read_csv("dataset/post_train.csv")
post_df.head(5)   

Unnamed: 0,id,sentence,snippets,target,sentiment_score,aspects
0,1,Slowly adding some $FIO here but gotta be care...,['Slowly adding some $FIO here but gotta be ca...,FIO,0.459,['Stock/Price Action/Bullish/Bull Position']
1,2,$TRX http://stks.co/1KkK Long setup. MACD cross.,['Long setup. MACD cross.'],TRX,0.438,['Stock/Technical Analysis']
2,3,I am not optimistic about $amzn both fundement...,['both fundementals and charts look like poopo...,AMZN,-0.506,['Stock/Price Action/Bearish']
3,4,$GRPN might be selling off ahead of $P earning...,['might be selling off ahead'],P,-0.202,['Stock/Price Action/Bearish/Bearish Behavior']
4,5,$IACI http://stks.co/tJU Looks good on the wee...,['Looks good on the weekly chart.'],IACI,0.379,['Stock/Technical Analysis']


In [11]:
import numpy as np

post_array = np.array(post_df)
np.random.shuffle(post_array)
post_list = list(post_array[:,1])

print(post_list)



In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
#That's where the headlines get tokenized to be inputted into model

inputs = tokenizer(post_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  3435,  8189,  ...,     0,     0,     0],
        [  101,  2559,  2005,  ...,     0,     0,     0],
        [  101,  1002,  2358,  ...,     0,     0,     0],
        ...,
        [  101,  1002,  1999,  ...,     0,     0,     0],
        [  101,  1002,  1052,  ...,     0,     0,     0],
        [  101,  1002, 24529,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [6]:
#inference

outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([675, 3])


In [7]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.9301, 0.0403, 0.0296],
        [0.0269, 0.1169, 0.8562],
        [0.9223, 0.0232, 0.0545],
        ...,
        [0.1357, 0.0154, 0.8489],
        [0.1248, 0.7910, 0.0841],
        [0.9058, 0.0305, 0.0637]], grad_fn=<SoftmaxBackward0>)


In [8]:
#Formatting the results as a pandas data frame

import pandas as pd

#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':post_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

df.head(10)

Unnamed: 0,Headline,Positive,Negative,Neutral
0,"Sudden optimism about iPhone sales (i.e., not ...",0.930094,0.040275,0.029631
1,$TSLA announces a recall and the stock doesn't...,0.026863,0.11693,0.856207
2,$ZAGG getting ready...Target $15 plus on a clo...,0.922294,0.023221,0.054485
3,$AAPL AAPL finally breaking under the 49 M.A.....,0.30371,0.010989,0.685302
4,Covered my small $MWW short @ 7.99 for a .16 l...,0.108848,0.648447,0.242705
5,$NFLX and $TSLA making all kinds of gains 📈,0.722883,0.01234,0.264777
6,$DARA breaking out,0.138333,0.425106,0.436561
7,$SKH http://stks.co/163e Long setup. Watch for...,0.1763,0.025571,0.798129
8,Long $GPRO for a brealout trade,0.12434,0.031759,0.843901
9,$EBAY e-bay bay! working on a couple daily d...,0.028595,0.916665,0.05474


In [11]:
def find_label(value):
    if -0.1 <= value <= 0.1:
        return 0
    elif value < -0.1:
        return -1
    else:
        return 1

index = 0
correct = 0.0
for x in post_array:
    true_label = find_label(x[4])
    pred_list = [float(predictions[idx][0]), float(predictions[idx][1]), float(predictions[idx][2])]
    max_v = max(pred_list)
    if max_v == pred_list[0]:
        prediction_label = 1
    elif max_v == pred_list[1]:
        prediction_label = -1
    else:
        prediction_label = 0
    
    if prediction_label == true_label:
        correct += 1
    index += 1
print("Post Prediction Rate: ", correct / index * 100, "%")
    

Post Prediction Rate:  43.047337278106504 %


# FinBERT Against FPB Data

In [6]:
import pandas
import numpy as np


fpb100_df = pandas.read_csv("dataset/FPB100.csv")
fpb100_df.head(5)

fpb100_array = np.array(fpb100_df)
np.random.shuffle(fpb100_array)
fpb100_list = list(fpb100_array[0:500,1])

print(fpb100_list)
print(len(fpb100_list))

500


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [8]:
#That's where the headlines get tokenized to be inputted into model

inputs = tokenizer(fpb100_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[ 101, 2002, 9794,  ...,    0,    0,    0],
        [ 101, 2720, 1012,  ...,    0,    0,    0],
        [ 101, 1996, 3066,  ...,    0,    0,    0],
        ...,
        [ 101, 1037, 2541,  ...,    0,    0,    0],
        [ 101, 2009, 2085,  ...,    0,    0,    0],
        [ 101, 5495, 3406,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [9]:
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([500, 3])


In [10]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0615, 0.0143, 0.9243],
        [0.0572, 0.0131, 0.9297],
        [0.0257, 0.0203, 0.9539],
        ...,
        [0.0217, 0.4069, 0.5714],
        [0.0706, 0.0109, 0.9185],
        [0.0218, 0.0263, 0.9519]], grad_fn=<SoftmaxBackward0>)


In [18]:
#Formatting the results as a pandas data frame

import pandas as pd

#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()


table = {'Text':fpb100_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Text", "Positive", "Negative", "Neutral"])

df.head(10)

Unnamed: 0,Text,Positive,Negative,Neutral
0,He joins Technopolis from KONE where he has he...,0.061453,0.014286,0.924261
1,Mr. Koistinen joins from Nokia Siemens Network...,0.057228,0.013103,0.929669
2,The deal includes the entire personnel of Plan...,0.025748,0.020314,0.953939
3,Operating profit was EUR 1.6 mn in 2005 compar...,0.092535,0.893666,0.0138
4,The expanded plant is scheduled to be operatio...,0.119794,0.010852,0.869354
5,The company 's share is listed in the Mid Cap ...,0.029226,0.020832,0.949943
6,In a recent interview with the Financial Times...,0.082989,0.014812,0.902199
7,"Swedish , Finnish and Danish listed companies ...",0.026573,0.022509,0.950918
8,Loss after financial items totalled EUR 9.7 mn...,0.16083,0.825394,0.013777
9,Operating profit excluding restructuring costs...,0.949723,0.024407,0.02587


In [28]:

label_map = {"positive" : 1, "negative" : -1, "neutral" : 0}
print(predictions.shape)

correct = 0.0
for idx, x in enumerate(fpb100_array[:500]):
    true_label = label_map[x[2]]
    pred_list = [float(predictions[idx][0]), float(predictions[idx][1]), float(predictions[idx][2])]
    max_v = max(pred_list)
    if max_v == pred_list[0]:
        prediction_label = 1
    elif max_v == pred_list[1]:
        prediction_label = -1
    else:
        prediction_label = 0
    
    if prediction_label == true_label:
        correct += 1
print("FPB Accurarcy: ", correct / idx * 100, "%")

torch.Size([500, 3])
FPB Accurarcy:  97.79559118236473 %
