## EDA and sentiment analysis with FinBERT

In [5]:
import pandas as pd
import numpy as np
import torch
import wandb

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [6]:
df = pd.read_csv('finviz_ev2.csv')

In [7]:
df.head()

Unnamed: 0,stock,headline,source,date,time
0,TSLA,"Polestar Upgrades Model, Hits EV Production Mi...",(Barrons.com),Aug-23-23,09:07AM
1,TSLA,15 Highest Paying Countries for Engineers,(Insider Monkey),Aug-23-23,09:07AM
2,TSLA,Tesla's German plant lowers production target ...,(Reuters),Aug-23-23,09:07AM
3,TSLA,"Down 9% in the Past 5 Days, Is Now the Right T...",(Motley Fool),Aug-23-23,09:07AM
4,TSLA,Tesla's German plant lowers production target ...,(Reuters),Aug-23-23,09:07AM


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   stock     600 non-null    object
 1   headline  600 non-null    object
 2   source    600 non-null    object
 3   date      600 non-null    object
 4   time      600 non-null    object
dtypes: object(5)
memory usage: 23.6+ KB


In [10]:
headlines_array = np.array(df)
np.random.shuffle(headlines_array)
headlines_array
headlines_list = list(headlines_array[:,1])
headlines_list

['Tesla Data Breach Blamed on Insider Wrongdoing Impacted 75,000',
 'Do Tesla downgrades matter?',
 'Steer Clear of Lucid Stock Even as Big Players Make Bold Bets',
 'Sorry, LCID Stock Investors. Dont Expect Much From Lucid.',
 'Why Electric Vehicle Stocks Like Lucid and Canoo Were Sinking Today',
 '10 Best EV, Battery and Autonomous Driving ETFs',
 'EV Roundup: GM-TSLA Charging Deal, BWA\'s "Charging Forward: 2027" & More',
 'NIO, XPeng (XPEV) & LI Auto (LI) Post June, Q2 Delivery Updates',
 'A New EV Company Has Gone Public. Its Stock Is Worth More Than Fords.',
 'Lucid Stock: A High-Risk, High-Reward EV Play',
 'EV Startups Are Proving Warren Buffett Right',
 'Lucid Motors Announces Andrea Soriani as Vice President of Marketing',
 'The new China has arrived and companies need to adjust: Former Dow CEO',
 'Why EV Stocks Lucid Motors, Rivian, and Nikola Plunged Today',
 '15 Highest Paying Countries for Engineers',
 'NIO Inc. (NIO) Stock Sinks As Market Gains: What You Should Know',
 '

In [11]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [12]:
inputs = tokenizer(headlines_list, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101, 26060,  2951,  ...,     0,     0,     0],
        [  101,  2079, 26060,  ...,     0,     0,     0],
        [  101, 20634,  3154,  ...,     0,     0,     0],
        ...,
        [  101, 15544, 18073,  ...,     0,     0,     0],
        [  101,  1996,  2373,  ...,     0,     0,     0],
        [  101, 12941,  3189,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [13]:
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([600, 3])


In [14]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.0076, 0.9437, 0.0487],
        [0.0327, 0.2321, 0.7353],
        [0.2725, 0.0183, 0.7092],
        ...,
        [0.8810, 0.0411, 0.0780],
        [0.1757, 0.0153, 0.8090],
        [0.0276, 0.0255, 0.9469]], grad_fn=<SoftmaxBackward0>)


In [15]:
model.config.id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [16]:
#Headline #Positive #Negative #Neutral
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Headline':headlines_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df = pd.DataFrame(table, columns = ["Headline", "Positive", "Negative", "Neutral"])

df.head()


Unnamed: 0,Headline,Positive,Negative,Neutral
0,Tesla Data Breach Blamed on Insider Wrongdoing...,0.007614,0.943687,0.048699
1,Do Tesla downgrades matter?,0.032654,0.232068,0.735278
2,Steer Clear of Lucid Stock Even as Big Players...,0.272516,0.018256,0.709228
3,"Sorry, LCID Stock Investors. Dont Expect Much ...",0.062314,0.049773,0.887914
4,Why Electric Vehicle Stocks Like Lucid and Can...,0.011455,0.937792,0.050753


In [17]:
#Logging the pandas table as a WandB Table


wandb.init(project="FinBERT_Sentiment_Analysis_Project")
wandb.run.log({"Financial Sentiment Analysis Table" : wandb.Table(dataframe=df)})
wandb.run.finish()

[34m[1mwandb[0m: Currently logged in as: [33msunisa[0m ([33msunisateam[0m). Use [1m`wandb login --relogin`[0m to force relogin
