In [1]:
import pandas as pd
import time

## Data

In [2]:
df = pd.read_csv("../news_stock_price.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145689 entries, 0 to 145688
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   title                    145689 non-null  object 
 1   url                      145689 non-null  object 
 2   time_published           145689 non-null  object 
 3   authors                  145689 non-null  object 
 4   summary                  145677 non-null  object 
 5   banner_image             141418 non-null  object 
 6   source                   145689 non-null  object 
 7   category_within_source   46206 non-null   object 
 8   source_domain            145689 non-null  object 
 9   topics                   145689 non-null  object 
 10  overall_sentiment_score  145689 non-null  float64
 11  overall_sentiment_label  145689 non-null  object 
 12  ticker_sentiment         145689 non-null  object 
 13  company                  145689 non-null  object 
 14  symb

In [4]:
df["summary_title"] = df["summary"].combine_first(df["title"])

## Finbert (Prosus AI)

https://huggingface.co/ProsusAI/finbert

In [5]:
import torch

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

softmax = torch.nn.Softmax(dim=-1)

## Single sentence

#LABEL_0: positive; LABEL_1: negative; LABEL_2: neutral

In [7]:
def process_single_sentence(tokenizer, model, softmax, sentence, verbose=True):
    inputs = tokenizer(sentence, return_tensors="pt")
    if verbose:
        print(f"Tokens: {inputs}\n")
    outputs = model(**inputs)
    if verbose:
        print(f"Logits: {outputs.logits}")
    prob = softmax(outputs.logits)
    return prob

In [8]:
sentences = ["Stocks rallied and the British pound gained.", "profits are flat"]

for sent in sentences:
    print(sent)
    prob = process_single_sentence(tokenizer, model, softmax, sent, verbose=False)
    print(prob)
    print("\n")

Stocks rallied and the British pound gained.
tensor([[0.8984, 0.0345, 0.0672]], grad_fn=<SoftmaxBackward0>)


profits are flat
tensor([[0.2225, 0.0589, 0.7186]], grad_fn=<SoftmaxBackward0>)




## Multiple sentences

In [9]:
def process_multiple_sentence(tokenizer, model, softmax, sentences, verbose=True):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    if verbose:
        print(f"Tokens: {inputs}\n")
    outputs = model(**inputs)
    if verbose:
        print(f"Logits: {outputs.logits}")
    prob = softmax(outputs.logits)
    return prob

In [10]:
sentences = ["Stocks rallied and the British pound gained.", "profits are flat"]
prob = process_multiple_sentence(tokenizer, model, softmax, sentences, verbose=False)
prob

tensor([[0.8984, 0.0345, 0.0672],
        [0.2225, 0.0589, 0.7186]], grad_fn=<SoftmaxBackward0>)

## Inference API

In [25]:
import requests

def query(url, sentences):
    payload = {
        "inputs": sentences,
        "options":{"wait_for_model":True}
    }
    
    API_TOKEN = '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'
    headers = {"Authorization": f"Bearer {API_TOKEN}"}

    response = requests.post(url, headers=headers, json=payload)
    
    return response.json()

In [27]:
API_URL = "https://api-inference.huggingface.co/models/ProsusAI/finbert"

query(url=API_URL, sentences=sentences)

[[{'label': 'positive', 'score': 0.8983616232872009},
  {'label': 'neutral', 'score': 0.06716488301753998},
  {'label': 'negative', 'score': 0.0344734713435173}],
 [{'label': 'neutral', 'score': 0.7185565233230591},
  {'label': 'positive', 'score': 0.2225133776664734},
  {'label': 'negative', 'score': 0.05893009155988693}]]

## Process news summary

### Chunk to avoid OOM

In [11]:
summary_title_list = df["summary_title"].to_list()

all([isinstance(sent, str) for sent in summary_title_list])

True

In [12]:
def split(list_a, chunk_size):

    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]

chunk_size = 5000
chunk_summary_title_list = list(split(summary_title_list, chunk_size))

# check chunked list is equivalent to original list
assert len(summary_title_list) == sum([len(l) for l in chunk_summary_title_list])

In [20]:
p_list = []
for i, l in enumerate(chunk_summary_title_list):
    print(f"Processing {i}/{len(chunk_summary_title_list)}")
    print(f"Number of sentences per chunk list: {len(l)}")
    p = process_multiple_sentence(tokenizer, model, softmax, l, verbose=False)
    p_list.append(p)

Processing 0/30
Number of sentences per chunk list: 5000


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4915200000 bytes.

### One by one but too slow

In [13]:
prob_list = []
start = time.time()
for i, sent in enumerate(summary_title_list):
    if i%100 == 0 and i != 0:
        end = time.time()
        duration = end-start
        print(f"Took {duration} seconds")
        print(f"Processed {i}/{len(summary_title_list)}")
        start = time.time()
    prob = process_single_sentence(tokenizer, model, softmax, sent, verbose=False)
    prob_list.append(prob)

Took 16.414872646331787 seconds
Processed 100/145689
Took 31.019694089889526 seconds
Processed 200/145689
Took 45.53204011917114 seconds
Processed 300/145689
Took 61.96796441078186 seconds
Processed 400/145689
Took 81.90762233734131 seconds
Processed 500/145689
Took 98.28514385223389 seconds
Processed 600/145689


KeyboardInterrupt: 

In [None]:
# convert prob list to numpy array
finbert_v1_prob = prob_list

In [104]:
# create column names
model_name = "finbert_v1"
label_order = ["pos", "neg", "neutral"]
cols = [f"{model_name}_{label}" for label in label_order]

# add probability columns
test_df[cols] = finbert_v1_prob.detach().numpy()

## Inference API

In [42]:
def split(list_a, chunk_size):

    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]

chunk_size = 100
chunk_summary_title_list = list(split(summary_title_list, chunk_size))

# check chunked list is equivalent to original list
assert len(summary_title_list) == sum([len(l) for l in chunk_summary_title_list])

In [None]:
p_list = []
for i, l in enumerate(chunk_summary_title_list):
    print(f"Processing {i}/{len(chunk_summary_title_list)}")
    print(f"Number of sentences per chunk list: {len(l)}")
    p = query(url=API_URL, sentences=l)
    assert len(p) == len(l)
    p_list.extend(p)

Processing 0/1457
Number of sentences per chunk list: 100
Processing 1/1457
Number of sentences per chunk list: 100
Processing 2/1457
Number of sentences per chunk list: 100
Processing 3/1457
Number of sentences per chunk list: 100
Processing 4/1457
Number of sentences per chunk list: 100
Processing 5/1457
Number of sentences per chunk list: 100
Processing 6/1457
Number of sentences per chunk list: 100
Processing 7/1457
Number of sentences per chunk list: 100
Processing 8/1457
Number of sentences per chunk list: 100
Processing 9/1457
Number of sentences per chunk list: 100
Processing 10/1457
Number of sentences per chunk list: 100
Processing 11/1457
Number of sentences per chunk list: 100
Processing 12/1457
Number of sentences per chunk list: 100
Processing 13/1457
Number of sentences per chunk list: 100
Processing 14/1457
Number of sentences per chunk list: 100
Processing 15/1457
Number of sentences per chunk list: 100
Processing 16/1457
Number of sentences per chunk list: 100
Process

## Finbert (yiyanghkust)

https://huggingface.co/yiyanghkust/finbert-tone

#LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative

In [88]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)

softmax = torch.nn.Softmax(dim=-1)
# nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

## Single sentence

In [89]:
sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]

for sent in sentences:
    print(sent)
    prob = process_single_sentence(tokenizer, model, softmax, sent, verbose=False)
    print(prob)
    print("\n")

there is a shortage of capital, and we need extra financing
tensor([[3.3754e-03, 7.2024e-06, 9.9662e-01]], grad_fn=<SoftmaxBackward0>)


growth is strong and we have plenty of liquidity
tensor([[2.6476e-08, 1.0000e+00, 2.1309e-08]], grad_fn=<SoftmaxBackward0>)


there are doubts about our finances
tensor([[2.6841e-05, 2.1086e-06, 9.9997e-01]], grad_fn=<SoftmaxBackward0>)


profits are flat
tensor([[0.9889, 0.0018, 0.0092]], grad_fn=<SoftmaxBackward0>)




## Multiple sentences

In [90]:
sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]

process_multiple_sentence(tokenizer, model, softmax, sentences, verbose=False)

tensor([[3.3754e-03, 7.2024e-06, 9.9662e-01],
        [2.6475e-08, 1.0000e+00, 2.1309e-08],
        [2.6841e-05, 2.1086e-06, 9.9997e-01],
        [9.8894e-01, 1.8429e-03, 9.2129e-03]], grad_fn=<SoftmaxBackward0>)