# Sentiment Scoring and Embeddings

In [1]:
import os
from pathlib import Path

from src.config import Config
from src.data import load_news, load_price, merge_price_news
from src.sentiment import FinBERT
from src.utils import save_csv

In [2]:
# Load the project configuration from its YAML file. The path is relative,
# so it is resolved against the kernel's current working directory.
config_path = Path("../config/config.yaml")
cfg = Config(config_path)

In [3]:
# Both raw inputs are read from the same configured directory.
raw_dir = Path(cfg.data.raw_dir)
price = load_price(raw_dir / cfg.data.name_prices)
news = load_news(raw_dir / cfg.data.name_news)

In [4]:
# Preview the first rows of the loaded news frame (the output shows the
# columns date, rank and headline).
news.head()

Unnamed: 0,date,rank,headline
0,2008-08-08,top1,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-08,top13,"b""So---Russia and Georgia are at war and the N..."
2,2008-08-08,top18,"b'Condoleezza Rice: ""The US would not act to p..."
3,2008-08-08,top3,b'Russia Today: Columns of troops roll into So...
4,2008-08-08,top25,"b""No Help for Mexico's Kidnapping Surge"""


In [6]:
# Number of embedding dimensions requested from FinBERT; the aggregated
# output carries them as columns emb_0 ... emb_16.
MAX_EMBEDDING_DIMS = 17

finbert = FinBERT(config=cfg, max_embedding_dims=MAX_EMBEDDING_DIMS)

# Score only the columns the model needs, then aggregate the results per day.
df_with_sentiment = finbert.transform(news[["date", "headline"]])

# Remove the per-day headline count with a non-mutating drop, so the frame
# returned by aggregate_daily is never modified in place.
daily_sentiment = finbert.aggregate_daily(df_with_sentiment).drop(
    columns=["headline_count"]
)
daily_sentiment.head()

2025-08-19 12:28:35,591 - INFO - FinBERT - Loading FinBERT model: yiyanghkust/finbert-tone on device: cuda
2025-08-19 12:28:35,594 - INFO - FinBERT - Starting FinBERT transform on 49718 texts with batch size 32
FinBERT Batch Processing: 100%|██████████| 1554/1554 [00:45<00:00, 34.15it/s]
2025-08-19 12:29:21,106 - INFO - FinBERT - FinBERT embedding and sentiment extraction complete.


Unnamed: 0,date,pos,neu,neg,pos_minus_neg,emb_0,emb_1,emb_2,emb_3,emb_4,...,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16
0,2008-08-08,0.723097,0.080308,0.196594,0.526503,-0.482626,-0.328239,-0.636478,0.627401,0.29477,...,0.15161,-0.232541,0.348419,0.543224,-0.179494,0.553609,-0.347092,-0.518843,0.68428,0.227482
1,2008-08-11,0.695768,0.168894,0.135338,0.56043,-0.566419,-0.423188,-0.652022,0.812345,0.474358,...,0.135462,-0.261237,0.484763,0.363776,-0.225049,0.529768,-0.494232,-0.317024,0.633304,0.377935
2,2008-08-12,0.696532,0.040929,0.262539,0.433994,-0.631328,-0.305491,-0.709601,0.736141,0.427515,...,0.295696,-0.27541,0.380731,0.385763,-0.265997,0.593549,-0.534134,-0.397291,0.669396,0.230531
3,2008-08-13,0.833299,0.003527,0.163175,0.670124,-0.369901,-0.184175,-0.789519,0.645993,0.350836,...,0.166847,-0.208769,0.382932,0.77406,-0.113826,0.511805,-0.298713,-0.347355,0.684939,0.299247
4,2008-08-14,0.928612,0.022426,0.048962,0.87965,-0.482822,-0.254288,-0.716074,0.775566,0.416154,...,0.036605,-0.26798,0.408606,0.557198,-0.18828,0.686246,-0.483639,-0.340268,0.713771,0.236775


In [7]:
# Combine the price data with the daily sentiment/embedding features into a
# single frame (join semantics are defined by merge_price_news in src.data).
df = merge_price_news(price, daily_sentiment)
# Preview the merged result: price columns followed by sentiment and emb_* columns.
df.head()

Unnamed: 0,date,open,high,low,close,volume,adj_close,pos,neu,neg,...,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16
0,2008-08-08,11432.089844,11759.959961,11388.040039,11734.320312,212830000,11734.320312,0.723097,0.080308,0.196594,...,0.15161,-0.232541,0.348419,0.543224,-0.179494,0.553609,-0.347092,-0.518843,0.68428,0.227482
1,2008-08-11,11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609,0.695768,0.168894,0.135338,...,0.135462,-0.261237,0.484763,0.363776,-0.225049,0.529768,-0.494232,-0.317024,0.633304,0.377935
2,2008-08-12,11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727,0.696532,0.040929,0.262539,...,0.295696,-0.27541,0.380731,0.385763,-0.265997,0.593549,-0.534134,-0.397291,0.669396,0.230531
3,2008-08-13,11632.80957,11633.780273,11453.339844,11532.959961,182550000,11532.959961,0.833299,0.003527,0.163175,...,0.166847,-0.208769,0.382932,0.77406,-0.113826,0.511805,-0.298713,-0.347355,0.684939,0.299247
4,2008-08-14,11532.070312,11718.280273,11450.889648,11615.929688,159790000,11615.929688,0.928612,0.022426,0.048962,...,0.036605,-0.26798,0.408606,0.557198,-0.18828,0.686246,-0.483639,-0.340268,0.713771,0.236775


In [8]:
# Make sure the output directory exists (creating parents as needed), then
# write the merged price + sentiment frame to it.
processed_dir = Path(cfg.data.processed_dir)
processed_dir.mkdir(parents=True, exist_ok=True)
save_csv(processed_dir / cfg.data.name_prices_sentiment, df)