### Read Data

In [51]:
%load_ext autoreload
%autoreload 2

import sys
import os

src_dir = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(src_dir)

import requests
import pandas as pd 
from datetime import datetime, timezone
import pytz

from utils.config import settings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
# Read Data
data = requests.get(settings.NEWS_URL)
data = data.json()
data.keys()

dict_keys(['items', 'sentiment_score_definition', 'relevance_score_definition', 'feed'])

In [53]:
data['feed'][1]

{'title': 'Zacks Industry Outlook Highlights IBM, Advanced Micro Devices, Micron and IonQ',
 'url': 'https://www.zacks.com/stock/news/2466782/zacks-industry-outlook-highlights-ibm-advanced-micro-devices-micron-and-ionq',
 'time_published': '20250512T134300',
 'authors': ['Zacks Investment Research'],
 'summary': 'IBM, Advanced Micro Devices, Micron and IonQ have been highlighted in this Industry Outlook article.',
 'banner_image': 'https://staticx-tuner.zacks.com/images/articles/main/3a/521.jpg',
 'source': 'Zacks Commentary',
 'category_within_source': 'n/a',
 'source_domain': 'www.zacks.com',
 'topics': [{'topic': 'Technology', 'relevance_score': '0.333333'},
  {'topic': 'Blockchain', 'relevance_score': '0.158519'},
  {'topic': 'Economy - Monetary', 'relevance_score': '0.158519'},
  {'topic': 'Financial Markets', 'relevance_score': '0.989041'},
  {'topic': 'Manufacturing', 'relevance_score': '0.333333'},
  {'topic': 'Earnings', 'relevance_score': '0.993781'},
  {'topic': 'Real Estate

In [54]:
def flatten_data(data):
    flattened_data = []
    for each in data:
        financial_markets_relevance_score = 0
        for topic in each['topics']:
            if topic['topic'] == 'Financial Markets':
                financial_markets_relevance_score = topic['relevance_score']
                break
        record = {
            "timestamp": each['time_published'],
            "title": each['title'],
            "summary": each['summary'],
            "overall_sentiment_score": each['overall_sentiment_score'],
            "overall_sentiment_label": each['overall_sentiment_label'],
            "financial_markets_relevance_score": financial_markets_relevance_score,
        }
    
        flattened_data.append(record)
    return flattened_data

df = pd.DataFrame(data=flatten_data(data['feed']))
df.head(5)

Unnamed: 0,timestamp,title,summary,overall_sentiment_score,overall_sentiment_label,financial_markets_relevance_score
0,20250514T214830,"Nvidia, Amazon, Tesla, Palantir Among Top Bene...","Saudi Arabia's $1T AI push is a ""green light"" ...",0.26249,Somewhat-Bullish,0.161647
1,20250512T134300,"Zacks Industry Outlook Highlights IBM, Advance...","IBM, Advanced Micro Devices, Micron and IonQ h...",0.274505,Somewhat-Bullish,0.989041
2,20250511T101000,IBM Just Gave Nvidia Stock Investors Terrible ...,There's no question that artificial intelligen...,0.12771,Neutral,0.365926
3,20250510T120823,"Bulls And Bears: IBM, Novavax, Arm Holdings - ...",Benzinga examined the prospects for many inves...,0.131806,Neutral,0.413559
4,20250509T130200,4 Stocks to Watch From a Challenging Technolog...,"Amid supply-chain woes, Computer - Integrated ...",0.267918,Somewhat-Bullish,0.955357


In [55]:
df.describe()

Unnamed: 0,overall_sentiment_score
count,50.0
mean,0.200475
std,0.099203
min,-0.069865
25%,0.128828
50%,0.216624
75%,0.266921
max,0.409897


In [56]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Make sure you're not using GPU unless needed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [64]:
# Function to get FinBERT sentiment for a given text
def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
    labels = ['bullish', 'somewhat-bullish', 'neutral', 'somewhat-bearish', 'bearish']
    sentiment_scores = {label: float(prob) for label, prob in zip(labels, probs[0])}
    predicted_label = labels[torch.argmax(probs)]
    return predicted_label, sentiment_scores

def convert_time_format(time: str) -> datetime:
    """Map time string to datetime object."""
    # Parse the input time
    parsed_time = datetime.strptime(time, '%Y%m%dT%H%M%S')

    # Convert to a specific time zone (e.g., US/Eastern)
    eastern = pytz.timezone("US/Eastern")
    local_time = pytz.utc.localize(parsed_time).astimezone(eastern)

    # Format the local time
    return local_time.strftime("%Y-%m-%d %H:%M:%S")

# Process each news item
results = []
for i in range(df.shape[0]):
    news_article = df.iloc[i]
    text = f"{news_article.get('timestamp', '')}. {news_article.get('summary', '')}. Financial Markets Relevance Score: {news_article.get('financial_markets_relevance_score', '')}. Overall Sentiment Score: {news_article.get('overall_sentiment_score', '')}. Overall Sentiment Label: {news_article.get('overall_sentiment_label', '')}"
    label, scores = get_finbert_sentiment(text)
    results.append({
        "timestamp": convert_time_format(news_article.get("timestamp")),
        "actual_label": news_article.get("overall_sentiment_label"),
        "predicted_label": label,
        "bullish_sentiment_scores": scores.get('bullish', 0),
        "somewhat_bullish_sentiment_scores": scores.get('somewhat-bullish', 0),
        "neutral_sentiment_scores": scores.get('neutral', 0),
        "somewhat_bearish_sentiment_scores": scores.get('somewhat-bearish', 0),
        "bearish_sentiment_scores": scores.get('bearish', 0)
    })

print(results)

[{'timestamp': '2025-05-14 17:48:30', 'actual_label': 'Somewhat-Bullish', 'predicted_label': 'bullish', 'bullish_sentiment_scores': 0.8529199957847595, 'somewhat_bullish_sentiment_scores': 0.014215996488928795, 'neutral_sentiment_scores': 0.13286392390727997, 'somewhat_bearish_sentiment_scores': 0, 'bearish_sentiment_scores': 0}, {'timestamp': '2025-05-12 09:43:00', 'actual_label': 'Somewhat-Bullish', 'predicted_label': 'neutral', 'bullish_sentiment_scores': 0.09164876490831375, 'somewhat_bullish_sentiment_scores': 0.13615798950195312, 'neutral_sentiment_scores': 0.7721932530403137, 'somewhat_bearish_sentiment_scores': 0, 'bearish_sentiment_scores': 0}, {'timestamp': '2025-05-11 06:10:00', 'actual_label': 'Neutral', 'predicted_label': 'neutral', 'bullish_sentiment_scores': 0.04516899958252907, 'somewhat_bullish_sentiment_scores': 0.04922202229499817, 'neutral_sentiment_scores': 0.9056089520454407, 'somewhat_bearish_sentiment_scores': 0, 'bearish_sentiment_scores': 0}, {'timestamp': '20

In [65]:
sentiment_df = pd.DataFrame(results)
sentiment_df.head(12)

Unnamed: 0,timestamp,actual_label,predicted_label,bullish_sentiment_scores,somewhat_bullish_sentiment_scores,neutral_sentiment_scores,somewhat_bearish_sentiment_scores,bearish_sentiment_scores
0,2025-05-14 17:48:30,Somewhat-Bullish,bullish,0.85292,0.014216,0.132864,0,0
1,2025-05-12 09:43:00,Somewhat-Bullish,neutral,0.091649,0.136158,0.772193,0,0
2,2025-05-11 06:10:00,Neutral,neutral,0.045169,0.049222,0.905609,0,0
3,2025-05-10 08:08:23,Neutral,bullish,0.654343,0.088715,0.256943,0,0
4,2025-05-09 09:02:00,Somewhat-Bullish,bullish,0.64009,0.087656,0.272254,0,0
5,2025-05-09 07:53:00,Somewhat-Bullish,bullish,0.502033,0.034697,0.46327,0,0
6,2025-05-06 11:00:11,Somewhat-Bullish,bullish,0.924684,0.03357,0.041746,0,0
7,2025-05-06 05:37:44,Neutral,somewhat-bullish,0.007315,0.971593,0.021092,0,0
8,2025-05-06 05:15:00,Neutral,neutral,0.267807,0.020713,0.711479,0,0
9,2025-05-05 15:00:00,Somewhat-Bullish,somewhat-bullish,0.324627,0.4037,0.271673,0,0


In [66]:
(sentiment_df['actual_label'].str.lower() == sentiment_df['predicted_label'].str.lower()).value_counts()

False    34
True     16
Name: count, dtype: int64

In [61]:
sample = sentiment_df['timestamp'][:5]
sample

0    2025-05-14 17:48:30
1    2025-05-12 09:43:00
2    2025-05-11 06:10:00
3    2025-05-10 08:08:23
4    2025-05-09 09:02:00
Name: timestamp, dtype: object

In [62]:
sample[1]  sample[0]

TypeError: unsupported operand type(s) for -: 'str' and 'str'