# Sentiment Analysis on Financial News Headlines

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned CNBC headlines; the cells below read its 'date' and 'headline' columns.
headline_df = pd.read_csv("../data/cleaned_cnbc_headlines.csv")

## A. Sentiment Analysis Using VADER

### A-1: Apply VADER on Headlines

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
# Initialize VADER once; this single analyzer instance scores every headline below.
analyzer = SentimentIntensityAnalyzer()

In [6]:
# VADER compound score for each headline; str() guards against non-string cells
headline_df['sentiment_vader'] = [
    analyzer.polarity_scores(str(text))['compound']
    for text in headline_df['headline']
]

# Optional: label as positive/neutral/negative
def label_sentiment(score, threshold=0.05):
    """Map a sentiment score to 'positive', 'negative' or 'neutral'.

    Parameters
    ----------
    score : float
        Sentiment score to classify (in this notebook, the VADER compound
        score of a single headline).
    threshold : float, default 0.05
        Half-width of the neutral band around zero. Scores at or above
        ``threshold`` are 'positive'; scores at or below ``-threshold`` are
        'negative'.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'. Any score strictly inside the
        band is 'neutral'; so is NaN, because it fails both comparisons.

    Raises
    ------
    ValueError
        If ``threshold`` is negative (the two bands would overlap).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score >= threshold:
        return 'positive'
    elif score <= -threshold:
        return 'negative'
    else:
        return 'neutral'

# Turn each compound score into its positive/neutral/negative label
headline_df['sentiment_label_vader'] = headline_df['sentiment_vader'].map(label_sentiment)

In [7]:
# Preview the first rows, now including the two VADER columns
headline_df.head()

Unnamed: 0,date,headline,sentiment_vader,sentiment_label_vader
0,2020-07-17,Jim Cramer: A better way to invest in the Covi...,0.4404,positive
1,2020-07-17,Cramer's lightning round: I would own Teradyne,0.0,neutral
2,2020-07-17,"Cramer's week ahead: Big week for earnings, ev...",0.0,neutral
3,2020-07-17,IQ Capital CEO Keith Bliss says tech and healt...,0.5719,positive
4,2020-07-16,Wall Street delivered the 'kind of pullback I'...,0.0,neutral


### A-2: Aggregate Sentiment Score Per Day

In [8]:
# 1. Mean VADER compound score of all headlines published on each date
daily_scores = (
    headline_df
    .groupby('date', as_index=False)['sentiment_vader']
    .mean()
    .rename(columns={'sentiment_vader': 'daily_sentiment_score'})
)

daily_scores.head()

Unnamed: 0,date,daily_sentiment_score
0,2018-03-01,-0.083457
1,2018-03-02,-0.0999
2,2018-03-05,-0.1103
3,2018-03-06,0.227525
4,2018-03-07,-0.125275


In [9]:
# 2. Number of headlines published on each date
daily_counts = (
    headline_df
    .groupby('date')
    .size()
    .rename('headline_count')
    .reset_index()
)
daily_counts.head()

Unnamed: 0,date,headline_count
0,2018-03-01,7
1,2018-03-02,6
2,2018-03-05,6
3,2018-03-06,4
4,2018-03-07,8


### A-3: Aggregate Sentiment Label Per Day

* Option 1: Average score — map each headline's label to a number, average those numbers per day, then apply thresholds to map the average back to a label.
* Option 2: Majority count — pick the label that occurs most often among the day's headlines.

In [10]:
# Option 1: Average sentiment score + threshold

# Encode each label on an ordered numeric scale (temporary helper column,
# removed again at the end of this cell)
label_map = {'negative': -1, 'neutral': 0, 'positive': 1}
headline_df['label_score_vader'] = headline_df['sentiment_label_vader'].map(label_map)

# Mean label score per date; converted back to a label further down
daily_label_score = headline_df.groupby('date', as_index=False)['label_score_vader'].mean()

def label_from_score(avg_score, threshold=0.05):
    """Convert a day's average label score back into a sentiment label.

    Parameters
    ----------
    avg_score : float
        Mean of the per-headline label scores for one date, where each
        headline contributes -1 (negative), 0 (neutral) or +1 (positive).
    threshold : float, default 0.05
        Half-width of the neutral band around zero. The default matches the
        cut-off used for individual VADER compound scores.

    Returns
    -------
    str
        'positive' if ``avg_score >= threshold``, 'negative' if
        ``avg_score <= -threshold``, otherwise 'neutral' (this includes NaN).

    Raises
    ------
    ValueError
        If ``threshold`` is negative.

    Notes
    -----
    Because the inputs are averages of integers, a non-zero mean over n
    headlines has magnitude of at least 1/n. For a day with 20 or fewer
    headlines the default band therefore yields 'neutral' only when positive
    and negative headlines cancel exactly; pass a wider ``threshold`` if a
    real neutral zone is wanted.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if avg_score >= threshold:
        return 'positive'
    if avg_score <= -threshold:
        return 'negative'
    return 'neutral'

# Label each date from its mean label score, then discard the numeric column
daily_label_score['daily_sentiment_label'] = daily_label_score['label_score_vader'].map(label_from_score)
del daily_label_score['label_score_vader']

# Remove the temporary helper column from the headlines dataframe again
del headline_df['label_score_vader']

daily_label_score.head()


Unnamed: 0,date,daily_sentiment_label
0,2018-03-01,negative
1,2018-03-02,negative
2,2018-03-05,positive
3,2018-03-06,positive
4,2018-03-07,negative


In [11]:
# Option 2: Majority count - the most frequent headline label on each date
def _most_common(labels):
    """Return the value that occurs most often in ``labels``."""
    return labels.value_counts().idxmax()

majority_label = (
    headline_df
    .groupby('date')['sentiment_label_vader']
    .agg(_most_common)
    .reset_index()
    .rename(columns={'sentiment_label_vader': 'daily_sentiment_label'})
)

majority_label.head()

Unnamed: 0,date,daily_sentiment_label
0,2018-03-01,negative
1,2018-03-02,neutral
2,2018-03-05,positive
3,2018-03-06,positive
4,2018-03-07,negative


### A-4: Merge to Daily Sentiment Dataframe & Save to File

In [12]:
# Join the per-day score, label and headline count on 'date'.
# Label source - option 1 (average label score) is active:
#   daily_label_score
# To use option 2 (majority label) instead, merge `majority_label` in its place:
# daily_sentiment = daily_scores.merge(majority_label, on='date')
daily_sentiment = (
    daily_scores
    .merge(daily_label_score, on='date')
    .merge(daily_counts, on='date')
)

# Persist the final daily VADER dataframe
daily_sentiment.to_csv("../data/daily_sentiment_vader.csv", index=False)

## B. Sentiment Analysis Using FinBERT

### B-1: Load FinBERT Model & Apply on Headlines

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load FinBERT model
# (tokenizer and classification weights are both resolved from the same model id)
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create pipeline for sentiment classification; `finbert` is called on the
# list of headlines in the next cell and returns one {'label', 'score'} dict per input
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use mps:0


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [13]:
# Run FinBERT on all headlines.
# Cast every cell to str first, exactly as the VADER step does: a missing
# headline is read by pandas as a float NaN, which the tokenizer rejects.
headline_texts = [str(text) for text in headline_df['headline']]

# truncation=True is forwarded to the tokenizer so an unusually long headline
# is cut to the model's maximum sequence length instead of raising.
results = finbert(headline_texts, truncation=True)

# Add results to DataFrame: one result dict per headline, in input order.
# Labels are lower-cased so they use the same spelling as the VADER labels.
headline_df['sentiment_finbert'] = [r['label'].lower() for r in results]
headline_df['sentiment_score_finbert'] = [r['score'] for r in results]

**Comment:**

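`sentiment_score_finbert` is the model’s confidence in its predicted label: the softmax probability (between 0 and 1) that FinBERT’s final classification layer assigns to that label. Unlike VADER’s compound score it is not signed — a high value on a `negative` headline means "confidently negative", so it must always be read together with `sentiment_finbert`.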

In [14]:
# Preview the first rows with the VADER and FinBERT columns side by side
headline_df.head()

Unnamed: 0,date,headline,sentiment_vader,sentiment_label_vader,sentiment_finbert,sentiment_score_finbert
0,2020-07-17,Jim Cramer: A better way to invest in the Covi...,0.4404,positive,positive,0.972148
1,2020-07-17,Cramer's lightning round: I would own Teradyne,0.0,neutral,neutral,0.999453
2,2020-07-17,"Cramer's week ahead: Big week for earnings, ev...",0.0,neutral,neutral,0.997833
3,2020-07-17,IQ Capital CEO Keith Bliss says tech and healt...,0.5719,positive,neutral,0.522736
4,2020-07-16,Wall Street delivered the 'kind of pullback I'...,0.0,neutral,neutral,0.986839


### B-2: Aggregate Sentiment Label Per Day & Save Daily Sentiment to File

In [15]:
# Per-date count of headlines in each FinBERT class (0 where a class is absent)
label_counts = pd.crosstab(headline_df['date'], headline_df['sentiment_finbert'])
label_counts.head()

sentiment_finbert,negative,neutral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-01,4,2,1
2018-03-02,1,5,0
2018-03-05,1,3,2
2018-03-06,0,4,0
2018-03-07,0,7,1


In [16]:
# Most frequent FinBERT label among each date's headlines
daily_majority_label = (
    headline_df
    .groupby('date')['sentiment_finbert']
    .agg(lambda labels: labels.value_counts().idxmax())
    .reset_index()
    .rename(columns={'sentiment_finbert': 'finbert_sentiment_label'})
)

daily_majority_label.head()

Unnamed: 0,date,finbert_sentiment_label
0,2018-03-01,negative
1,2018-03-02,neutral
2,2018-03-05,neutral
3,2018-03-06,neutral
4,2018-03-07,neutral


In [19]:
# Combine all features: per-class counts plus the majority label for each date.
# NOTE: this rebinds `daily_sentiment`, which previously held the VADER daily
# frame built in section A-4.
daily_sentiment = (
    label_counts
    .reset_index()
    .merge(daily_majority_label, on='date')
    .rename(columns={'finbert_sentiment_label': 'daily_sentiment_label'})
)

# Preview
daily_sentiment.head()

Unnamed: 0,date,negative,neutral,positive,daily_sentiment_label
0,2018-03-01,4,2,1,negative
1,2018-03-02,1,5,0,neutral
2,2018-03-05,1,3,2,neutral
3,2018-03-06,0,4,0,neutral
4,2018-03-07,0,7,1,neutral


In [20]:
# Save the daily FinBERT dataframe (per-class counts + majority label), without the index column
daily_sentiment.to_csv("../data/daily_sentiment_finbert.csv", index=False)

### B-3: Compare VADER Sentiment and FinBERT Sentiment

In [None]:
# TODO: compare the VADER and FinBERT sentiment labels (not implemented yet)

## C. Visualize Correlation Between Sentiment & Market Movement

In [36]:
# TODO: visualize the correlation between daily sentiment and market movement (not implemented yet)