### Compiled models for sentiment analysis
- First portion for headlines, second for abstract

#### Load company data

In [1]:
#import dependencies
import pandas as pd
import numpy as np 

In [3]:
#edit this part for new company
df = pd.read_csv('Apple_Inc_text_data.csv')
# format datetime again
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date
#adding ticker in case we want to merge any DFs
df['ticker'] = 'AAPL'
df = df[['ticker'] + [col for col in df.columns if col != 'ticker']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ticker            2143 non-null   object
 1   pub_date          2143 non-null   object
 2   abstract          2143 non-null   object
 3   lead_para         2124 non-null   object
 4   headline          2143 non-null   object
 5   doc_type          2143 non-null   object
 6   section_name      2143 non-null   object
 7   type_of_material  2142 non-null   object
 8   rank              2143 non-null   int64 
 9   web_url           2143 non-null   object
dtypes: int64(1), object(9)
memory usage: 167.6+ KB


In [4]:
#narrowing down to headlines and abstract
# Select the desired columns
sentiment_df = df[['ticker', 'pub_date', 'headline']]
sentiment_df = sentiment_df.copy()  # Make a copy to avoid modifications on a slice


## Run headlines through all 5 models

- This section compiles all sentiment features for all 5 models for 1 company
- The final dataframe should consist of positive, neagtive and neutral probabilities for all models

### 1. Vader

In [5]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Tammy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
# Run model
sia = SentimentIntensityAnalyzer()

# Function to get full sentiment scores (including probabilities)
def get_sentiment(text):
    # Get the sentiment scores dictionary (positive, neutral, negative, and compound scores)
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

# Apply the sentiment analysis to the 'headline' column
sentiment_df['sentiment_scores'] = sentiment_df['headline'].apply(get_sentiment)

# Separate out the individual sentiment probabilities into new columns
sentiment_df['vader_pos'] = sentiment_df['sentiment_scores'].apply(lambda x: x['pos'])
sentiment_df['vader_neu'] = sentiment_df['sentiment_scores'].apply(lambda x: x['neu'])
sentiment_df['vader_neg'] = sentiment_df['sentiment_scores'].apply(lambda x: x['neg'])

# Function to classify sentiment into Positive, Negative, or Neutral based on the compound score
def classify_sentiment(score):
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the compound score to classify sentiment
sentiment_df['vader_label'] = sentiment_df['sentiment_scores'].apply(lambda x: classify_sentiment(x['compound']))

# Display the result
# Drop the 'sentiment_scores' column from the DataFrame
sentiment_df = sentiment_df.drop(columns=['sentiment_scores'])

# Display the result
sentiment_df.head(5)


Unnamed: 0,ticker,pub_date,headline,vader_pos,vader_neu,vader_neg,vader_label
0,AAPL,2015-04-07,M.B.A. Programs That Get You Where You Want to Go,0.126,0.874,0.0,Positive
1,AAPL,2015-04-14,What We’re Reading,0.0,1.0,0.0,Neutral
2,AAPL,2015-04-13,IBM Creates Watson Health to Analyze Medical Data,0.231,0.769,0.0,Positive
3,AAPL,2015-04-22,What’s That on Beyoncé’s Wrist? Let Me Guess ....,0.0,1.0,0.0,Neutral
4,AAPL,2015-04-01,Daily Report: Tech Leaders Come Together to Op...,0.0,1.0,0.0,Neutral


### 2. Transformer (distilRoBERTa - tuned for financial news sentiment)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("frostedtrees/Fin_distilroberta")
drob_model = AutoModelForSequenceClassification.from_pretrained("frostedtrees/Fin_distilroberta", num_labels=3)  # Assuming 3 sentiments: positive, neutral, negative


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def predict_sentiment(headline):
    inputs = tokenizer(headline, return_tensors="pt", truncation=True, padding=True)
    
    with torch.no_grad():
        outputs = drob_model(**inputs)
    
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)  # Convert logits to probabilities
    
    return probs[0].cpu().numpy()  # Returns an array of probabilities

# Apply prediction to DataFrame
sentiment_probs = sentiment_df['headline'].apply(predict_sentiment)

# Create separate columns for sentiment probabilities
sentiment_df[['drob_neg', 'drob_neu', 'drob_pos']] = pd.DataFrame(sentiment_probs.tolist(), index=sentiment_probs.index)

# Determine final sentiment label based on highest probability and simplify label mapping
sentiment_df['drob_label'] = sentiment_df[['drob_neg', 'drob_neu', 'drob_pos']].idxmax(axis=1)
sentiment_df['drob_label'] = sentiment_df['drob_label'].replace({
    'drob_pos': 'Positive',
    'drob_neu': 'Neutral',
    'drob_neg': 'Negative'
})

sentiment_df.head()


Unnamed: 0,ticker,pub_date,headline,vader_pos,vader_neu,vader_neg,vader_label,drob_neg,drob_neu,drob_pos,drob_label
0,AAPL,2015-04-07,M.B.A. Programs That Get You Where You Want to Go,0.126,0.874,0.0,Positive,0.0214,0.79008,0.18852,Neutral
1,AAPL,2015-04-14,What We’re Reading,0.0,1.0,0.0,Neutral,0.055277,0.798512,0.146211,Neutral
2,AAPL,2015-04-13,IBM Creates Watson Health to Analyze Medical Data,0.231,0.769,0.0,Positive,0.015715,0.696537,0.287748,Neutral
3,AAPL,2015-04-22,What’s That on Beyoncé’s Wrist? Let Me Guess ....,0.0,1.0,0.0,Neutral,0.042261,0.861066,0.096673,Neutral
4,AAPL,2015-04-01,Daily Report: Tech Leaders Come Together to Op...,0.0,1.0,0.0,Neutral,0.50606,0.291502,0.202438,Negative


In [None]:
# import tensorflow as tf

# def drob_polarity_scores(sentence):
#     encoded_text = tokenizer(sentence, return_tensors='tf')
    
#     output = drob_model(encoded_text)
#     scores = output.logits[0].numpy()
#     scores = tf.nn.softmax(scores).numpy()
    
#     scores_dict = {
#         'drob_neg': scores[0],
#         'drob_neu': scores[1],
#         'drob_pos': scores[2]
#     }
#     return scores[0], scores[1], scores[2]

# sentiment_df[['drob_pos','drob_neg','drob_neu']] = sentiment_df['headline'].apply(drob_polarity_scores).apply(pd.Series)

# # Add label based on largest (neg/neu/pos)
# sentiment_df['drob_label'] = sentiment_df.apply(lambda x:'Negative' if x['drob_neg'] >
#                      x['drob_neu'] and x['drob_neu'] > x['drob_pos'] else ('Neutral' if x['drob_neu']>x['drob_pos'] else 'Positive'),axis=1)

# sentiment_df.head() 


In [52]:
# Save the DataFrame to a CSV file just in case 
sentiment_df.to_csv('apple_sentiment_analysis_results.csv', index=False)


## DeBerta


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("tammiloveshf/Fin_DeBerta")
deb_model = AutoModelForSequenceClassification.from_pretrained("tammiloveshf/Fin_DeBerta")

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

In [11]:
import torch

# Function to get sentiment and probabilities for negative, neutral, and positive
def get_sentiment(headline):
    # Tokenize the input headline
    encoded_text = tokenizer(headline, return_tensors='pt', padding=True, truncation=True)
    
    # Get the model's output (logits)
    output = deb_model(**encoded_text)
    
    # Apply softmax to get probabilities for each class (negative, neutral, positive)
    probs = torch.nn.functional.softmax(output.logits, dim=-1)
    
    # Extract sentiment scores (negative, neutral, positive)
    deb_neg, deb_neu, deb_pos = probs[0].detach().numpy()
    
    # Return sentiment scores as well as the label
    return deb_neg, deb_neu, deb_pos, 'Negative' if deb_neg > deb_neu and deb_neg > deb_pos else ('Neutral' if deb_neu > deb_pos else 'Positive')

# Apply the function to the 'headline' column and unpack the values into separate columns
sentiment_df[['deb_pos', 'deb_neg', 'deb_neu', 'deb_label']] = sentiment_df['headline'].apply(get_sentiment).apply(pd.Series)

# Display the first few rows of the DataFrame to verify the results
sentiment_df.head(10)


KeyboardInterrupt: 

In [12]:
sentiment_df.head()

Unnamed: 0,ticker,pub_date,headline,vader_pos,vader_neu,vader_neg,vader_label,drob_neg,drob_neu,drob_pos,drob_label,deb_pos,deb_neg,deb_neu,deb_label
0,AAPL,2015-04-07,M.B.A. Programs That Get You Where You Want to Go,0.126,0.874,0.0,Positive,0.0214,0.79008,0.18852,Neutral,0.001556,0.910922,0.087523,Neutral
1,AAPL,2015-04-14,What We’re Reading,0.0,1.0,0.0,Neutral,0.055277,0.798512,0.146211,Neutral,0.012333,0.986333,0.001334,Neutral
2,AAPL,2015-04-13,IBM Creates Watson Health to Analyze Medical Data,0.231,0.769,0.0,Positive,0.015715,0.696537,0.287748,Neutral,0.001769,0.969001,0.029229,Neutral
3,AAPL,2015-04-22,What’s That on Beyoncé’s Wrist? Let Me Guess ....,0.0,1.0,0.0,Neutral,0.042261,0.861066,0.096673,Neutral,0.059633,0.933838,0.006529,Neutral
4,AAPL,2015-04-01,Daily Report: Tech Leaders Come Together to Op...,0.0,1.0,0.0,Neutral,0.50606,0.291502,0.202438,Negative,0.910898,0.066995,0.022107,Negative


In [13]:
# Save the DataFrame to a CSV file just in case 
sentiment_df.to_csv('finetuned_sentiment_model.csv', index=False)

## Run abstract through all 5 models


In [40]:
#narrowing down to headlines and abstract
# Select the desired columns
sentiment_df = df[['ticker', 'pub_date', 'abstract']]
sentiment_df = sentiment_df.copy()  # Make a copy to avoid modifications on a slice
sentiment_df.head()

Unnamed: 0,ticker,pub_date,abstract
0,AAPL,2015-04-07,"Want to work at Amazon, Apple or McKinsey? Som..."
1,AAPL,2015-04-14,Get recommendations from New York Times report...
2,AAPL,2015-04-13,The business unit will partner with companies ...
3,AAPL,2015-04-22,"With superstars first in line, Apple appears t..."
4,AAPL,2015-04-01,"In an industry that avoids controversy, the he..."


### 1. Vader

In [None]:
# Run model
sia = SentimentIntensityAnalyzer()

# Function to get full sentiment scores (including probabilities)
def get_sentiment(text):
    # Get the sentiment scores dictionary (positive, neutral, negative, and compound scores)
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

# Apply the sentiment analysis to the 'headline' column
sentiment_df['sentiment_scores'] = sentiment_df['abstract'].apply(get_sentiment)

# Separate out the individual sentiment probabilities into new columns
sentiment_df['vader_pos'] = sentiment_df['sentiment_scores'].apply(lambda x: x['pos'])
sentiment_df['vader_neu'] = sentiment_df['sentiment_scores'].apply(lambda x: x['neu'])
sentiment_df['vader_neg'] = sentiment_df['sentiment_scores'].apply(lambda x: x['neg'])

# Function to classify sentiment into Positive, Negative, or Neutral based on the compound score
def classify_sentiment(score):
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the compound score to classify sentiment
sentiment_df['vader_label'] = sentiment_df['sentiment_scores'].apply(lambda x: classify_sentiment(x['compound']))

# Display the result
# Drop the 'sentiment_scores' column from the DataFrame
sentiment_df = sentiment_df.drop(columns=['sentiment_scores'])

# Display the result
sentiment_df.head(5)


### 2. Transformer (distilRoBERTa - tuned on financial phrasebank)

In [None]:
import tensorflow as tf

def drob_polarity_scores(sentence):
    encoded_text = tokenizer(sentence, return_tensors='tf')
    
    output = drob_model(encoded_text)
    scores = output.logits[0].numpy()
    scores = tf.nn.softmax(scores).numpy()
    
    # scores_dict = {
    #     'drob_neg': scores[0],
    #     'drob_neu': scores[1],
    #     'drob_pos': scores[2]
    # }
    # return scores[0], scores[1], scores[2]

sentiment_df[['drob_pos','drob_neg','drob_neu']] = sentiment_df['headline'].apply(drob_polarity_scores).apply(pd.Series)

# Add label based on largest (neg/neu/pos)
sentiment_df['drob_label'] = sentiment_df.apply(lambda x:'Negative' if x['drob_neg'] >
                     x['drob_neu'] and x['drob_neu'] > x['drob_pos'] else ('Neutral' if x['drob_neu']>x['drob_pos'] else 'Positive'),axis=1)

sentiment_df.head()


## DeBerta


In [None]:
import torch

# Function to get sentiment and probabilities for negative, neutral, and positive
def get_sentiment(headline):
    # Tokenize the input headline
    encoded_text = tokenizer(headline, return_tensors='pt', padding=True, truncation=True)
    
    # Get the model's output (logits)
    output = deb_model(**encoded_text)
    
    # Apply softmax to get probabilities for each class (negative, neutral, positive)
    probs = torch.nn.functional.softmax(output.logits, dim=-1)
    
    # Extract sentiment scores (negative, neutral, positive)
    deb_neg, deb_neu, deb_pos = probs[0].detach().numpy()
    
    # Return sentiment scores as well as the label
    return deb_neg, deb_neu, deb_pos, 'Negative' if deb_neg > deb_neu and deb_neg > deb_pos else ('Neutral' if deb_neu > deb_pos else 'Positive')

# Apply the function to the 'headline' column and unpack the values into separate columns
sentiment_df[['deb_pos', 'deb_neg', 'deb_neu', 'deb_sentiment']] = sentiment_df['headline'].apply(get_sentiment).apply(pd.Series)

# Display the first few rows of the DataFrame to verify the results
sentiment_df.head()
