### Sentiment Analysis

#### 1. Connect to the server

In [3]:
import pandas as pd
import numpy as np
from clickhouse_driver import Client

In [4]:
# macro database information, do not change
host_name = 'chenlin01.fbe.hku.hk'
user_name = 'mfin7037_best_students'
pswd = 'alanisthecoolest123' # input password
db_name = 'tiingo'
port = 9000

In [5]:
client = Client(
    host=host_name,
    user=user_name,
    password=pswd,
    database=db_name,
    port=port,
)

In [6]:
client.execute('SHOW DATABASES')

[('default',), ('tiingo',)]

In [7]:
client.execute('SHOW TABLES')

[('news',)]

In [8]:
client.execute('USE tiingo')
client.execute('SELECT * FROM news limit 10')

[('2018-05-02T12:14:50.841934+00:00',
  'Yamana Gold Inc. (NYSE:AUY) shares are down more than -8.65% this year and recently decreased -0.70% or -$0.02 to settle at $2.85. SM Energy Company (NYSE:SM), on the other hand, is up 6.61% year to date as of 05/01/2018. It currently trades at $23.54 and has returned 3.11% during the past week. Yamana Gold Inc.…',
  10471811.0,
  '2018-05-02T11:06:33+00:00',
  'stocknewsgazette.com',
  'Energy/Materials/Stock',
  'sm',
  'auy/sm',
  'Yamana Gold Inc. (AUY) vs. SM Energy Company (SM): Which is the Better Investment?',
  'stocknewsgazette.com',
  'https://stocknewsgazette.com/2018/05/02/yamana-gold-inc-auy-vs-sm-energy-company-sm-which-is-the-better-investment/'),
 ('2018-05-02T12:15:20.345795+00:00',
  'AGNC Investment Corp. (NASDAQ:AGNC) shares are down more than -6.59% this year and recently decreased -0.32% or -$0.06 to settle at $18.86. B2Gold Corp. (NYSE:BTG), on the other hand, is down -9.03% year to date as of 05/01/2018. It currently tra

In [9]:
# Use a stable sort key
consistent_sample = client.execute('''
    SELECT *
    FROM news
    WHERE title != ''
    ORDER BY id DESC
    LIMIT 10
''')

# Get column names from table structure
column_info = client.execute('DESCRIBE TABLE news')
columns = [col[0] for col in column_info]  # Extract column names

# Create DataFrame
news_sample_df = pd.DataFrame(consistent_sample, columns=columns)

In [10]:
client.disconnect()

#### 2. Testing for FinBERT

In [11]:
# requirements 
# torch        2.2.2 
# pandas       2.2.1
# numpy        1.26.4
# scipy        1.13.0
# huggingface-hub     0.29.1

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
import torch
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')  # Should be True on servers
print(f'CUDA version: {torch.version.cuda}')  # Should show 12.1 on servers
import pandas as pd

PyTorch version: 2.2.2
CUDA available: False
CUDA version: None


In [12]:
def calc_sentiment(text: str, tokenizer: AutoTokenizer, model: AutoModelForSequenceClassification, model_name: str) -> tuple:
    '''
    Analyzes the sentiment of a given text using the given tokenizer and model.

    Args:
        text (str): The text to analyze.
        tokenizer: The tokenizer to use.
        model: The model to use. Model should be a classification model.
        model_name (str): Name of the model - different model different output format.
    Returns:
        tuple: A tuple containing:
            - float: Probability of positive sentiment.
            - float: Probability of negative sentiment.
            - float: Probability of neutral sentiment.
            - float: Sentiment score (positive - negative).
            - str: Sentiment label with the highest probability.
    '''
    with torch.no_grad():
        inputs = tokenizer(
            text, return_tensors='pt', padding=True, truncation=True, max_length=1024
        )
        outputs = model(**inputs)

        # convert logits to probabilities
        values = scipy.special.softmax(outputs.logits.numpy().squeeze())

        # initialize result, check none result in the upstream functions
        result = None

        if model_name == 'finbert':
            labels = [*model.config.id2label.values()]
            result = (
                values[0], # positive
                values[1], # negative
                values[2], # neutral
                values[0] - values[1], # sentiment score
                labels[np.argmax(values)], # sentiment label (max probability label)
            )

        if model_name == 'sst2' or model_name == 'deberta':
            label_map = {
                0: 'negative',
                1: 'positive'
            }
            label_idx = np.argmax(values)
            result = (
                values[1], # positive
                values[0], # negative
                values[1] - values[0], # sentiment score
                label_map[label_idx], # sentiment label (max probability label)
            )

        return result

In [13]:
def finbert_sentiment_fill_df(df: pd.DataFrame, text_col: str) -> pd.DataFrame:
    '''
    This function takes a DataFrame and a column name containing text data, and returns a new DataFrame
    with additional columns for FinBERT sentiment analysis scores and labels.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing text data.
    text_col (str): The name of the column in the DataFrame that contains the text data to be analyzed.

    Returns:
    pd.DataFrame: A new DataFrame with the original data and additional columns:
        - finbert_pos: Probability of positive sentiment.
        - finbert_neg: Probability of negative sentiment.
        - finbert_neu: Probability of neutral sentiment.
        - finbert_score: Sentiment score calculated as (positive - negative).
        - finbert_sentiment: Sentiment label with the highest probability.
    '''
    tokenizer_finbert = AutoTokenizer.from_pretrained('ProsusAI/finbert')
    model_finbert = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')

    copy_df = df.copy()
    copy_df[['finbert_pos', 'finbert_neg', 'finbert_neu', 'finbert_stmt_score', 'finbert_stmt_label']] = (
        copy_df[text_col].apply(lambda x: calc_sentiment(x, tokenizer_finbert, model_finbert, model_name='finbert')).apply(pd.Series)
    )
    return copy_df

In [14]:
# example usage:
test_df = news_sample_df.copy()
test_df = finbert_sentiment_fill_df(test_df, text_col='description')
test_df.head()

Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,url,finbert_pos,finbert_neg,finbert_neu,finbert_stmt_score,finbert_stmt_label
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.410662,0.019876,0.569462,0.390786,neutral
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.130872,0.01829,0.850838,0.112582,neutral
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,https://screenrant.com/rings-power-show-racist...,0.020164,0.054695,0.925141,-0.034531,neutral
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.617857,0.034792,0.347351,0.583064,positive
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,https://www.investing.com/news/north-west-earn...,0.525914,0.450251,0.023835,0.075663,positive


#### 3. Testing other pre-trained models

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# BERT Base SST-2 (General Purpose)
def sst2_sentiment_fill_df(df: pd.DataFrame, text_col: str) -> pd.DataFrame:
    tokenizer_sst2 = AutoTokenizer.from_pretrained('textattack/bert-base-uncased-SST-2')
    model_sst2 = AutoModelForSequenceClassification.from_pretrained('textattack/bert-base-uncased-SST-2')

    copy_df = df.copy()
    copy_df[['sst2_pos', 'sst2_neg', 'sst2_stmt_score', 'sst2_stmt_label']] = (
        copy_df[text_col].apply(lambda x: calc_sentiment(x, tokenizer_sst2, model_sst2, model_name='sst2')).apply(pd.Series)
    )
    return copy_df

In [16]:
# example usage:
test_df = news_sample_df.copy()
test_df = sst2_sentiment_fill_df(test_df, text_col='description')
test_df.head()

Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,url,sst2_pos,sst2_neg,sst2_stmt_score,sst2_stmt_label
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.428736,0.571264,-0.142527,negative
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.952399,0.047601,0.904799,positive
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,https://screenrant.com/rings-power-show-racist...,0.373591,0.626409,-0.252818,negative
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.991647,0.008353,0.983293,positive
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,https://www.investing.com/news/north-west-earn...,0.01477,0.98523,-0.970459,negative


In [17]:
# combine with finbert
test_df = finbert_sentiment_fill_df(test_df, text_col='description')
test_df.head()

Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,url,sst2_pos,sst2_neg,sst2_stmt_score,sst2_stmt_label,finbert_pos,finbert_neg,finbert_neu,finbert_stmt_score,finbert_stmt_label
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.428736,0.571264,-0.142527,negative,0.410662,0.019876,0.569462,0.390786,neutral
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.952399,0.047601,0.904799,positive,0.130872,0.01829,0.850838,0.112582,neutral
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,https://screenrant.com/rings-power-show-racist...,0.373591,0.626409,-0.252818,negative,0.020164,0.054695,0.925141,-0.034531,neutral
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.991647,0.008353,0.983293,positive,0.617857,0.034792,0.347351,0.583064,positive
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,https://www.investing.com/news/north-west-earn...,0.01477,0.98523,-0.970459,negative,0.525914,0.450251,0.023835,0.075663,positive


In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# DeBERTa-v3 (State-of-the-Art) higher accuracy but larger
tokenizer_deberta = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
model_deberta = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base')

def deberta_sentiment_fill_df(df: pd.DataFrame, text_col: str) -> pd.DataFrame:
    tokenizer_deberta = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
    model_deberta = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base')

    copy_df = df.copy()
    copy_df[['deberta_pos', 'deberta_neg', 'deberta_stmt_score', 'deberta_stmt_label']] = (
        copy_df[text_col].apply(lambda x: calc_sentiment(x, tokenizer_deberta, model_deberta, model_name='deberta')).apply(pd.Series)
    )
    return copy_df

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
test_df = deberta_sentiment_fill_df(test_df, text_col='description')
test_df.head()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,...,sst2_stmt_label,finbert_pos,finbert_neg,finbert_neu,finbert_stmt_score,finbert_stmt_label,deberta_pos,deberta_neg,deberta_stmt_score,deberta_stmt_label
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,...,negative,0.410662,0.019876,0.569462,0.390786,neutral,0.517502,0.482498,0.035004,positive
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,...,positive,0.130872,0.01829,0.850838,0.112582,neutral,0.515786,0.484214,0.031571,positive
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,...,negative,0.020164,0.054695,0.925141,-0.034531,neutral,0.520406,0.479594,0.040811,positive
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,...,positive,0.617857,0.034792,0.347351,0.583064,positive,0.518034,0.481966,0.036069,positive
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,...,negative,0.525914,0.450251,0.023835,0.075663,positive,0.515076,0.484924,0.030152,positive
