### Sentiment Analysis

#### 1. Connect to the server

In [1]:
import os

import numpy as np
import pandas as pd
from clickhouse_driver import Client

In [2]:
# Connection settings for the course ClickHouse server.
# Every value can be overridden through an environment variable, so the
# notebook can be pointed at another server (or given a rotated password)
# without editing code; the fallbacks are the original shared course values.
# NOTE(review): the fallback password is still committed in plain text; drop
# the default and require TIINGO_DB_PASSWORD before sharing this notebook.
host_name = os.environ.get('TIINGO_DB_HOST', 'chenlin01.fbe.hku.hk')
user_name = os.environ.get('TIINGO_DB_USER', 'mfin7037_best_students')
pswd = os.environ.get('TIINGO_DB_PASSWORD', 'alanisthecoolest123')
db_name = os.environ.get('TIINGO_DB_NAME', 'tiingo')
# Environment values are strings, so the port is converted back to an int.
port = int(os.environ.get('TIINGO_DB_PORT', '9000'))

In [3]:
# Gather the connection settings in one mapping, then hand them to the driver.
connection_settings = {
    'host': host_name,
    'user': user_name,
    'password': pswd,
    'database': db_name,
    'port': port,
}
client = Client(**connection_settings)

In [None]:
# List the databases on the server.
client.execute('SHOW DATABASES')

In [82]:
# List the tables in the current database (the recorded run shows only `news`).
client.execute('SHOW TABLES')

[('news',)]

In [83]:
# Select the `tiingo` database, then preview ten rows of `news`.
# NOTE(review): the client was created with database=db_name, so the USE
# statement looks redundant; confirm before removing it.
client.execute('USE tiingo')
# Rows come back as plain tuples with no column names attached.
client.execute('SELECT * FROM news limit 10')

[('2018-05-02T12:14:50.841934+00:00',
  'Yamana Gold Inc. (NYSE:AUY) shares are down more than -8.65% this year and recently decreased -0.70% or -$0.02 to settle at $2.85. SM Energy Company (NYSE:SM), on the other hand, is up 6.61% year to date as of 05/01/2018. It currently trades at $23.54 and has returned 3.11% during the past week. Yamana Gold Inc.…',
  10471811.0,
  '2018-05-02T11:06:33+00:00',
  'stocknewsgazette.com',
  'Energy/Materials/Stock',
  'sm',
  'auy/sm',
  'Yamana Gold Inc. (AUY) vs. SM Energy Company (SM): Which is the Better Investment?',
  'stocknewsgazette.com',
  'https://stocknewsgazette.com/2018/05/02/yamana-gold-inc-auy-vs-sm-energy-company-sm-which-is-the-better-investment/'),
 ('2018-05-02T12:15:20.345795+00:00',
  'AGNC Investment Corp. (NASDAQ:AGNC) shares are down more than -6.59% this year and recently decreased -0.32% or -$0.06 to settle at $18.86. B2Gold Corp. (NYSE:BTG), on the other hand, is down -9.03% year to date as of 05/01/2018. It currently tra

In [84]:
# Pull a repeatable sample: the ten highest-id rows that have a title.
# with_column_types=True makes the driver return the result set's own
# (name, type) pairs next to the rows, so the DataFrame header is taken from
# the data actually returned instead of a separate DESCRIBE TABLE round trip
# that could list a different set of columns than SELECT * yields.
consistent_sample, column_info = client.execute('''
    SELECT *
    FROM news
    WHERE title != ''
    ORDER BY id DESC
    LIMIT 10
''', with_column_types=True)

# Each column_info entry starts with the column name, as before.
columns = [col[0] for col in column_info]

# Create DataFrame
news_sample_df = pd.DataFrame(consistent_sample, columns=columns)

In [85]:
# Close the ClickHouse connection; the sample is already held in `news_sample_df`.
client.disconnect()

#### 2. Testing FinBERT

In [1]:
# Package versions this section was run with:
#   torch            2.2.2
#   pandas           2.2.1
#   numpy            1.26.4
#   scipy            1.13.0
#   huggingface-hub  0.29.1

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
import torch
# Report the PyTorch build and whether CUDA is visible to it.
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')  # Should be True on servers
print(f'CUDA version: {torch.version.cuda}')  # Should show 12.1 on servers
import pandas as pd

PyTorch version: 2.2.2
CUDA available: False
CUDA version: None


In [2]:
# Load the tokenizer and the sequence-classification model from one checkpoint name.
finbert_checkpoint = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [4]:
def finbert_sentiment(
    text: str, max_length: int = 512
) -> tuple[float, float, float, float, str]:
    """Score one text with the notebook-level FinBERT ``tokenizer`` and ``model``.

    Args:
        text: Raw text to classify; it is tokenized as-is and truncated to the
            token limit.
        max_length: Upper bound on the number of tokens. It is additionally
            capped at ``model.config.max_position_embeddings`` when the config
            exposes it, because a longer sequence cannot be embedded.

    Returns:
        ``(positive, negative, neutral, sentiment_score, label)`` where the
        first three are softmax probabilities, ``sentiment_score`` is
        ``positive - negative`` and ``label`` is the class with the highest
        probability.

    Raises:
        KeyError: If the model config has no ``positive``, ``negative`` or
            ``neutral`` label.
    """
    # Resolve classes by name so the result does not depend on the id order
    # stored in the checkpoint's config.
    index_to_label = {int(idx): name for idx, name in model.config.id2label.items()}
    label_to_index = {str(name).lower(): idx for idx, name in index_to_label.items()}

    position_limit = getattr(model.config, 'max_position_embeddings', None)
    token_limit = min(max_length, position_limit) if position_limit else max_length

    # Inference only: no gradients are needed.
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=token_limit,
        )
        outputs = model(**inputs)

    # convert logits to probabilities (moved to CPU first so .numpy() is valid)
    logits = outputs.logits.detach().cpu().numpy().squeeze()
    probabilities = scipy.special.softmax(logits)

    positive = probabilities[label_to_index['positive']]
    negative = probabilities[label_to_index['negative']]
    neutral = probabilities[label_to_index['neutral']]

    # The label is chosen from the class probabilities only, never from the
    # derived score.
    label = index_to_label[int(probabilities.argmax())]

    return (
        positive,
        negative,
        neutral,
        positive - negative,  # sentiment score
        label,
    )

(0.12906198, 0.032153457, 0.8387846, 0.096908525, 'neutral')

In [93]:
# Work on a copy so `news_sample_df` keeps the untouched query result.
df = news_sample_df.copy()
df.head()

Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,url
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,https://screenrant.com/rings-power-show-racist...
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,https://www.investing.com/news/north-west-earn...


In [103]:
# Notice that this is the raw text, no preprocessing
df[['finbert_pos', 'finbert_neg', 'finbert_neu', 'finbert_score', 'finbert_sentiment']] = (
    df['description'].apply(finbert_sentiment).apply(pd.Series)
)

In [98]:
df.head()

Unnamed: 0,crawlDate,description,id,publishedDate,source,tags,ticker,tickers_all,title,tld,url,finbert_pos,finbert_neg,finbert_neu,finbert_sentiment_score,finbert_sentiment,finbert_score
0,2022-09-08 03:59:27,BALTIMORE (AP) — Alek Manoah retiró a 22 de su...,47305095.0,2022-09-08 03:24:19,houstonchronicle.com,,,,Azulejos ganan 3 de 4; Manoah anula a Orioles,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.410662,0.019876,0.569462,0.390786,neutral,0.390786
1,2022-09-08 03:59:24,CHICAGO (AP) — El dominicano Arístides Aquino ...,47305094.0,2022-09-08 03:37:30,houstonchronicle.com,Dominica/Dominican Republic,,,"Con 2 jonrones de Aquino, Rojos aplastan a Cac...",houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.130872,0.01829,0.850838,0.112582,neutral,0.112582
2,2022-09-08 03:59:22,Prime Video has released a statement.,47305093.0,2022-09-08 03:58:52,screenrant.com,Tv News,,,Rings of Power Officially Defends Cast Amid Ra...,screenrant.com,https://screenrant.com/rings-power-show-racist...,0.020164,0.054695,0.925141,-0.034531,neutral,-0.034531
3,2022-09-08 03:59:21,"KANSAS CITY, Mo. (AP) — Salvador Perez’s sacri...",47305092.0,2022-09-08 03:38:50,houstonchronicle.com,%2Fnews%2Fsports+News/%2Fsports%2Fteam+Sports%...,,,Perez's sac fly in 9th lifts Royals to win ove...,houstonchronicle.com,https://www.houstonchronicle.com/sports/articl...,0.617857,0.034792,0.347351,0.583064,positive,0.583064
4,2022-09-08 03:58:26,"North West Earnings Miss, Revenue Beats In Q2",47305091.0,2022-09-08 03:40:37,investing.com,Consumer Defensive/Financials/Stock/Unknown Se...,stkl,mdf/metr/mtraf/stkl,"North West Earnings Miss, Revenue Beats In Q2",investing.com,https://www.investing.com/news/north-west-earn...,0.525914,0.450251,0.023835,0.075663,positive,0.075663
