# Notebook 3: Test of Oseti against other SA models

In [1]:
import oseti
import statistics
import nltk
#nltk.download('all') #runs first time only
from nltk.sentiment import SentimentIntensityAnalyzer
from pathlib import Path
import os
import numpy as np
from transformers import pipeline
import torch
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Note: Here, the Oseti library is used in a slightly modified form. Oseti depends on MeCab tokenization; however, the current version of Oseti has not been adjusted for the updated MeCab. The code could be adapted for the newer MeCab, but in this notebook we use the Neolog Dictionary with Janome as the tokenizer. In practice, this has almost no effect on the sentiment score. In the analysis of the corpora, pure Oseti was used with minor adjustments for compatibility with a newer MeCab version.

In [2]:
# Dictionary-based Oseti analyzer, applied to the Japanese samples below.
analyzer = oseti.Analyzer()
# NLTK's VADER analyzer, applied to the English samples below.
sia = SentimentIntensityAnalyzer()

# Oseti-dictionary based sentiment analysis vs rule-based VADER
Here, we juxtapose the two approaches. The Oseti sentiment analyzer has a built-in sentence tokenizer, while VADER requires a separate tokenizer (such as the one in the NLTK package).

In [3]:
# Folder with the sampled direct-speech texts. The paths are built with pathlib
# instead of hard-coded "\\" separators so the notebook also runs on
# non-Windows systems.
sampling_dir = Path("text samplings")

# Each file is split on newlines into a list of samples. Later cells combine
# the JA and EN lists column-wise, so both files must have the same line count.
text = (sampling_dir / "direct speech sampling JA.txt").read_text(encoding="utf-8")
sampling_ja = text.split("\n")

text = (sampling_dir / "direct speech sampling EN.txt").read_text(encoding="utf-8")
sampling_en = text.split("\n")

In [4]:
# Number of samples per language (length of the newline-split lists).
no_sampling_ja, no_sampling_en = len(sampling_ja), len(sampling_en)

### Oseti results

In [5]:
# One score per Japanese sample: the arithmetic mean of the values returned by
# analyzer.analyze() for that sample (presumably one value per sentence that
# Oseti detects -- confirm against the Oseti documentation).
oseti_sentiment = list(
    map(lambda sample: statistics.mean(analyzer.analyze(sample)), sampling_ja)
)

### VADER results

In [6]:
# One score per English sample: the 'compound' entry of VADER's polarity scores.
vader_sentiment = [
    scores['compound'] for scores in map(sia.polarity_scores, sampling_en)
]

### bert-finetuned-japanese-sentiment
Training dataset: Amazon Reviews\
No.: 20000 reviews\
Link: https://huggingface.co/christian-phu/bert-finetuned-japanese-sentiment

In [7]:
# model needs the following dependencies:
#!pip install fugashi
#!pip install unidic_lite


# Sign applied to the model confidence for each predicted label. Defined once,
# outside the loop, because it does not change between sentences.
label_to_score = {'positive': 1, 'neutral': 0, 'negative': -1}

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="christian-phu/bert-finetuned-japanese-sentiment",
)

bert_sentiment = []
for sent in tqdm(sampling_ja):
    # The pipeline returns a list of predictions; only the first one is used.
    result = sentiment_analyzer(sent)[0]
    # Signed confidence: +score for 'positive', -score for 'negative', and 0
    # for 'neutral' (the neutral confidence itself is discarded).
    compound_score = label_to_score[result['label']] * result['score']
    bert_sentiment.append(compound_score)

Device set to use cpu
100%|██████████| 159/159 [01:07<00:00,  2.35it/s]


### japanese-sentiment-analysis
Training dataset: Corporate financial reports\
No.: 200 reports (6,119 sentences)\
Link: https://huggingface.co/jarvisx17/japanese-sentiment-analysis

In [8]:
# Sign applied to the model confidence for each predicted label. Defined once,
# outside the loop, because it does not change between sentences.
label_to_score = {'positive': 1, 'neutral': 0, 'negative': -1}

sentiment_analyzer_jarv = pipeline(
    "sentiment-analysis",
    model="jarvisx17/japanese-sentiment-analysis",
)

jarv_sentiment = []
for sent in tqdm(sampling_ja):
    # The pipeline returns a list of predictions; only the first one is used.
    result = sentiment_analyzer_jarv(sent)[0]
    # Signed confidence: +score for 'positive', -score for 'negative', and 0
    # for 'neutral' (the neutral confidence itself is discarded).
    compound_score = label_to_score[result['label']] * result['score']
    jarv_sentiment.append(compound_score)

Device set to use cpu
100%|██████████| 159/159 [00:55<00:00,  2.84it/s]


### Japanese Stock Comment Sentiment Model
Training dataset: Comments and discussions related to Japanese stocks\
No.: Not clarified\
Link: https://huggingface.co/c299m/japanese_stock_sentiment\
\
\
This model is inapplicable for SA, as it estimates only market trends in two categories: "bullish" and "bearish".

### Finance-sentiment-ja-base
Training dataset: Japanese financial news\
No.: ≈5,000 sentences/phrases\
Link: https://huggingface.co/bardsai/finance-sentiment-ja-base\
\
The model is inoperable in practice, as in the majority of cases it outputs neutral sentiment scores.

In [9]:
# Sign applied to the model confidence for each predicted label. Defined once,
# outside the loop, because it does not change between sentences.
label_to_score = {'positive': 1, 'neutral': 0, 'negative': -1}

sentiment_analyzer_bardsai = pipeline(
    "sentiment-analysis",
    model="bardsai/finance-sentiment-ja-base",
)

bardsai_sentiment = []
for sent in tqdm(sampling_ja):
    # The pipeline returns a list of predictions; only the first one is used.
    result = sentiment_analyzer_bardsai(sent)[0]
    # Signed confidence: +score for 'positive', -score for 'negative', and 0
    # for 'neutral' (the neutral confidence itself is discarded).
    compound_score = label_to_score[result['label']] * result['score']
    bardsai_sentiment.append(compound_score)

Device set to use cpu
100%|██████████| 159/159 [00:51<00:00,  3.10it/s]


### Models Overview

In [10]:
# Column name -> list of per-sample scores; one row per sample position.
model_scores = {
    "Oseti": oseti_sentiment,
    "VADER": vader_sentiment,
    "bert-finetuned-japanese-sentiment": bert_sentiment,
    "japanese-sentiment-analysis": jarv_sentiment,
    "finance-sentiment-ja-base": bardsai_sentiment,
}
comparative_df = pd.DataFrame(model_scores)

In [11]:
# Persist the per-sample scores of all models (row index included).
overview_csv = "Models Overview Dataframe.csv"
comparative_df.to_csv(overview_csv)

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) per model column.
comparative_df.describe()

Unnamed: 0,Oseti,VADER,bert-finetuned-japanese-sentiment,japanese-sentiment-analysis,finance-sentiment-ja-base
count,159.0,159.0,159.0,159.0,159.0
mean,0.058192,0.002313,0.374328,0.314734,0.010143
std,0.517422,0.384876,0.646258,0.923552,0.145701
min,-1.0,-0.9287,-0.997148,-0.999939,-0.99992
25%,0.0,-0.0644,0.0,-0.988994,0.0
50%,0.0,0.0,0.652621,0.98831,0.0
75%,0.0,0.1378,0.984478,0.999605,0.0
max,1.0,0.9001,0.999272,0.999955,0.99901


The transformer models for Japanese sentiment analysis did not demonstrate a strong rationale for their advantage over the simplistic, dictionary-based method used by Oseti.

1. They do not provide a direct interface for calculating sentiment intensity. Instead, intensity scores are indirectly inferred from the model’s confidence (probability) in classifying a sentence as positive, negative, or neutral.

2. Among the four documented models, only two are operational. The Japanese Stock Comment Sentiment Model is not suitable for this study, as its sentiment classes ("bearish" and "bullish") do not align with the required categories. The finance-sentiment-ja-base model tends to classify most sentences as neutral when applied to samples from the Atomic Bomb Literature corpus.

3. The transformer models did not demonstrate a meaningfully stronger correlation with the VADER model, nor among themselves.

4. Given the advantages of rule-based models like VADER—particularly their transparency and traceability—we consider VADER a reliable reference point. When comparing against this benchmark, Oseti shows significantly better alignment. Although finance-sentiment-ja-base achieved a slightly higher precision (its recall and F1 scores were lower than Oseti's), 155 out of its 159 predictions were classified as neutral, limiting its practical usefulness.

In [13]:
# Pairwise correlation between the models' per-sample scores
# (DataFrame.corr() uses Pearson correlation by default).
comparative_df.corr()

Unnamed: 0,Oseti,VADER,bert-finetuned-japanese-sentiment,japanese-sentiment-analysis,finance-sentiment-ja-base
Oseti,1.0,0.357845,0.268596,0.271637,0.155789
VADER,0.357845,1.0,0.345126,0.381054,0.153501
bert-finetuned-japanese-sentiment,0.268596,0.345126,1.0,0.329962,0.000454
japanese-sentiment-analysis,0.271637,0.381054,0.329962,1.0,0.146033
finance-sentiment-ja-base,0.155789,0.153501,0.000454,0.146033,1.0


In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score

def transform_sentiment(input_scores):
    """Map each score to its sign class.

    Positive scores become 1, negative scores become -1, and everything
    else (i.e. zero) becomes 0. Returns a new list in the input order.
    """
    sign_labels = []
    for value in input_scores:
        if value > 0:
            sign_labels.append(1)
        elif value < 0:
            sign_labels.append(-1)
        else:
            sign_labels.append(0)
    return sign_labels

# Discretise every model's scores into {-1, 0, 1}. VADER serves as the
# reference labelling in the comparison that follows.
y_vader = transform_sentiment(vader_sentiment)
# Fixed: this previously read `oseti_s`, a name that is never defined in the
# notebook and therefore fails on a fresh kernel.
y_oseti = transform_sentiment(oseti_sentiment)
y_bert = transform_sentiment(bert_sentiment)
y_jarv = transform_sentiment(jarv_sentiment)
y_bardsai = transform_sentiment(bardsai_sentiment)

def get_metrics(true_values, predicted_values):
    """Calculates weighted precision, recall, and F1 score.

    Args:
        true_values: Reference class labels.
        predicted_values: Predicted class labels, aligned with `true_values`.

    Returns:
        A list `[precision, recall, f1]`.
    """
    # 'weighted' averages the per-class scores by class support (multi-class).
    # zero_division=0 states explicitly what an undefined per-class score
    # (e.g. a class that is never predicted) counts as. It is the same value
    # the default falls back to, but without the UndefinedMetricWarning.
    shared_options = {"average": "weighted", "zero_division": 0}
    precision = precision_score(true_values, predicted_values, **shared_options)
    recall = recall_score(true_values, predicted_values, **shared_options)
    f1 = f1_score(true_values, predicted_values, **shared_options)
    return [precision, recall, f1]


# Score every model's labels against the VADER labels, which are passed as the
# reference ("true") values.
oseti_metrics, bert_metrics, jarv_metrics, bardsai_metrics = (
    get_metrics(y_vader, predicted)
    for predicted in (y_oseti, y_bert, y_jarv, y_bardsai)
)

# One row per metric, one column per model.
metrics_columns = {
    "Metric": ["Precision", "Recall", "F1"],
    "Oseti": oseti_metrics,
    "bert-finetuned-japanese-sentiment": bert_metrics,
    "japanese-sentiment-analysis": jarv_metrics,
    "finance-sentiment-ja-base": bardsai_metrics,
}
metrics_df = pd.DataFrame(metrics_columns)

metrics_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Metric,Oseti,bert-finetuned-japanese-sentiment,japanese-sentiment-analysis,finance-sentiment-ja-base
0,Precision,0.584939,0.360627,0.230748,0.657199
1,Recall,0.591195,0.345912,0.408805,0.45283
2,F1,0.577118,0.328412,0.289929,0.305955


In [20]:
# Persist the metrics table; index=False keeps the row index out of the file.
metrics_csv = "Models against VADER tests.csv"
metrics_df.to_csv(metrics_csv, index=False)

Average sentiment scores

In [22]:
from statistics import mean

NameError: name 'mean' is not defined