In [1]:
import evaluate
from datasets import disable_caching, load_from_disk
from transformers import (
    AutoTokenizer,
    pipeline,
)
from transformers.pipelines.pt_utils import KeyDataset

In [2]:
# Shared evaluation inputs used by every cell below.
# Test split, read back from its on-disk `datasets` directory.
test = load_from_disk("../data/test/")
# Metric object whose compute() is called after each model run.
accuracy = evaluate.load("accuracy")
# Tokenizer handed to every pipeline() call in this notebook.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

In [4]:
# Score the checkpoint stored at ./SentimentPhoBERT on the test split.
classifier = pipeline(
    model="./SentimentPhoBERT",
    task="text-classification",
    device=0,
    tokenizer=tokenizer,
)
# The pipeline yields one {"label": "LABEL_<n>", "score": ...} dict per
# sentence; strip the "LABEL_" prefix to get the integer class id that the
# accuracy metric is compared against.
predictions = [
    int(output["label"].replace("LABEL_", ""))
    for output in classifier(
        KeyDataset(test, "sentence"),
        batch_size=8,
        max_length=256,
        truncation=True,
        padding=True,
    )
]
accuracy.compute(references=test["labels"], predictions=predictions)

{'accuracy': 0.81175}

In [5]:
# Score the checkpoint stored at ./SentimentPhoBERT-LoRA-150 on the test
# split; inputs are truncated to 150 tokens for this run.
classifier = pipeline(
    model="./SentimentPhoBERT-LoRA-150",
    task="text-classification",
    device=0,
    tokenizer=tokenizer,
)
# Each pipeline output is a {"label": "LABEL_<n>", "score": ...} dict; keep
# only the integer class id so it can be compared with test["labels"].
predictions = [
    int(output["label"].replace("LABEL_", ""))
    for output in classifier(
        KeyDataset(test, "sentence"),
        batch_size=8,
        max_length=150,
        truncation=True,
        padding=True,
    )
]
accuracy.compute(references=test["labels"], predictions=predictions)

{'accuracy': 0.89725}

In [5]:
# Pipeline for the LoRA-256 checkpoint, reusing the shared tokenizer.
# NOTE(review): this path goes one directory up ("./../"), unlike the other
# checkpoints which are loaded from "./" -- confirm that is intentional.
classifier = pipeline(
    model="./../SentimentPhoBERT-LoRA-256",
    task="text-classification",
    device=0,
    tokenizer=tokenizer,
)

In [43]:
# Run the current `classifier` over every test sentence and keep the raw
# output dicts (label + score); the score is needed later to spot
# low-confidence predictions.
predictions = [
    *classifier(
        KeyDataset(test, "sentence"),
        batch_size=8,
        max_length=256,
        truncation=True,
        padding=True,
    )
]

In [7]:
# Bootstrap the Java VietSentiWordNet opinion finder through pyjnius.
# The classpath is registered on `jnius_config` *before* `jnius` is imported;
# pyjnius requires JVM options to be set before the JVM is started, so keep
# this ordering (which is also why these imports are not in the top cell).
import jnius_config

# Relative jar paths: resolved against the notebook's working directory.
jnius_config.add_classpath(
    "libs/jvntextpro.jar",
    "libs/vietsentiwordnet_v1.0.jar",
)

from jnius import autoclass

# Bind the Java classes used below to Python proxies.
OpinionFinder = autoclass("vnu.uet.vietsentiwordnet.apis.OpinionFinder")
# `sl` holds the return value of loadModels(); later cells call
# sl.doSenLevel(...) on it. Loading prints the Java-side log shown below.
sl = OpinionFinder.getInstance().loadModels()
ResultObject = autoclass("vnu.uet.vietsentiwordnet.objects.ResultObject")
# Placeholder instance; `res` is rebound to each doSenLevel() result in the
# scoring loop further down.
res = ResultObject()
javaclass_String = autoclass("java.lang.String")

Initilize JVnSenSegmenter ...
Reading options ...
Reading options completed!
Reading the context predicate maps ...
Reading context predicate maps (3326 entries) completed!
Reading the context predicate maps ...
Reading label maps (2 entries) completed!
Reading dictionary ...
Reading dictionary (3328 entries) completed!
Reading features ...
Reading 3908 features completed!
numFetures: 3908
Reading the context predicate maps ...
Reading context predicate maps (3326 entries) completed!
Reading the context predicate maps ...
Reading label maps (2 entries) completed!
Reading dictionary ...
Reading dictionary (3328 entries) completed!
Reading features ...
Reading 3908 features completed!
numFetures: 3908
Initilize JVnSegmenter ...
models/jvnsegmenter
Reading options ...
Reading options completed!
Reading the context predicate maps ...
Reading context predicate maps (292546 entries) completed!
Reading label maps ...
Reading label maps (3 entries) completed!
Reading dictionary ...
Reading dic

In [42]:
# Lexicon-based score for every test sentence, in dataset order: each Python
# string is wrapped in a java.lang.String, passed to doSenLevel(), and the
# result's getScore() value is stored as a Python float.
gg = [
    float(sl.doSenLevel(javaclass_String(text)).getScore())
    for text in test["sentence"]
]

Starting inference ...
sequence 1
sequence 2
Inference 2 sequences completed!
Inference time: 0.001 seconds
Starting inference ...
sequence 1
sequence 2
sequence 3
sequence 4
sequence 5
sequence 6
sequence 7
Inference 7 sequences completed!
Inference time: 0.001 seconds
Starting inference ...
sequence 1
Inference 1 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
Inference 1 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
Inference 1 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
sequence 2
Inference 2 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
Inference 1 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
sequence 2
sequence 3
sequence 4
sequence 5
sequence 6
sequence 7
Inference 7 sequences completed!
Inference time: 0.0 seconds
Starting inference ...
sequence 1
sequence 2
sequence 3
sequence 4
sequence 5
sequ

In [35]:
# Peek at the first ten raw pipeline outputs (label string + confidence).
print(predictions[0:10])

[{'label': 'LABEL_1', 'score': 0.9333884119987488}, {'label': 'LABEL_1', 'score': 0.9455210566520691}, {'label': 'LABEL_1', 'score': 0.9404779076576233}, {'label': 'LABEL_1', 'score': 0.843597948551178}, {'label': 'LABEL_0', 'score': 0.9273842573165894}, {'label': 'LABEL_1', 'score': 0.8446128964424133}, {'label': 'LABEL_0', 'score': 0.9390023946762085}, {'label': 'LABEL_0', 'score': 0.9360643625259399}, {'label': 'LABEL_0', 'score': 0.8957539200782776}, {'label': 'LABEL_0', 'score': 0.9370844960212708}]


In [47]:
# Blend the classifier output with the lexicon scores in `gg`: wherever the
# classifier's confidence is below 0.6, the sign of the lexicon score decides
# the label (a score of exactly 0 leaves the classifier's label untouched).
#
# Each prediction dict is copied individually. A plain `predictions.copy()`
# is shallow -- the new list would share its dicts with `predictions`, so
# overriding a label here would silently rewrite the raw model output too.
# NOTE(review): positive lexicon score -> LABEL_0, negative -> LABEL_1;
# confirm this matches the dataset's label encoding.
combined = [dict(prediction) for prediction in predictions]
for i, entry in enumerate(combined):
    if entry["score"] < 0.6:
        if gg[i] > 0:
            entry["label"] = "LABEL_0"
        elif gg[i] < 0:
            entry["label"] = "LABEL_1"

In [58]:
# Indices of the test examples whose classifier confidence is below 0.6.
low_confidences = [
    i for i, prediction in enumerate(predictions) if prediction["score"] < 0.6
]

# Read each column once instead of re-indexing the dataset on every
# iteration of the write loop.
test_sentences = test["sentence"]
test_labels = test["labels"]

# Dump four lines per low-confidence example for manual inspection:
# sentence, raw prediction dict, lexicon score, gold label.
# The encoding is explicit because the sentences are Vietnamese text and the
# platform default encoding is not guaranteed to be able to represent them.
with open("low_confidences.txt", "w", encoding="utf-8") as f:
    for i in low_confidences:
        f.write(test_sentences[i] + "\n")
        f.write(str(predictions[i]) + "\n")
        f.write(str(gg[i]) + "\n")
        f.write("LABEL_" + str(test_labels[i]) + "\n")

In [45]:
# Accuracy of the blended predictions.
# NOTE: this rebinds `combined` from a list of dicts to a list of ints, so
# the cell cannot be run twice without rebuilding `combined` first.
combined = list(
    map(lambda entry: int(entry["label"].replace("LABEL_", "")), combined)
)
accuracy.compute(references=test["labels"], predictions=combined)

{'accuracy': 0.899875}