In [37]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import torch


#### Load model
model_path = '5CD-AI/Vietnamese-Sentiment-visobert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to("cuda")

In [38]:
import pandas as pd
dataset = pd.read_csv("/home/ldap-users-2/internship_2025/nguyenphuong-p/temp_data/socal_uit.csv")

In [39]:
def tokenize_data(text, tokenizer, max_length=256):
    result_tokenize = tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

    return {
        "vietnamese_text_input_ids": result_tokenize["input_ids"],
        "vietnamese_text_attention_mask": result_tokenize["attention_mask"],
    }


In [40]:
tokenized_data = tokenize_data(dataset["VietnameseText"].tolist(), tokenizer)
tokenized_data

{'vietnamese_text_input_ids': tensor([[   0, 3441, 2539,  ...,    1,    1,    1],
         [   0, 8243,   62,  ...,    1,    1,    1],
         [   0, 2862,  133,  ...,    1,    1,    1],
         ...,
         [   0, 1415,  188,  ...,    1,    1,    1],
         [   0, 2763,  478,  ...,    1,    1,    1],
         [   0, 1415, 1882,  ...,    1,    1,    1]]),
 'vietnamese_text_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [41]:
from transformers import DataCollatorWithPadding
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data    
    def __len__(self):
        return len(self.data["vietnamese_text_input_ids"])
    def __getitem__(self, index):
        return {
            "vietnamese_text_input_ids": self.data["vietnamese_text_input_ids"][index],
            "vietnamese_text_attention_mask": self.data["vietnamese_text_attention_mask"][index]
        }

def collate_fn(batch):
    vietnamese_text_input_ids = [sample["vietnamese_text_input_ids"] for sample in batch]
    vietnamese_text_attention_mask = [sample["vietnamese_text_attention_mask"] for sample in batch]
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')
    vietnamese_batch = padding_collator({'input_ids': vietnamese_text_input_ids, 'attention_mask': vietnamese_text_attention_mask})

    return {
        "vietnamese_text_input_ids": vietnamese_batch['input_ids'],
        "vietnamese_text_attention_mask": vietnamese_batch['attention_mask']
    }



In [42]:
custom_dataset = CustomDataset(tokenized_data)
data_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=1024, collate_fn=collate_fn)
next(iter(data_loader))

{'vietnamese_text_input_ids': tensor([[   0, 3441, 2539,  ...,    1,    1,    1],
         [   0, 8243,   62,  ...,    1,    1,    1],
         [   0, 2862,  133,  ...,    1,    1,    1],
         ...,
         [   0, 1296,  713,  ...,    1,    1,    1],
         [   0, 1415,  387,  ...,    1,    1,    1],
         [   0,  379,  265,  ...,    1,    1,    1]]),
 'vietnamese_text_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [43]:
from tqdm.auto import tqdm

list_prob_neu = []
list_prob_pos = []
list_prob_neg = []
list_predictions = []

with torch.no_grad():
    for batch in tqdm(data_loader):
        vietnamese_text_input_ids = batch["vietnamese_text_input_ids"].to("cuda")
        vietnamese_text_attention_mask = batch["vietnamese_text_attention_mask"].to("cuda")

        logits = model(input_ids=vietnamese_text_input_ids, attention_mask=vietnamese_text_attention_mask).logits
        softmax_logits = torch.nn.functional.softmax(logits, dim=1)
        predictions = torch.argmax(logits, dim=1)

        list_prob_neu += softmax_logits[:, 2].to("cpu").tolist()
        list_prob_pos += softmax_logits[:, 1].to("cpu").tolist()
        list_prob_neg += softmax_logits[:, 0].to("cpu").tolist()
        list_predictions += predictions.to("cpu").tolist()

        



100%|██████████| 11/11 [00:14<00:00,  1.33s/it]


In [44]:
list_prob_pos

[0.9694488644599915,
 0.000746477278880775,
 0.05426632612943649,
 0.9982810020446777,
 0.0004608379094861448,
 0.046641115099191666,
 0.9987429976463318,
 0.9914767146110535,
 0.9746556878089905,
 0.997899055480957,
 0.7254965901374817,
 0.000870565592776984,
 0.24118782579898834,
 0.20065069198608398,
 0.0773378238081932,
 0.9085947871208191,
 0.001181308296509087,
 0.5221443176269531,
 0.0012541509931907058,
 0.9914626479148865,
 0.9992091059684753,
 0.9898133873939514,
 0.07804650813341141,
 0.3568938374519348,
 0.03765008971095085,
 0.0004328708164393902,
 0.9968442916870117,
 0.9080340266227722,
 0.9993324875831604,
 0.041649602353572845,
 0.10265374183654785,
 0.16483886539936066,
 0.8881280422210693,
 0.0005604763864539564,
 0.005859336815774441,
 0.001218693796545267,
 0.9771996140480042,
 0.3429979979991913,
 0.0008155546383932233,
 0.0011411152081564069,
 0.0013552875025197864,
 0.023655276745557785,
 0.0007004805374890566,
 0.9931459426879883,
 0.0007472747238352895,
 0.001

In [45]:
dataset["VisobertLabel"] = list_predictions
dataset["NegProb"] = list_prob_neg
dataset["NeuProb"] = list_prob_neu
dataset["PosProb"] = list_prob_pos
dataset

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, 'BATTERY#POSITIVE'], [33, 42, 'CAMERA...",The battery is 50 hours for 100/100 battery.Go...,1,0.001704,0.028847,0.969449
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, 'PERFORMANCE#NEGATIVE'], [8, 15, 'BATT...",Lag and draining battery are a summary of the ...,0,0.999145,0.000108,0.000746
2,Tất cả đều ổn ngoại trừ lúc máy nóng lên thì p...,"[[79, 109, 'CAMERA#NEUTRAL'], [111, 169, 'BATT...","All are fine except when the device heats up, ...",0,0.485814,0.459919,0.054266
3,"Ok mua máy ở TGDD chính sách đổi trả rất tốt,r...","[[18, 217, 'SER&ACC#POSITIVE']]",OK buying a machine at TGDD very good exchange...,1,0.000447,0.001272,0.998281
4,"kiểu dáng thì đẹp,cầm chắc tay,nhưng loa nhỏ q...","[[0, 30, 'DESIGN#POSITIVE'], [37, 48, 'FEATURE...","The design is beautiful, holding firmly, but t...",2,0.004147,0.995392,0.000461
...,...,...,...,...,...,...,...
11117,"điện thoại mẫu mã thiết kế đẹp,chụp hình rõ né...","[[0, 30, 'DESIGN#POSITIVE'], [31, 47, 'CAMERA#...","Phone design beautiful design, clear photograp...",1,0.000668,0.002301,0.997031
11118,Tốt mình Sài được 20ngay rồi pin ok cau hình m...,"[[29, 35, 'BATTERY#POSITIVE'], [36, 49, 'PERFO...","I am good at 20ngay and the battery is ok, the...",2,0.067114,0.637889,0.294997
11119,"Máy mới mua, pin sụt nhanh, nhiều lúc bị đơ đứ...","[[13, 26, 'BATTERY#NEGATIVE'], [28, 53, 'PERFO...","The new device bought, the battery dropped qui...",0,0.999234,0.000422,0.000344
11120,Nhận máy m7. Hôm nay đc 5 hôm thì máy lỗi đen ...,"[[34, 49, 'SCREEN#NEGATIVE'], [288, 385, 'SER&...","Receive M7.Today, for 5 days, the device is bl...",0,0.999298,0.000200,0.000502


In [46]:

def convert_label(dataset_df: pd.DataFrame, current_negative, current_neutral, current_positive, field_name="VisobertLabel"):
    dataset_df[field_name] = dataset_df[field_name].apply(lambda x: 0 if x == current_negative else 1 if x == current_neutral else 2 if x == current_positive else x)
    return dataset_df

dataset = convert_label(dataset, 0, 2, 1)
dataset

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, 'BATTERY#POSITIVE'], [33, 42, 'CAMERA...",The battery is 50 hours for 100/100 battery.Go...,2,0.001704,0.028847,0.969449
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, 'PERFORMANCE#NEGATIVE'], [8, 15, 'BATT...",Lag and draining battery are a summary of the ...,0,0.999145,0.000108,0.000746
2,Tất cả đều ổn ngoại trừ lúc máy nóng lên thì p...,"[[79, 109, 'CAMERA#NEUTRAL'], [111, 169, 'BATT...","All are fine except when the device heats up, ...",0,0.485814,0.459919,0.054266
3,"Ok mua máy ở TGDD chính sách đổi trả rất tốt,r...","[[18, 217, 'SER&ACC#POSITIVE']]",OK buying a machine at TGDD very good exchange...,2,0.000447,0.001272,0.998281
4,"kiểu dáng thì đẹp,cầm chắc tay,nhưng loa nhỏ q...","[[0, 30, 'DESIGN#POSITIVE'], [37, 48, 'FEATURE...","The design is beautiful, holding firmly, but t...",1,0.004147,0.995392,0.000461
...,...,...,...,...,...,...,...
11117,"điện thoại mẫu mã thiết kế đẹp,chụp hình rõ né...","[[0, 30, 'DESIGN#POSITIVE'], [31, 47, 'CAMERA#...","Phone design beautiful design, clear photograp...",2,0.000668,0.002301,0.997031
11118,Tốt mình Sài được 20ngay rồi pin ok cau hình m...,"[[29, 35, 'BATTERY#POSITIVE'], [36, 49, 'PERFO...","I am good at 20ngay and the battery is ok, the...",1,0.067114,0.637889,0.294997
11119,"Máy mới mua, pin sụt nhanh, nhiều lúc bị đơ đứ...","[[13, 26, 'BATTERY#NEGATIVE'], [28, 53, 'PERFO...","The new device bought, the battery dropped qui...",0,0.999234,0.000422,0.000344
11120,Nhận máy m7. Hôm nay đc 5 hôm thì máy lỗi đen ...,"[[34, 49, 'SCREEN#NEGATIVE'], [288, 385, 'SER&...","Receive M7.Today, for 5 days, the device is bl...",0,0.999298,0.000200,0.000502


In [47]:
# dataset.to_csv("/home/ldap-users-2/internship_2025/nguyenphuong-p/thesis/data/pretrain_data/non_neg_voz_visobert_labeled.csv", index=False)

In [48]:
dataset["VisobertLabel"].value_counts()

VisobertLabel
2    5072
0    4216
1    1834
Name: count, dtype: int64

In [49]:
dataset_general_labeled = dataset[dataset['Label'].str.contains('GENERAL', case=False, na=False)].copy()
dataset_general_labeled

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, 'BATTERY#POSITIVE'], [33, 42, 'CAMERA...",The battery is 50 hours for 100/100 battery.Go...,2,0.001704,0.028847,0.969449
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, 'PERFORMANCE#NEGATIVE'], [8, 15, 'BATT...",Lag and draining battery are a summary of the ...,0,0.999145,0.000108,0.000746
6,Hài lòng về sản phẩm. Mọi thứ đêu rât tốt nghe...,"[[0, 41, 'GENERAL#POSITIVE'], [42, 89, 'FEATUR...",Satisfied about the product.Everything is very...,2,0.000439,0.000818,0.998743
7,"Mua được 1 tuần thấy máy quá OKPin trâu , máy ...","[[21, 31, 'GENERAL#POSITIVE'], [31, 39, 'BATTE...",Buy 1 week to see the machine is too Okpin buf...,2,0.002501,0.006023,0.991477
13,Mới mua sáng nay. Nhân viên phục vụ nhiệt tình...,"[[18, 46, 'SER&ACC#POSITIVE'], [57, 83, 'GENER...",Just bought this morning.Enthusiastic service ...,1,0.071821,0.727529,0.200651
...,...,...,...,...,...,...,...
11111,"Máy khá ngon biết cách dùng ko gì là ko thể, p...","[[0, 12, 'GENERAL#POSITIVE']]","The device is quite delicious, how to use noth...",2,0.000182,0.000445,0.999373
11113,Mới mua hôm qua điện thoại tốt,"[[16, 30, 'GENERAL#POSITIVE']]",Just bought yesterday the phone was good,2,0.048158,0.361035,0.590807
11115,Mới mua hôm 19/1....máy tạm ổn và khá tốt tron...,"[[20, 41, 'GENERAL#NEUTRAL'], [57, 92, 'CAMERA...",Newly bought on January 19 .... The device is ...,1,0.023736,0.680111,0.296153
11116,Máy xài tuyệt vời ko chê game liên quân thì te...,"[[0, 24, 'GENERAL#POSITIVE'], [65, 169, 'PERFO...",The machine uses greatly not criticizing the g...,2,0.006050,0.278600,0.715351


In [50]:
dataset_general_labeled['GeneralLabel'] = dataset_general_labeled['Label'].str.extract(r'GENERAL#(.*?)\'')

In [53]:
convert_label(dataset_general_labeled, "NEGATIVE", "NEUTRAL", "POSITIVE", "GeneralLabel")

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb,GeneralLabel
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, 'BATTERY#POSITIVE'], [33, 42, 'CAMERA...",The battery is 50 hours for 100/100 battery.Go...,2,0.001704,0.028847,0.969449,2
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, 'PERFORMANCE#NEGATIVE'], [8, 15, 'BATT...",Lag and draining battery are a summary of the ...,0,0.999145,0.000108,0.000746,0
6,Hài lòng về sản phẩm. Mọi thứ đêu rât tốt nghe...,"[[0, 41, 'GENERAL#POSITIVE'], [42, 89, 'FEATUR...",Satisfied about the product.Everything is very...,2,0.000439,0.000818,0.998743,2
7,"Mua được 1 tuần thấy máy quá OKPin trâu , máy ...","[[21, 31, 'GENERAL#POSITIVE'], [31, 39, 'BATTE...",Buy 1 week to see the machine is too Okpin buf...,2,0.002501,0.006023,0.991477,2
13,Mới mua sáng nay. Nhân viên phục vụ nhiệt tình...,"[[18, 46, 'SER&ACC#POSITIVE'], [57, 83, 'GENER...",Just bought this morning.Enthusiastic service ...,1,0.071821,0.727529,0.200651,2
...,...,...,...,...,...,...,...,...
11111,"Máy khá ngon biết cách dùng ko gì là ko thể, p...","[[0, 12, 'GENERAL#POSITIVE']]","The device is quite delicious, how to use noth...",2,0.000182,0.000445,0.999373,2
11113,Mới mua hôm qua điện thoại tốt,"[[16, 30, 'GENERAL#POSITIVE']]",Just bought yesterday the phone was good,2,0.048158,0.361035,0.590807,2
11115,Mới mua hôm 19/1....máy tạm ổn và khá tốt tron...,"[[20, 41, 'GENERAL#NEUTRAL'], [57, 92, 'CAMERA...",Newly bought on January 19 .... The device is ...,1,0.023736,0.680111,0.296153,1
11116,Máy xài tuyệt vời ko chê game liên quân thì te...,"[[0, 24, 'GENERAL#POSITIVE'], [65, 169, 'PERFO...",The machine uses greatly not criticizing the g...,2,0.006050,0.278600,0.715351,2


In [55]:
dataset_general_labeled.reset_index(drop=True, inplace=True)

In [65]:
different_labels = dataset_general_labeled[dataset_general_labeled['GeneralLabel'] != dataset_general_labeled['VisobertLabel']]

In [67]:
different_labels.reset_index(drop=True, inplace=True)

In [70]:
same_labels = dataset_general_labeled[dataset_general_labeled['GeneralLabel'] == dataset_general_labeled['VisobertLabel']]
same_labels


Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb,GeneralLabel
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, 'BATTERY#POSITIVE'], [33, 42, 'CAMERA...",The battery is 50 hours for 100/100 battery.Go...,2,0.001704,0.028847,0.969449,2
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, 'PERFORMANCE#NEGATIVE'], [8, 15, 'BATT...",Lag and draining battery are a summary of the ...,0,0.999145,0.000108,0.000746,0
2,Hài lòng về sản phẩm. Mọi thứ đêu rât tốt nghe...,"[[0, 41, 'GENERAL#POSITIVE'], [42, 89, 'FEATUR...",Satisfied about the product.Everything is very...,2,0.000439,0.000818,0.998743,2
3,"Mua được 1 tuần thấy máy quá OKPin trâu , máy ...","[[21, 31, 'GENERAL#POSITIVE'], [31, 39, 'BATTE...",Buy 1 week to see the machine is too Okpin buf...,2,0.002501,0.006023,0.991477,2
6,"Thiết kế đẹp , kiểu dáng sang trọng. Nhất là p...","[[0, 35, 'DESIGN#POSITIVE'], [37, 58, 'BATTERY...","Beautiful design, luxurious design.Especially ...",2,0.005332,0.086073,0.908595,2
...,...,...,...,...,...,...,...,...
6625,Nói chung là OK trong tầm giá... Có nhiều tính...,"[[0, 29, 'GENERAL#POSITIVE'], [33, 55, 'FEATUR...","In general, OK in the price range ... There ar...",2,0.000231,0.000634,0.999135,2
6626,"Máy khá ngon biết cách dùng ko gì là ko thể, p...","[[0, 12, 'GENERAL#POSITIVE']]","The device is quite delicious, how to use noth...",2,0.000182,0.000445,0.999373,2
6627,Mới mua hôm qua điện thoại tốt,"[[16, 30, 'GENERAL#POSITIVE']]",Just bought yesterday the phone was good,2,0.048158,0.361035,0.590807,2
6628,Mới mua hôm 19/1....máy tạm ổn và khá tốt tron...,"[[20, 41, 'GENERAL#NEUTRAL'], [57, 92, 'CAMERA...",Newly bought on January 19 .... The device is ...,1,0.023736,0.680111,0.296153,1


In [71]:
same_labels["GeneralLabel"].value_counts()

GeneralLabel
2    3665
0    1104
1     122
Name: count, dtype: int64

In [72]:
dataset_general_labeled["GeneralLabel"].value_counts()

GeneralLabel
2    5169
0    1138
1     324
Name: count, dtype: int64

In [74]:
non_neg_voz_dataset = pd.read_csv("/home/ldap-users-2/internship_2025/nguyenphuong-p/thesis/data/pretrain_data/non_neg_voz_visobert_labeled.csv")
neg_voz_dataset = pd.read_csv("/home/ldap-users-2/internship_2025/nguyenphuong-p/thesis/data/pretrain_data/neg_voz_visobert_labeled.csv")


In [75]:
non_neg_voz_dataset

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,Hagl mấy nay cho uống cương dương hết cả team à?,0,HAGL has given the whole team for all these te...,0,0.910380,0.083103,0.006516
1,Nếu cứ mặc kệ đời nghĩ gì thì quá dễ,0,If you keep thinking about what life is so easy,0,0.613928,0.081576,0.304496
2,Toàn tiền hơi chứ có gì mà nâng tầm thế fen,0,"All money is a bit, but nothing to raise the w...",0,0.999078,0.000144,0.000777
3,Cho em xin info ava của anh với ạ,0,Please give me your info ava,2,0.000109,0.002341,0.997550
4,Gửi từ LGE LM-G820N bằng vozFApp,0,Sent from LGE LM-G820N by vozfapp,1,0.004038,0.995796,0.000166
...,...,...,...,...,...,...,...
99995,ở đâu ra màn đó 1 củ vậy bạn? chỉ mình mua với,0,Where did it come out?Only I buy with,1,0.000408,0.999183,0.000409
99996,Thằng gỗ đợt này teamwork tốt hơn hẳn.,0,This wooden guy is much better in this time.,2,0.010745,0.004200,0.985055
99997,tao mà trong SG thì cũng đi đó.,0,"If I go in SG, I go there.",2,0.103966,0.272639,0.623395
99998,Tăng có 0.25. Ko làm được gì cả.,0,Increased by 0.25.Can't do anything.,0,0.996125,0.000869,0.003006


In [76]:
neg_voz_dataset

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,"Đi mẹ nó còn gì là chân cẳng nữa, bọn lol này",1,"Go to his mother, what is the legs, these lols",0,0.999182,0.000118,0.000700
1,"Cá nhân vl, sút như cc mà cũng sút",1,"Individual VL, shot like cc but also shot",0,0.980441,0.000689,0.018869
2,"dm thằng linh lợn, bảo sao mc bay mẹ nó c1",1,"dm The guy Linh pig, tells the MC flying his m...",0,0.999352,0.000122,0.000526
3,đm nhỏ không học lớn viết báo là có thật luôn !,1,Small athletes who do not learn big newspaper ...,0,0.999123,0.000309,0.000568
4,4-0... đéo thể tin được TS nát vậy,1,4-0 ... I can't believe it,0,0.893340,0.000554,0.106106
...,...,...,...,...,...,...,...
24995,Đúng là 4.0 r bớt tin khoa học đi,1,It is true 4.0 r less scientific news,2,0.013318,0.000051,0.986631
24996,Con bắc kinh tự lái cấp độ mấy mà ngu vãi vậy nhỉ,1,How many levels of Beijing self -driving is so...,0,0.999042,0.000203,0.000754
24997,"đấy, cái này làm người ta ngứa dái này",1,"There, this makes people itchy",0,0.988507,0.000058,0.011435
24998,éo thấy đội lớn nào hỏi mua thằng Son nhỉ?,1,"I don't see a big team asking to buy a son, ri...",0,0.753284,0.244382,0.002334


In [97]:
pos_voz_dataset = non_neg_voz_dataset[non_neg_voz_dataset['VisobertLabel'] == 2].copy()
neu_voz_dataset = non_neg_voz_dataset[non_neg_voz_dataset['VisobertLabel'] == 1].copy()
neu_voz_dataset_sure = neu_voz_dataset[neu_voz_dataset['NeuProb'] > 0.5].copy()
pos_voz_dataset_sure = pos_voz_dataset[pos_voz_dataset['PosProb'] > 0.7].copy()



In [100]:
# Get randomly 3000 rows from pos_voz_dataset_sure
pos_voz_dataset_sure = pos_voz_dataset_sure.sample(n=3000)

# Get randomly 3000 rows from neu_voz_dataset_sure
neu_voz_dataset_sure = neu_voz_dataset_sure.sample(n=3000)

# Get randomly 3000 rows from neg_voz_dataset
neg_voz_dataset = neg_voz_dataset.sample(n=3000)

# Concatenate pos_voz_dataset_sure, neu_voz_dataset_sure, and neg_voz_dataset
concatenated_dataset = pd.concat([pos_voz_dataset_sure, neu_voz_dataset_sure, neg_voz_dataset]).reset_index(drop=True)


In [101]:
concatenated_dataset

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
0,Khả năng tbn win đc chắc còn 20%,0,The possibility of Spain Win is probably 20%,2,0.000334,0.000055,0.999612
1,Bà con kìa Thắng heo @talatroi,0,"Behold, you win pig @talatroi",2,0.007975,0.014342,0.977682
2,Up lên cái nhỉ .................,0,Up to the small one .................,2,0.000623,0.020320,0.979057
3,"em nó cũng 99 rồi fen, đang đi làm",0,"Her brother is also 99, fen, working",2,0.002557,0.023045,0.974398
4,"60 KHÚC NÀO THẾ THÍM. TUI LONG KHÁNH, BIÊN HÒA ^^",0,"60 songs, aunt.My Long Khanh, Bien Hoa ^^",2,0.000485,0.008297,0.991217
...,...,...,...,...,...,...,...
8995,cái mày tự hào là lấy vợ có con à thằng rác rưởi,1,You are proud to get married with children,0,0.999579,0.000138,0.000283
8996,"Đánh chắn gì mà 2,3 tỉ? Điên à?",1,What to hit 2.3 billion?Crazy?,0,0.900879,0.000089,0.099032
8997,Công nhận thằng thớt dẩm thật.,1,Recognize that the cutting board is real.,0,0.913505,0.048834,0.037661
8998,Bằng kiều tuổi lol so với Kiệt lặc,1,Bang Kieu age LOL compared to Kiet Lac,0,0.996421,0.000077,0.003502


In [102]:

# Save the concatenated dataset to a CSV file
concatenated_dataset.to_csv('/home/ldap-users-2/internship_2025/nguyenphuong-p/thesis/data/pretrain_data/test_openai.csv', index=False)

In [120]:
pos_voz_dataset_sure.head(20)

Unnamed: 0,VietnameseText,Label,EnglishText,VisobertLabel,NegProb,NeuProb,PosProb
71329,Khả năng tbn win đc chắc còn 20%,0,The possibility of Spain Win is probably 20%,2,0.000334,5.5e-05,0.999612
48768,Bà con kìa Thắng heo @talatroi,0,"Behold, you win pig @talatroi",2,0.007975,0.014342,0.977682
43807,Up lên cái nhỉ .................,0,Up to the small one .................,2,0.000623,0.02032,0.979057
96085,"em nó cũng 99 rồi fen, đang đi làm",0,"Her brother is also 99, fen, working",2,0.002557,0.023045,0.974398
5554,"60 KHÚC NÀO THẾ THÍM. TUI LONG KHÁNH, BIÊN HÒA ^^",0,"60 songs, aunt.My Long Khanh, Bien Hoa ^^",2,0.000485,0.008297,0.991217
32582,@Không hình khó nói lắm mình team đùi nhé,0,"@No picture is hard to say, my thigh team",2,0.088819,0.074046,0.837135
28756,Bán hàng phổ thông nhưng mà cứ như bán mai thúy,0,"Sales in general, but it is like selling Mai Thuy",2,0.011161,0.000155,0.988684
55557,Cũng muốn lắm nhưng không có ô tô,0,I also want it but there is no car,2,0.075263,0.179332,0.745405
76932,tặng hoa thôi. 1 bông hoa nho nhỏ cũng đc,0,Just give flowers.1 small flower is also okay,2,0.00055,0.001271,0.998178
78661,Đam mê tiền nên lúc nào cũng đầy túi,0,Passionate money should always be full of pockets,2,0.000323,0.000203,0.999474


In [None]:
instruction_prompt = """
You are a virtual assistant helping me translate Vietnamese social media data into English.

```
Your task is to translate the sample data below into English.

Note that the dataset may contain slang, abbreviations, swear words, and offensive language. Please translate them as accurately as possible using equivalent English terms. This is crucial to ensure the meaning of the sentence is preserved.
Note:  Don't censor any words. 
```

Input format:
Sample <i>: <sentence ith>

Output format:

Sample <i>: <English version of each sample>
The output must strictly follow this format.
```
Sentences:
- Sample 0: Đi mẹ nó còn gì là chân cẳng nữa, bọn lol này
- Sample 1: Cá nhân vl, sút như cc mà cũng sút
- Sample 2: Sự thật nó nghĩ "CÓ CHÓ NÓ LẤY"	
- Sample 3: Thằng Harry đi vào đi ra toàn bị bơ. Nhục mặt.	
- Sample 4: Design đẹp quá nhỉ, không biết thực tế ntn.	
- Sample 5: tôi ủng hộ nhé, chỉ trích cc mấy thằng Oa Khấu	
- Sample 6: Đức đá bẩn mắt chết mẹ, cút về nước là hợp lý	
- Sample 7: Thơm quá thím ơi Hóng phát cạc	
- Sample 8: Dhs ...Saka óc chó bọn da đen óc chó	
- Sample 9: Nhập về cho có dịp ăn chơi, làm gì căng	
- Sample 10: Đam mê tiền nên lúc nào cũng đầy túi
- Sample 12: jisoo chuẩn form t thích luôn. rồ sé thì gầy quá	
- Sample 13: đọc cuốn quá bác ơi, hóng chap tiếp theo
- Sample 14: Chương 3: Chuyện trai gái nơi xứ người.	
- Sample 15: Link nào cơ fen, link thớt ae ở Nhật thì đây:
- Sample 16: tặng ví da đi thím hoặc giày cũng dc	
- Sample 17: Món bò kho cũng có nguồn gốc từ Pháp mà
- Sample 18: Ngày đầu gặp mà tặng cái đéo gì, bố thằng simp lõ	
- Sample 19: Biết là như cc sao vẫn sài? như cc thì vote bỏ đi	s

"""


len(instruction_prompt)

from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
openai = OpenAI(
    api_key="I5iBXRlXYaew2fyI8bSviccXpKq6T9YF",
    base_url="https://api.deepinfra.com/v1/openai",
)

chat_completion = openai.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=[{"role": "user", "content": f"{instruction_prompt}"}],
    temperature=0.5,
    
)

print(chat_completion.choices[0].message.content)
print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)


```
Sample 0: Go the fuck away, what’s left of your legs anyway, these assholes.
Sample 1: So fucking selfish, shoots like shit but still shoots.
Sample 2: The truth is he thinks, "Even a dog wouldn’t take it."
Sample 3: Harry keeps going in and out, always getting ignored. So embarrassing.
Sample 4: The design is so nice, but I wonder how it is in real life.
Sample 5: I support it, just criticize those damn Wokou guys.
Sample 6: Germany plays so dirty it’s fucking unbearable, they should just go home.
Sample 7: Smells so good, bro. Can’t wait for the next one.
Sample 8: Damn… Saka is such a dog-brain, these black guys are dog-brains.
Sample 9: Imported it just to have a chance to party, no need to stress.
Sample 10: Obsessed with money, so my pockets are always full.
Sample 12: Jisoo’s form is perfect, I love it. But if she’s too skinny, it’s a no.
Sample 13: Read it too fast, bro. Can’t wait for the next chapter.
Sample 14: Chapter 3: Love stories in a foreign land.
Sample 15: Which 