In [1]:
!pip install transformers==4.44.2 torch==2.3.0 underthesea==6.8.4

Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==2.3.0
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting underthesea==6.8.4
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers==4.44.2)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_ru

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import re
from underthesea import sent_tokenize

In [3]:
model_name = "wonrax/phobert-base-vietnamese-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [4]:
id2label = model.config.id2label
print("✅ Loaded model:", model_name)
print("Label mapping:", id2label)

✅ Loaded model: wonrax/phobert-base-vietnamese-sentiment
Label mapping: {0: 'NEG', 1: 'POS', 2: 'NEU'}


In [5]:
def get_phobert_sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return "neutral", 0.0
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        label_id = torch.argmax(probs, dim=1).item()
        label = id2label[label_id]
        score = probs[0, 2].item() - probs[0, 0].item()  # positive - negative

    return label, round(score, 3)

In [6]:
df = pd.read_csv("/kaggle/input/cogdis-data/Combined_Data.csv")
print(f"Loaded {len(df)} rows")

Loaded 20092 rows


In [7]:
results = df["Original Text"].apply(lambda x: pd.Series(get_phobert_sentiment(x)))
results.columns = ["sentiment_label", "sentiment_score"]
df = pd.concat([df, results], axis=1)

In [8]:
emotion_lexicon = {
    "buồn": ["buồn", "tuyệt vọng", "cô đơn", "chán", "khóc", "tổn thương"],
    "tức giận": ["ghét", "giận", "bực", "tức", "điên", "phẫn nộ", "ức chế"],
    "tội lỗi": ["hối hận", "tội lỗi", "xấu hổ", "ăn năn", "day dứt"],
    "sợ hãi": ["sợ", "hoảng", "run", "lo lắng", "ám ảnh", "kinh khủng"],
    "vui": ["vui", "hạnh phúc", "vui vẻ", "phấn khởi", "thoải mái"]
}

def detect_emotion_category(text):
    if not isinstance(text, str) or text.strip() == "":
        return "không xác định"
    counts = {emo: sum(1 for w in words if re.search(rf"\b{w}\b", text.lower())) 
              for emo, words in emotion_lexicon.items()}
    dominant = max(counts, key=counts.get)
    return dominant if counts[dominant] > 0 else "không xác định"

df["emotion_category"] = df["Original Text"].apply(detect_emotion_category)

In [9]:
strong_emotion_words = [
    "ghét", "kinh khủng", "vô vọng", "đáng sợ", "sợ", "điên rồ",
    "tồi tệ", "khủng khiếp", "đau khổ", "hoảng loạn", "tức giận", "chán nản"
]

def count_strong_emotion(text):
    if not isinstance(text, str): return 0
    return sum(len(re.findall(rf"\b{w}\b", text.lower())) for w in strong_emotion_words)

df["strong_emotion_count"] = df["Original Text"].apply(count_strong_emotion)

In [10]:
def emotion_variance(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0
    sentences = sent_tokenize(text)
    if len(sentences) < 2:
        return 0.0
    scores = [get_phobert_sentiment(s)[1] for s in sentences]
    return round(np.var(scores), 3)

df["emotion_variance"] = df["Original Text"].apply(emotion_variance)

In [11]:
df.head(5)

Unnamed: 0,Original Text,Label,Emotional Reasoning,Overgeneralization,Should Statements,Personalization,Mental Filter,Disqualifying the Positive,Jumping to Conclusions,Labeling and Mislabeling,Magnification and Minimization,All-or-Nothing Thinking,sentiment_label,sentiment_score,emotion_category,strong_emotion_count,emotion_variance
0,"Cô ấy luôn có những hành động kỳ lạ, ví dụ như...",0,0,0,0,0,0,0,0,0,0,0,NEG,-0.936,tức giận,1,0.0
1,"Nếu bố mẹ tôi biết tôi đang vật lộn thế nào, h...",1,0,0,0,0,0,0,1,0,0,0,NEG,-0.966,không xác định,0,0.0
2,"Trong năm thứ 4 và thứ 5 của sự nghiệp, tôi kh...",1,0,0,0,0,0,0,0,0,0,1,NEG,-0.979,không xác định,0,0.001
3,"Tôi muốn bị ốm, và tôi biết điều đó thật khủng...",0,0,0,0,0,0,0,0,0,0,0,NEG,-0.604,không xác định,1,0.0
4,Trước khi chúng tôi bắt đầu hẹn hò và trong nă...,0,0,0,0,0,0,0,0,0,0,0,POS,0.1,không xác định,0,0.139


In [12]:
df.to_csv("Sentiment & Emotion Features.csv", index=False)