<a href="https://colab.research.google.com/github/vanha2301/AIR-absa-rt/blob/main/lixicon_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install underthesea pandas scikit-learn


Collecting underthesea
  Downloading underthesea-8.3.0-py3-none-any.whl.metadata (14 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea_core==1.0.5 (from underthesea)
  Downloading underthesea_core-1.0.5-cp312-cp312-manylinux2010_x86_64.whl.metadata (1.4 kB)
Downloading underthesea-8.3.0-py3-none-any.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.5-cp312-cp312-manylinux2010_x86_64.whl (978 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.4/978.4 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.3 MB/s[0m eta [36m0:0

In [None]:
import re
from collections import defaultdict

import pandas as pd
from underthesea import word_tokenize
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:

# =========================
# 1. Configuration file paths (negative_words_vi.txt is NOT used)
# =========================

# All three paths are relative, so they are resolved against the kernel's
# current working directory.
LEXICON_PATH = "VietSentiWordnet_Ver1.3.5.txt"
STOPWORDS_PATH = "vietnamese-stopwords-dash.txt"
# NOTE(review): the saved output of the demo cell ends with a FileNotFoundError
# for this file — make sure it is present before running the notebook.
NEGATION_MARKERS_PATH = "vietnamese_negation_markers_full.txt"  # list of negation words


# Thresholds for the 3-class labelling: a score above POS_THRESHOLD is
# "positive", below NEG_THRESHOLD is "negative", anything in between is "neutral".
POS_THRESHOLD = 0.05
NEG_THRESHOLD = -0.05

# Intensifiers: multiplier applied to the lexicon score of the token that
# immediately follows the intensifier.
INTENSIFIERS = {
    "rất": 1.5,
    "cực_kì": 1.8,
    "cực_kỳ": 1.8,
    # NOTE(review): this key looks identical to the previous one. If it is a
    # true duplicate it is redundant; if it is a different Unicode normalization
    # of the same word, normalizing tokens would be more robust — TODO confirm.
    "cực_kỳ": 1.8,
    "quá": 1.3,
    "siêu": 1.8,
    "vô_cùng": 1.8,
    "khá": 1.2,
    "hơi": 0.7,   # softens the sentiment
}

# Number of preceding tokens inspected when looking for a negation marker
NEGATION_WINDOW = 3

In [None]:

# =========================
# 2. Load lexicon, stopwords, negations
# =========================

def load_vnsenti_lexicon(path: str) -> dict:
    """Parse a VietSentiWordNet file into a ``word -> sentiment score`` mapping.

    A usable row is tab-separated with at least five fields; fields three and
    four are the positive and negative scores (a decimal comma is accepted)
    and field five is a whitespace-separated list of ``term#sense`` entries.
    Blank rows, rows starting with ``#``, rows with fewer than five fields and
    rows whose scores cannot be parsed are skipped.

    Args:
        path: Location of the UTF-8 encoded lexicon file.

    Returns:
        dict mapping each term (the part before ``#``) to the mean of
        ``positive - negative`` over every row in which the term appears.
    """
    totals = {}
    occurrences = {}

    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            row = raw.strip()
            if row == "" or row.startswith("#"):
                continue

            fields = row.split("\t")
            if len(fields) < 5:
                continue

            try:
                positive = float(fields[2].replace(",", "."))
                negative = float(fields[3].replace(",", "."))
            except ValueError:
                # Non-numeric score columns: ignore the whole row.
                continue

            polarity = positive - negative

            for entry in fields[4].split():
                word = entry.split("#")[0].strip()
                if word:
                    totals[word] = totals.get(word, 0.0) + polarity
                    occurrences[word] = occurrences.get(word, 0) + 1

    # Average the accumulated polarity over the number of rows per term.
    return {word: total / occurrences[word] for word, total in totals.items()}


def load_stopwords(path: str) -> set:
    """Read a UTF-8 stop-word list with one entry per line.

    Surrounding whitespace is stripped from every line and empty lines are
    dropped.

    Args:
        path: Location of the stop-word file.

    Returns:
        set of the non-empty, stripped lines.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return {word for word in stripped if word}


def load_negation_markers(path: str) -> set:
    """Read a UTF-8 list of negation markers with one entry per line.

    Args:
        path: Location of the negation-marker file.

    Returns:
        set of the lines after stripping surrounding whitespace, with empty
        lines discarded.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # filter(None, ...) drops the lines that are empty once stripped.
        return set(filter(None, map(str.strip, handle)))


def build_full_lexicon(main_path: str = LEXICON_PATH) -> dict:
    """Load the sentiment lexicon and report its size.

    Only VietSentiWordNet is used (negative_words_vi.txt is not merged in).

    Args:
        main_path: Path of the VietSentiWordNet file.

    Returns:
        The ``word -> score`` dict produced by ``load_vnsenti_lexicon``.
    """
    print(">>> Đang load VietSentiWordNet...")
    lexicon = load_vnsenti_lexicon(main_path)
    size = len(lexicon)
    print(f"    Số từ trong VietSentiWordNet: {size}")
    return lexicon



In [None]:

# =========================
# 3. Tiền xử lý & tách từ
# =========================

# Anything starting with "http" or "www." up to the next whitespace.
URL_RE = re.compile(r"http\S+|www\.\S+")
# Runs of characters other than digits, ASCII letters, the À-ỹ code point
# range, underscore and space.
NON_ALPHA_RE = re.compile(r"[^0-9a-zA-ZÀ-ỹ_ ]+")
# Compiled once instead of on every normalize_text call.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Lower-case a text and strip URLs, punctuation and extra whitespace.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned text with single spaces between words, or ``""`` when the
        input is missing.
    """
    # Missing values must not turn into the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = URL_RE.sub(" ", text)
    # Newlines are outside NON_ALPHA_RE's allowed set, so this substitution
    # also replaces them; no separate newline replacement is needed.
    text = NON_ALPHA_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def tokenize_vi(text: str) -> list:
    """Segment Vietnamese text into word tokens.

    Runs underthesea's ``word_tokenize`` in ``format="text"`` mode and splits
    its string result on whitespace.
    NOTE(review): multi-syllable words are presumably joined with underscores
    in that output (the INTENSIFIERS keys are written that way) — confirm
    against the underthesea documentation.

    Args:
        text: Text to segment.

    Returns:
        list of token strings.
    """
    segmented = word_tokenize(text, format="text")
    return segmented.split()


def preprocess_and_tokenize(text: str, stopwords: set, negation_words: set) -> list:
    """Normalize, tokenize and stop-word-filter a text for sentiment scoring.

    Negation markers and intensifiers are always kept, even when they also
    appear in ``stopwords``: ``sentiment_score`` inspects the tokens before a
    lexicon word for both, so removing them here would silently disable
    negation flipping and intensifier scaling.

    Args:
        text: Raw input text.
        stopwords: Tokens to drop.
        negation_words: Tokens that must survive stop-word removal.

    Returns:
        list of the remaining tokens in their original order.
    """
    kept = []
    for token in tokenize_vi(normalize_text(text)):
        token = token.strip()
        if not token:
            continue
        # INTENSIFIERS keys are protected exactly like negation markers;
        # common intensifiers may well be listed as stop words
        # (NOTE(review): confirm against the stop-word file in use).
        if token in negation_words or token in INTENSIFIERS or token not in stopwords:
            kept.append(token)

    return kept



In [None]:

# =========================
# 4. Tính điểm sentiment
# =========================

def sentiment_score(
    tokens: list,
    lexicon: dict,
    negation_words: set,
    use_negation: bool = True,
    use_intensifier: bool = True,
    intensifiers: dict = None,
    negation_window: int = None,
) -> float:
    """Sum the lexicon scores of ``tokens`` with intensifier and negation handling.

    For every token found in ``lexicon`` (negation markers are skipped and
    never contribute a score of their own):

    * if the token immediately before it is an intensifier, the score is
      multiplied by that intensifier's factor;
    * if any of the ``negation_window`` tokens before it is a negation
      marker, the sign of the score is flipped once.

    Args:
        tokens: Pre-processed tokens in sentence order.
        lexicon: ``word -> score`` mapping.
        negation_words: Tokens treated as negation markers.
        use_negation: Disable to ignore negation markers.
        use_intensifier: Disable to ignore intensifiers.
        intensifiers: ``word -> multiplier`` mapping; defaults to the
            module-level ``INTENSIFIERS``.
        negation_window: How many preceding tokens to scan for a negation;
            defaults to the module-level ``NEGATION_WINDOW``.

    Returns:
        The accumulated score (``0.0`` for an empty token list).
    """
    # Resolve defaults at call time so later edits to the constants are honoured.
    if intensifiers is None:
        intensifiers = INTENSIFIERS
    if negation_window is None:
        negation_window = NEGATION_WINDOW

    total = 0.0

    for i, tok in enumerate(tokens):
        # Negation markers only flip other words; they carry no score themselves.
        if tok in negation_words or tok not in lexicon:
            continue

        score = lexicon[tok]

        # Intensifier directly in front of the sentiment word.
        if use_intensifier and i > 0:
            factor = intensifiers.get(tokens[i - 1])
            if factor is not None:
                score *= factor

        # One sign flip if any negation marker sits inside the look-back window.
        if use_negation:
            window = tokens[max(0, i - negation_window):i]
            if any(prev in negation_words for prev in window):
                score *= -1

        total += score

    return total


def score_to_label(
    score: float,
    pos_threshold: float = POS_THRESHOLD,
    neg_threshold: float = NEG_THRESHOLD,
) -> str:
    """Map a numeric sentiment score onto one of three class labels.

    Args:
        score: Sentiment score to classify.
        pos_threshold: Scores strictly above this are "positive".
        neg_threshold: Scores strictly below this are "negative".

    Returns:
        "positive", "negative", or "neutral" for everything else (including
        scores exactly equal to a threshold).
    """
    if score > pos_threshold:
        return "positive"
    if score < neg_threshold:
        return "negative"
    return "neutral"


def predict_sentiment(
    text: str,
    lexicon: dict,
    stopwords: set,
    negation_words: set,
) -> tuple:
    """Classify a single text with the lexicon-based pipeline.

    Args:
        text: Raw input text.
        lexicon: ``word -> score`` mapping.
        stopwords: Tokens removed during pre-processing.
        negation_words: Negation markers (kept during pre-processing and used
            to flip scores).

    Returns:
        ``(label, score)`` where ``score`` is the summed sentiment score and
        ``label`` is its class according to ``score_to_label``.
    """
    tokens = preprocess_and_tokenize(text, stopwords, negation_words)
    raw_score = sentiment_score(tokens, lexicon, negation_words)
    return score_to_label(raw_score), raw_score


In [None]:

# =========================
# 5. Evaluate trên dataset CSV
# =========================

def evaluate_on_csv(
    csv_path: str,
    text_col: str = "text",
    label_col: str = "label",
    lexicon_path: str = LEXICON_PATH,
    stopwords_path: str = STOPWORDS_PATH,
    negation_markers_path: str = NEGATION_MARKERS_PATH,
):
    """Run the lexicon classifier over a labelled CSV and print metrics.

    Loads the lexicon, stop words and negation markers, predicts a label for
    every row of ``text_col``, then prints a classification report, the
    accuracy and a confusion matrix against ``label_col`` (compared as
    strings).

    Args:
        csv_path: CSV file to evaluate on.
        text_col: Name of the column holding the texts.
        label_col: Name of the column holding the gold labels.
        lexicon_path: Path of the VietSentiWordNet file.
        stopwords_path: Path of the stop-word list.
        negation_markers_path: Path of the negation-marker list.

    Returns:
        The loaded DataFrame with two added columns, ``pred_label`` and
        ``pred_score``.

    Raises:
        AssertionError: If ``text_col`` or ``label_col`` is not a column of
            the CSV.
    """
    # --- resources -------------------------------------------------------
    print(">>> Đang build lexicon từ VietSentiWordNet...")
    lexicon = build_full_lexicon(lexicon_path)

    print(">>> Đang load stopwords...")
    stopwords = load_stopwords(stopwords_path)
    print(f"    Số stopwords: {len(stopwords)}")

    print(">>> Đang load từ phủ định...")
    negation_words = load_negation_markers(negation_markers_path)
    print(f"    Số từ phủ định: {len(negation_words)}")

    # --- data ------------------------------------------------------------
    print(">>> Đang load dataset...")
    df = pd.read_csv(csv_path)

    assert text_col in df.columns, f"Không thấy cột text: {text_col}"
    assert label_col in df.columns, f"Không thấy cột label: {label_col}"

    y_true = df[label_col].astype(str).tolist()

    # --- prediction ------------------------------------------------------
    print(">>> Đang dự đoán trên dataset...")
    predictions = [
        predict_sentiment(text, lexicon, stopwords, negation_words)
        for text in df[text_col]
    ]
    y_pred = [label for label, _ in predictions]
    scores = [score for _, score in predictions]

    df["pred_label"] = y_pred
    df["pred_score"] = scores

    # --- metrics ---------------------------------------------------------
    print("\n=== Classification report (macro) ===")
    print(classification_report(y_true, y_pred, digits=3))

    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}")

    print("\n=== Confusion matrix (rows: true, cols: pred) ===")
    labels = ["negative", "neutral", "positive"]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print("labels:", labels)
    print(cm)

    return df


In [None]:

# =========================
# 6. Quick demo
# =========================
# Loads the three resources from the configured paths and classifies a few
# hand-written example sentences.
# NOTE(review): the saved output of this cell stops with a FileNotFoundError
# for the negation-marker file, so the loop below did not run in the recorded
# session — the file must be available next to the notebook.


print(">>> Khởi tạo lexicon, stopwords, negation words để demo...")
lexicon = build_full_lexicon(LEXICON_PATH)
stopwords = load_stopwords(STOPWORDS_PATH)
negation_words = load_negation_markers(NEGATION_MARKERS_PATH)

# Example sentences; several of them contain intensifiers and negation words.
examples = [
    "Điện thoại này rất tốt, pin trâu và màn hình đẹp.",
    "Sản phẩm quá tệ, dùng được vài hôm là hỏng.",
    "Tạm ổn, không có gì đặc biệt.",
    "Chất lượng không tốt như mong đợi.",
    "Mình không hề thất vọng, thậm chí rất hài lòng.",
    "Hoàn toàn không đáng tiền, quá thất vọng.",
    "Không tệ như mình nghĩ, dùng cũng ổn.",
]

print("\n=== DEMO CÂU LẺ ===")
# Print each sentence with its predicted label and score (3 decimals).
for sent in examples:
    label, score = predict_sentiment(sent, lexicon, stopwords, negation_words)
    print(f"{sent} -> {label} (score={score:.3f})")


>>> Khởi tạo lexicon, stopwords, negation words để demo...
>>> Đang load VietSentiWordNet...
    Số từ trong VietSentiWordNet: 1226


FileNotFoundError: [Errno 2] No such file or directory: 'vietnamese_negation_markers_full.txt'

In [None]:
import numpy as np

# ============================
# 1. STRING SUBSEQUENCE KERNEL
# ============================

def string_kernel(s, t, max_len=5, lam=0.5):
    """Decay-weighted count of the contiguous substrings shared by two strings.

    For every length ``L`` from 1 to ``max_len``, each occurrence of a
    length-``L`` substring in ``s`` is paired with each occurrence of the same
    substring in ``t``; every such pair contributes ``lam ** L``. Only
    contiguous substrings are matched (no gaps). Space characters are removed
    from both strings first; other whitespace such as newlines is kept.

    Occurrences are counted including overlapping ones. ``str.count`` only
    counts non-overlapping matches, which would make the result depend on
    the argument order (for example "aa" vs "aaa"); counting every occurrence
    keeps the kernel symmetric.

    Args:
        s, t: Strings to compare.
        max_len: Maximum substring length considered.
        lam: Decay parameter; a match of length ``L`` weighs ``lam ** L``.

    Returns:
        The kernel value as a float (``0.0`` when nothing is shared).
    """
    s = s.replace(" ", "")
    t = t.replace(" ", "")

    K = 0.0
    for L in range(1, max_len + 1):
        # No substrings of this (or any greater) length exist in the shorter string.
        if L > len(s) or L > len(t):
            break

        # Occurrence table for every length-L substring of t, built once per L.
        t_counts = {}
        for j in range(len(t) - L + 1):
            piece = t[j:j + L]
            t_counts[piece] = t_counts.get(piece, 0) + 1

        matches = sum(t_counts.get(s[i:i + L], 0) for i in range(len(s) - L + 1))
        K += (lam ** L) * matches
    return K


# ============================
# 2. TÍNH POS SCORE GIỐNG VSWN
# ============================

def senti_score(gloss, pos_seeds, neg_seeds):
    """Split a gloss's similarity mass between positive and negative seeds.

    The gloss is compared with every seed text through ``string_kernel``; the
    similarities are summed per polarity and normalized by their total.

    Args:
        gloss: Text to score.
        pos_seeds: Iterable of positive seed texts.
        neg_seeds: Iterable of negative seed texts.

    Returns:
        ``(Pos, Neg)``: each polarity's share of the total similarity, or
        ``(0.0, 0.0)`` when the total similarity is zero.
    """
    sim_pos = sum([string_kernel(gloss, seed) for seed in pos_seeds])
    sim_neg = sum([string_kernel(gloss, seed) for seed in neg_seeds])

    total = sim_pos + sim_neg
    if total == 0:
        # No overlap with any seed: avoid dividing by zero.
        return 0.0, 0.0

    return sim_pos / total, sim_neg / total


# ============================
# 3. TEST WITH THE GLOSS "HẤP DẪN"
# ============================
# Scores one gloss against small hand-written seed sets and prints the
# resulting positive / negative shares.

# Gloss text to score; the leading and trailing newlines of the triple-quoted
# literal are part of the string.
gloss_hap_dan = """
thích nhìn, say mê vẻ đẹp; quần áo hấp dẫn; mô tả quyển sách vở hình minh họa hấp dẫn
"""

# Sample positive seed set (simplified for illustration)
pos_seeds = [
    "đẹp, thu hút, lôi cuốn, quyến rũ",
    "dễ chịu, hấp dẫn",
    "say mê, thích thú"
]

# Sample negative seed set (simplified)
neg_seeds = [
    "xấu, tệ hại, kinh khủng",
    "khó chịu, kém hấp dẫn"
]

# senti_score returns the (positive, negative) shares of the similarity.
Pos, Neg = senti_score(gloss_hap_dan, pos_seeds, neg_seeds)

print("POS SCORE =", Pos)
print("NEG SCORE =", Neg)
