In [1]:
!pip install underthesea==6.8.4
!pip install scikit-learn==1.5.2
!pip install pandas==2.2.2
!pip install numpy==1.26.4
!pip install tqdm==4.66.5

Collecting underthesea==6.8.4
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea==6.8.4)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea==6.8.4)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB

In [2]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from underthesea import word_tokenize, pos_tag

# Register tqdm's pandas integration so that `progress_apply` (used by the
# feature-extraction code below) is available on Series objects.
tqdm.pandas()

In [3]:
def preprocess_text(text):
    """Normalize a raw text value for lexical feature extraction.

    Steps: map missing values to an empty string, normalize to Unicode NFC,
    lowercase, remove punctuation, and collapse runs of whitespace.

    Args:
        text: Any value; non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text (``""`` for missing input).
    """
    # Missing cells would otherwise be stringified into the literal
    # tokens "none" / "nan" and pollute every downstream feature.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Compose decomposed characters (base letter + combining tone mark) first:
    # combining marks are not matched by \w, so the punctuation regex below
    # would otherwise strip Vietnamese diacritics from NFD-encoded input.
    text = unicodedata.normalize("NFC", str(text)).lower()
    text = re.sub(r'[^\w\s]', '', text)   # drop punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Negation and absolutist ("never", "always", "everyone", ...) cue words.
# Multi-word phrases are written with spaces between syllables.
NEGATIVE_WORDS = [
    "không", "chẳng", "chưa", "chưa từng", "không bao giờ", "chẳng bao giờ",
    "chưa bao giờ", "mãi mãi", "luôn luôn", "tất cả", "mọi người", "ai cũng"
]

def neg_extreme_features(text):
    """Return the fraction of tokens in ``text`` that appear in NEGATIVE_WORDS.

    Args:
        text: The (already cleaned) text to analyse.

    Returns:
        float: matching-token count divided by the token count; 0.0 when the
        text yields no tokens.
    """
    tokens = word_tokenize(text, format="text").split()
    # With format="text" the tokenizer joins the syllables of a multi-syllable
    # word with "_" (e.g. "đồng_nghiệp"), whereas the lexicon uses spaces.
    # Map "_" back to " " before the lookup so that multi-word entries can
    # match; whether a phrase is matched still depends on the tokenizer
    # segmenting it as a single word.
    lexicon = frozenset(NEGATIVE_WORDS)
    count = sum(1 for t in tokens if t.replace("_", " ") in lexicon)
    # max(1, ...) guards against division by zero for empty text.
    return count / max(1, len(tokens))

# Emotion-bearing words (both negative and positive).
# Multi-word entries are written with spaces between syllables.
EMO_LEXICON = [
    "tệ", "tồi", "vô dụng", "buồn", "sợ", "tức giận", "hối hận", "tuyệt vọng",
    "vui", "hạnh phúc", "may mắn", "bi quan", "thất vọng", "đau khổ", "lo lắng"
]

def emotion_features(text):
    """Return the fraction of tokens in ``text`` that appear in EMO_LEXICON.

    Args:
        text: The (already cleaned) text to analyse.

    Returns:
        float: matching-token count divided by the token count; 0.0 when the
        text yields no tokens.
    """
    tokens = word_tokenize(text, format="text").split()
    # With format="text" the tokenizer joins the syllables of a multi-syllable
    # word with "_", whereas the lexicon uses spaces. Map "_" back to " "
    # before the lookup so entries such as "vô dụng" can match; whether a
    # phrase is matched still depends on the tokenizer segmenting it as a
    # single word.
    lexicon = frozenset(EMO_LEXICON)
    emo_count = sum(1 for t in tokens if t.replace("_", " ") in lexicon)
    # max(1, ...) guards against division by zero for empty text.
    return emo_count / max(1, len(tokens))

def pos_ratio_features(text):
    """Compute verb, adjective and pronoun ratios for ``text``.

    The text is tagged with ``pos_tag``. Verbs and adjectives are counted by
    tag prefix ("V" / "A"); pronouns are counted by membership of the word in
    a fixed list of personal pronouns.

    Returns:
        pd.Series with keys ``verb_ratio``, ``adj_ratio`` and ``pron_ratio``,
        each the count divided by the number of tagged tokens (all zero when
        tagging yields nothing).
    """
    pairs = pos_tag(text)
    n_tokens = len(pairs)
    if not n_tokens:
        return pd.Series({"verb_ratio": 0, "adj_ratio": 0, "pron_ratio": 0})

    pronouns = ["tôi", "mình", "ta", "chúng tôi", "bạn", "họ"]
    n_verb = n_adj = n_pron = 0
    for word, tag in pairs:
        if tag.startswith("V"):
            n_verb += 1
        if tag.startswith("A"):
            n_adj += 1
        if word in pronouns:
            n_pron += 1

    ratios = {
        "verb_ratio": n_verb / n_tokens,
        "adj_ratio": n_adj / n_tokens,
        "pron_ratio": n_pron / n_tokens,
    }
    return pd.Series(ratios)

def length_features(text):
    """Compute simple length statistics for ``text``.

    Returns:
        pd.Series with ``word_count`` (number of tokens produced by
        ``word_tokenize``) and ``avg_word_length`` (mean token length in
        characters, or 0 when there are no tokens).
    """
    tokens = word_tokenize(text, format="text").split()
    if tokens:
        mean_length = np.mean([len(tok) for tok in tokens])
    else:
        mean_length = 0
    stats = {"word_count": len(tokens), "avg_word_length": mean_length}
    return pd.Series(stats)

def ngram_features(texts, ngram_range=(1,3), max_features=300):
    """Build a dense TF-IDF n-gram feature table for ``texts``.

    A new ``TfidfVectorizer`` is fitted on ``texts`` at every call, using
    ``word_tokenize`` as the tokenizer.

    Args:
        texts: Iterable of documents (strings).
        ngram_range: ``(min_n, max_n)`` n-gram sizes passed to the vectorizer.
        max_features: Maximum vocabulary size passed to the vectorizer.

    Returns:
        pd.DataFrame: one row per document and one column per n-gram, with a
        default integer index (the index of ``texts`` is not carried over).
    """
    vectorizer = TfidfVectorizer(
        tokenizer=lambda x: word_tokenize(x, format="text").split(),
        # The regex token pattern is unused when a custom tokenizer is given;
        # setting it to None avoids scikit-learn's UserWarning about that.
        token_pattern=None,
        ngram_range=ngram_range,
        max_features=max_features
    )
    X = vectorizer.fit_transform(texts)
    return pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

def extract_lexical_features(df, text_col="Original Text"):
    """Compute all lexical features for the texts in ``df[text_col]``.

    Adds a cleaned text column, negation/extreme and emotion ratios, POS
    ratios, length statistics and TF-IDF n-gram columns.

    Args:
        df: Input DataFrame; it is not modified.
        text_col: Name of the column holding the raw text.

    Returns:
        pd.DataFrame: the original columns plus all feature columns, with a
        fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    print("🔧 Tiền xử lý dữ liệu ...")
    df["clean_text"] = df[text_col].progress_apply(preprocess_text)

    print("🧩 Trích xuất các đặc trưng cơ bản ...")
    df["neg_ratio"] = df["clean_text"].progress_apply(neg_extreme_features)
    df["emo_ratio"] = df["clean_text"].progress_apply(emotion_features)

    # These helpers return a Series per row, so apply yields DataFrames.
    pos_df = df["clean_text"].progress_apply(pos_ratio_features)
    len_df = df["clean_text"].progress_apply(length_features)

    df_features = pd.concat([df, pos_df, len_df], axis=1)

    print("Sinh TF-IDF N-gram features ...")
    ngram_df = ngram_features(df_features["clean_text"])

    # Combine everything; both indexes are reset so rows line up by position.
    final_df = pd.concat([df_features.reset_index(drop=True), ngram_df.reset_index(drop=True)], axis=1)
    return final_df

In [4]:
# Load the combined dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/cogdis-data/Combined_Data.csv")
df.head(5)

Unnamed: 0,Original Text,Label,Emotional Reasoning,Overgeneralization,Should Statements,Personalization,Mental Filter,Disqualifying the Positive,Jumping to Conclusions,Labeling and Mislabeling,Magnification and Minimization,All-or-Nothing Thinking
0,"Cô ấy luôn có những hành động kỳ lạ, ví dụ như...",0,0,0,0,0,0,0,0,0,0,0
1,"Nếu bố mẹ tôi biết tôi đang vật lộn thế nào, h...",1,0,0,0,0,0,0,1,0,0,0
2,"Trong năm thứ 4 và thứ 5 của sự nghiệp, tôi kh...",1,0,0,0,0,0,0,0,0,0,1
3,"Tôi muốn bị ốm, và tôi biết điều đó thật khủng...",0,0,0,0,0,0,0,0,0,0,0
4,Trước khi chúng tôi bắt đầu hẹn hò và trong nă...,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the full lexical feature table. In the recorded run below, the
# per-row passes over 20,092 rows took roughly 4 minutes in total, before
# the TF-IDF step.
lexical_df = extract_lexical_features(df)

🔧 Tiền xử lý dữ liệu ...


100%|██████████| 20092/20092 [00:00<00:00, 49056.84it/s]


🧩 Trích xuất các đặc trưng cơ bản ...


100%|██████████| 20092/20092 [00:34<00:00, 575.12it/s] 
100%|██████████| 20092/20092 [00:35<00:00, 570.72it/s] 
100%|██████████| 20092/20092 [01:50<00:00, 181.06it/s]
100%|██████████| 20092/20092 [00:43<00:00, 458.26it/s] 


Sinh TF-IDF N-gram features ...


In [6]:
# Preview the first rows of the feature table (original columns followed by feature columns).
lexical_df.head(5)

Unnamed: 0,Original Text,Label,Emotional Reasoning,Overgeneralization,Should Statements,Personalization,Mental Filter,Disqualifying the Positive,Jumping to Conclusions,Labeling and Mislabeling,...,đến,đều,để,đồng_nghiệp,đột_nhiên,đời,đủ,đứa,ấy,ở
0,"Cô ấy luôn có những hành động kỳ lạ, ví dụ như...",0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390131,0.0
1,"Nếu bố mẹ tôi biết tôi đang vật lộn thế nào, h...",1,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Trong năm thứ 4 và thứ 5 của sự nghiệp, tôi kh...",1,0,0,0,0,0,0,0,0,...,0.099525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Tôi muốn bị ốm, và tôi biết điều đó thật khủng...",0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Trước khi chúng tôi bắt đầu hẹn hò và trong nă...,0,0,0,0,0,0,0,0,0,...,0.134349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14886,0.0


In [7]:
# Write the feature table to the working directory, without the index column.
lexical_df.to_csv("lexical_features.csv", index=False)