In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Đọc dữ liệu

In [3]:
# Read the train / validation / test CSV splits, then stack them into a
# single frame with a fresh 0..n-1 index.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/Original/ViCTSD_train.csv",
        "./Data/Original/ViCTSD_valid.csv",
        "./Data/Original/ViCTSD_test.csv",
    )
)
data = pd.concat([train, val, test], ignore_index=True)

In [4]:
# Keep only the comment text and its label column in every frame.
train, val, test, data = (
    frame[['Comment', 'Constructiveness']]
    for frame in (train, val, test, data)
)

# Định nghĩa thống kê

In [12]:
def get_text_stats(text_series):
    """Compute whitespace-token statistics for a collection of texts.

    Each entry is converted with ``str()`` and split on whitespace.

    Parameters
    ----------
    text_series : iterable
        The texts to analyse (any iterable of values).

    Returns
    -------
    tuple
        ``(avg_len, vocab_size, lengths)`` where ``avg_len`` is the mean
        number of tokens per entry, ``vocab_size`` is the number of distinct
        lower-cased tokens, and ``lengths`` is the per-entry token count list.
    """
    lengths = []
    vocabulary = set()

    # Single pass: record each entry's length and fold its tokens into the vocabulary.
    for entry in text_series:
        words = str(entry).split()
        lengths.append(len(words))
        vocabulary.update(word.lower() for word in words)

    avg_len = np.mean(lengths)
    vocab_size = len(vocabulary)
    return avg_len, vocab_size, lengths

In [25]:
def plot_sentiment_distribution(df, title_suffix="", label_col='Constructiveness', order=None):
    """Draw a bar chart of how many rows carry each value of a label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``label_col``.
    title_suffix : str, optional
        Text appended to the figure title (for example the split name).
    label_col : str, optional
        Column whose values are counted. Defaults to ``'Constructiveness'``.
    order : sequence, optional
        Explicit left-to-right order of the label values on the x axis. When
        omitted, the values observed in ``label_col`` are used in sorted order.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    # Count once; the same counts drive both the bar order and the y-axis limit.
    label_counts = df[label_col].value_counts()
    if order is None and not label_counts.empty:
        order = label_counts.sort_index().index.tolist()

    # Pass the label column as ``x`` so each bar is the count of one label value.
    ax = sns.countplot(data=df,
                       x=label_col,
                       order=order,
                       palette='coolwarm')

    # Annotate every bar with its count.
    for container in ax.containers:
        ax.bar_label(container, padding=3, fontsize=10)

    ax.set_title(f'Sentiment Distribution of {title_suffix}', fontsize=14)

    # Leave 10% headroom above the tallest bar so its label is not clipped.
    if not label_counts.empty:
        ax.set_ylim(0, label_counts.max() * 1.1)

    plt.tight_layout()
    plt.show()


# Thống kê

In [14]:
# Report the average comment length and the vocabulary size for the combined
# data and for each split. One loop replaces four copy-pasted stanzas; the
# printed text is unchanged.
split_frames = [
    ("", data),
    (" Train", train),
    (" Val", val),
    (" Test", test),
]
for position, (split_label, frame) in enumerate(split_frames):
    if position:
        print()  # blank line between reports, none after the last one
    avg_len, vocab_size, lengths = get_text_stats(frame['Comment'])
    print(f"1. Avg. Length{split_label}: {avg_len:.2f}")
    print(f"2. Vocab Size: {vocab_size}")

1. Avg. Length: 29.38
2. Vocab Size: 17945

1. Avg. Length Train: 29.22
2. Vocab Size: 14410

1. Avg. Length Val: 30.19
2. Vocab Size: 7399

1. Avg. Length Test: 28.92
2. Vocab Size: 4581


In [18]:
# Name of the text column passed to get_vocab_set for every split below.
column_name = 'Comment'  

def get_vocab_set(df, col):
    """Return the set of distinct lower-cased whitespace tokens in ``df[col]``.

    Every value in the column is cast to ``str`` before it is lower-cased and
    split on whitespace.
    """
    return {
        token
        for text in df[col].astype(str)
        for token in text.lower().split()
    }

# Build the lower-cased vocabulary of each split.
vocab_train, vocab_val, vocab_test = (
    get_vocab_set(split_df, column_name)
    for split_df in (train, val, test)
)

print(f"Tổng số lượng Vocab trong tập Train: {len(vocab_train)}")

# Out-of-vocabulary tokens: present in val / test but absent from train.
oov_val = vocab_val.difference(vocab_train)
oov_test = vocab_test.difference(vocab_train)

print(f"No. words in Val not in Train set: {len(oov_val)}")
print(f"No. words in Test not in Train set: {len(oov_test)}")

Tổng số lượng Vocab trong tập Train: 14410
No. words in Val not in Train set: 2535
No. words in Test not in Train set: 1101
