In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [None]:
# Read the three preprocessed ViCTSD splits (train / validation / test).
train, val, test = (
    pd.read_csv("./Data/Preprocessed/ViCTSD_train-clean.csv"),
    pd.read_csv("./Data/Preprocessed/ViCTSD_valid-clean.csv"),
    pd.read_csv("./Data/Preprocessed/ViCTSD_test-clean.csv"),
)

In [3]:
# For every split: the cleaned comment column becomes a plain Python list
# (model input), the 'Constructiveness' column is kept as a Series (target).
X_train, y_train = train['Comment_clean'].tolist(), train['Constructiveness']
X_val, y_val = val['Comment_clean'].tolist(), val['Constructiveness']
X_test, y_test = test['Comment_clean'].tolist(), test['Constructiveness']

# PhoW2V

In [4]:
from gensim.models import KeyedVectors

In [5]:
# Load the pretrained PhoW2V word vectors stored in binary word2vec format.
w2v = KeyedVectors.load_word2vec_format(
    "./PhoW2V/word2vec_vi_words_100dims.bin",
    binary=True,
)

# Report the vocabulary size and the dimensionality of each vector.
print("Số từ vựng:", len(w2v))
print("Kích thước vector:", w2v.vector_size)

Số từ vựng: 1587507
Kích thước vector: 100


In [6]:
def get_phow2v_vectors(texts, w2v_model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Each text is split on whitespace; tokens that are not present in
    ``w2v_model`` are ignored. The remaining word vectors are averaged.

    Parameters
    ----------
    texts : iterable
        The documents to embed. Entries that are not ``str`` (for example the
        float NaN that ``pd.read_csv`` produces for an empty cell, or ``None``)
        are treated as empty text instead of raising ``AttributeError``.
    w2v_model : object
        Word-vector lookup exposing ``vector_size``, ``word in model`` and
        ``model[word]`` (e.g. a gensim ``KeyedVectors``).

    Returns
    -------
    np.ndarray
        ``float32`` array of shape ``(number of texts, w2v_model.vector_size)``.
        A text with no in-vocabulary token yields an all-zero row. An empty
        ``texts`` yields an array of shape ``(0, vector_size)``.
    """
    dim = w2v_model.vector_size
    rows = []
    for text in texts:
        # Only real strings can be tokenised; anything else counts as "no tokens".
        tokens = text.split() if isinstance(text, str) else []
        vecs = [w2v_model[tok] for tok in tokens if tok in w2v_model]
        if vecs:
            rows.append(np.mean(vecs, axis=0))
        else:
            # No known word: fall back to a zero vector of the right width.
            rows.append(np.zeros(dim, dtype=np.float32))

    if not rows:
        # np.vstack raises ValueError on an empty sequence.
        return np.zeros((0, dim), dtype=np.float32)
    return np.vstack(rows).astype(np.float32)

# Embedding

In [7]:
# Embed the three splits in order (train, val, test) with the same PhoW2V model.
X_train_phow2v, X_val_phow2v, X_test_phow2v = (
    get_phow2v_vectors(split_texts, w2v)
    for split_texts in (X_train, X_val, X_test)
)

# Sanity check: print the shape of each embedding matrix.
print("Train phoW2V shape:", X_train_phow2v.shape)
print("Val phoW2V shape:", X_val_phow2v.shape)
print("Test phoW2V shape:", X_test_phow2v.shape)

Train phoW2V shape: (7000, 100)
Val phoW2V shape: (2000, 100)
Test phoW2V shape: (1000, 100)


In [None]:
# Append the label column to each embedding matrix and export it as CSV.
# The label index is reset so it lines up with the fresh 0..n-1 index of the
# DataFrame built from the embedding array.

# Train split
df_phow2v_train = pd.concat(
    [pd.DataFrame(X_train_phow2v), y_train.reset_index(drop=True)],
    axis="columns",
)
df_phow2v_train.to_csv("./Data/Embedding/ViCTSD_train-phoW2V.csv", index=False)

# Validation split
df_phow2v_val = pd.concat(
    [pd.DataFrame(X_val_phow2v), y_val.reset_index(drop=True)],
    axis="columns",
)
df_phow2v_val.to_csv("./Data/Embedding/ViCTSD_val-phoW2V.csv", index=False)

# Test split
df_phow2v_test = pd.concat(
    [pd.DataFrame(X_test_phow2v), y_test.reset_index(drop=True)],
    axis="columns",
)
df_phow2v_test.to_csv("./Data/Embedding/ViCTSD_test-phoW2V.csv", index=False)