In [1]:
import os

import numpy as np
import pandas as pd

# Load Dataset

In [11]:
# Read the three cleaned splits (train / validation / test) in one pass.
split_paths = (
    "Data/Preprocessed/1-train-clean.csv",
    "Data/Preprocessed/2-val-clean.csv",
    "Data/Preprocessed/3-test-clean.csv",
)
train, dev, test = map(pd.read_csv, split_paths)

In [12]:
# For every split: the review text becomes a plain Python list (X_*),
# and all remaining columns are kept together as a DataFrame (y_*).
TEXT_COLUMN = 'review_clean'

X_train, y_train = train[TEXT_COLUMN].tolist(), train.drop(columns=[TEXT_COLUMN])
X_dev, y_dev = dev[TEXT_COLUMN].tolist(), dev.drop(columns=[TEXT_COLUMN])
X_test, y_test = test[TEXT_COLUMN].tolist(), test.drop(columns=[TEXT_COLUMN])

# PhoW2V

In [4]:
from gensim.models import KeyedVectors

In [5]:
# Load the word vectors stored in binary word2vec format.
phow2v_path = "./PhoW2V/word2vec_vi_words_100dims.bin"
w2v = KeyedVectors.load_word2vec_format(phow2v_path, binary=True)

# Report the vocabulary size and the embedding dimensionality.
vocab_size, embedding_dim = len(w2v), w2v.vector_size
print("Số từ vựng:", vocab_size)
print("Kích thước vector:", embedding_dim)

Số từ vựng: 1587507
Kích thước vector: 100


In [6]:
def get_phow2v_vectors(texts, w2v_model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Every text is split on whitespace; tokens that are present in
    ``w2v_model`` are looked up and averaged into a single vector.

    Parameters
    ----------
    texts : iterable
        The documents to embed. Entries that are not ``str`` (for example
        ``None`` or the float NaN that ``pd.read_csv`` produces for an empty
        cell) are treated as empty text instead of raising.
    w2v_model : mapping-like
        Object exposing ``vector_size``, ``token in model`` and
        ``model[token]`` (e.g. a gensim ``KeyedVectors``).

    Returns
    -------
    np.ndarray
        ``float32`` array of shape ``(number of texts, vector_size)``. A text
        with no in-vocabulary token maps to the all-zero vector. An empty
        ``texts`` yields an array of shape ``(0, vector_size)``.
    """
    dim = w2v_model.vector_size
    out = []
    for text in texts:
        # Non-string entries (NaN / None from missing reviews) have no tokens.
        toks = text.split() if isinstance(text, str) else []
        vecs = [w2v_model[w] for w in toks if w in w2v_model]
        out.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim, dtype=np.float32))
    if not out:
        # np.vstack rejects an empty sequence; return a well-shaped empty matrix instead.
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack(out).astype(np.float32)

# Embedding

In [13]:
# Embed the three splits with the same word-vector model.
X_train_phow2v, X_dev_phow2v, X_test_phow2v = (
    get_phow2v_vectors(split_texts, w2v) for split_texts in (X_train, X_dev, X_test)
)

# Show the resulting matrix shape of each split.
for shape_caption, embedded_split in (
    ("Train phoW2V shape:", X_train_phow2v),
    ("Dev phoW2V shape:", X_dev_phow2v),
    ("Test phoW2V shape:", X_test_phow2v),
):
    print(shape_caption, embedded_split.shape)

Train phoW2V shape: (1658, 100)
Dev phoW2V shape: (359, 100)
Test phoW2V shape: (372, 100)


In [12]:
# Write one CSV per split: embedding columns first, then the y_* columns.
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (otherwise a fresh checkout fails with OSError).
os.makedirs("./Data/Embedding", exist_ok=True)

# reset_index(drop=True) replaces the y_* index with a default 0..n-1 index, matching
# the default index of the freshly built embedding frame for the axis=1 concat.
df_phow2v_train = pd.concat([pd.DataFrame(X_train_phow2v), y_train.reset_index(drop=True)], axis=1)
df_phow2v_train.to_csv("./Data/Embedding/1-train-phoW2V.csv", index=False)

df_phow2v_dev = pd.concat([pd.DataFrame(X_dev_phow2v), y_dev.reset_index(drop=True)], axis=1)
df_phow2v_dev.to_csv("./Data/Embedding/2-val-phoW2V.csv", index=False)

df_phow2v_test = pd.concat([pd.DataFrame(X_test_phow2v), y_test.reset_index(drop=True)], axis=1)
df_phow2v_test.to_csv("./Data/Embedding/3-test-phoW2V.csv", index=False)