In [2]:
import pandas as pd
import numpy as np

# Load Dataset

In [3]:
# Read the train, dev and test CSVs from the Preprocessed folder, one DataFrame per split.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./VLSP2018_Hotel/Preprocessed/1-VLSP2018-SA-Hotel-train-clean.csv",
        "./VLSP2018_Hotel/Preprocessed/2-VLSP2018-SA-Hotel-dev-clean.csv",
        "./VLSP2018_Hotel/Preprocessed/3-VLSP2018-SA-Hotel-test-clean.csv",
    )
)

In [4]:
# For every split: X is the 'review_clean' column as a plain Python list,
# y is the same frame with that column removed (i.e. all remaining columns).
X_train, y_train = train['review_clean'].to_list(), train.drop('review_clean', axis=1)

X_dev, y_dev = dev['review_clean'].to_list(), dev.drop('review_clean', axis=1)

X_test, y_test = test['review_clean'].to_list(), test.drop('review_clean', axis=1)

# PhoW2V

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load the word vectors stored in binary word2vec format, then report
# how many entries the model holds and the dimensionality of each vector.
w2v = KeyedVectors.load_word2vec_format(
    "./PhoW2V/word2vec_vi_words_100dims.bin",
    binary=True,
)
vocab_size, vector_dim = len(w2v), w2v.vector_size
print("Số từ vựng:", vocab_size)
print("Kích thước vector:", vector_dim)

Số từ vựng: 1587507
Kích thước vector: 100


In [9]:
def get_phow2v_vectors(texts, w2v_model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Every text is split on whitespace and tokens that are not present in
    ``w2v_model`` are skipped. A text without any known token maps to an
    all-zero vector; this includes empty strings and non-string entries
    (e.g. ``None`` or the NaN pandas produces for blank CSV cells).

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace.
        w2v_model: Word-vector lookup exposing ``vector_size``, membership
            testing (``word in model``) and indexing (``model[word]``),
            such as a gensim ``KeyedVectors``.

    Returns:
        A float32 ``np.ndarray`` with one row per text and ``vector_size``
        columns. Empty ``texts`` yields an array of shape ``(0, vector_size)``.
    """
    dim = w2v_model.vector_size
    rows = []
    for text in texts:
        # Non-string entries have nothing to split; treat them as token-less.
        tokens = text.split() if isinstance(text, str) else []
        known = [w2v_model[tok] for tok in tokens if tok in w2v_model]
        rows.append(np.mean(known, axis=0) if known else np.zeros(dim, dtype=np.float32))

    if not rows:
        # np.vstack raises ValueError on an empty list, so build the empty
        # result explicitly while keeping the column count consistent.
        return np.zeros((0, dim), dtype=np.float32)
    return np.vstack(rows).astype(np.float32)

# PhoBERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# The tokenizer and the encoder are both loaded from the same checkpoint name.
tokenizer, model = (
    loader.from_pretrained("vinai/phobert-base")
    for loader in (AutoTokenizer, AutoModel)
)

**PhoBERT: mean of the last 4 hidden layers**

In [23]:
def get_phobert_vectors(texts, tokenizer, model, batch_size=16, max_length=128, device=None):
    """Encode texts as sentence vectors from the mean of the last 4 hidden layers.

    For every batch, the last four entries of the model's ``hidden_states``
    are averaged element-wise, and the resulting token vectors are
    mean-pooled over the positions where ``attention_mask`` is 1.

    The model is moved to ``device`` and switched to eval mode. Its config is
    not modified: hidden states are requested per forward call only.

    Args:
        texts: Sequence of input strings (must support ``len`` and slicing).
        tokenizer: Callable tokenizer returning a mapping of tensors that
            includes ``"attention_mask"``.
        model: Encoder whose output exposes ``hidden_states`` when called
            with ``output_hidden_states=True``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Truncation length passed to the tokenizer.
        device: Device to run on (string or ``torch.device``). ``None``
            keeps the previous behaviour of running on CPU.

    Returns:
        A float32 ``np.ndarray`` of shape ``(len(texts), H)`` where ``H`` is
        the model's hidden size. Empty ``texts`` yields shape
        ``(0, model.config.hidden_size)``.
    """
    device = torch.device("cpu") if device is None else torch.device(device)
    model.to(device).eval()

    vecs = []
    with torch.inference_mode():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            toks = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            toks = {k: v.to(device) for k, v in toks.items()}

            # Request every layer's hidden states instead of only the last one.
            outputs = model(**toks, output_hidden_states=True, return_dict=True)
            last_four = outputs.hidden_states[-4:]
            # Average the four layers -> [B, T, H]; T is the padded length of
            # this batch (longest sequence, capped by max_length).
            hidden = torch.stack(last_four, dim=0).mean(0)

            # Mean-pool token vectors using the attention mask so padding
            # positions do not contribute to the sentence vector.
            mask = toks["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # [B, T, 1]
            # clamp guards against division by zero for an all-padding row.
            sent = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)      # [B, H]

            vecs.append(sent.cpu().numpy().astype(np.float32))

    if not vecs:
        # np.vstack raises ValueError on an empty list.
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(vecs)


**PhoBERT: concatenation of the last 4 hidden layers**

In [7]:
def get_phobert_vectors_concat(texts, tokenizer, model, batch_size=16, max_length=128, device=None):
    """Encode texts as sentence vectors from the concatenated last 4 hidden layers.

    For every batch, the last four entries of the model's ``hidden_states``
    are concatenated along the feature axis, and the resulting token vectors
    are mean-pooled over the positions where ``attention_mask`` is 1.

    The model is moved to ``device`` and switched to eval mode. Its config is
    not modified: hidden states are requested per forward call only.

    Args:
        texts: Sequence of input strings (must support ``len`` and slicing).
        tokenizer: Callable tokenizer returning a mapping of tensors that
            includes ``"attention_mask"``.
        model: Encoder whose output exposes ``hidden_states`` when called
            with ``output_hidden_states=True``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Truncation length passed to the tokenizer.
        device: Device to run on (string or ``torch.device``). ``None``
            keeps the previous behaviour of running on CPU.

    Returns:
        A float32 ``np.ndarray`` of shape ``(len(texts), 4 * H)`` where ``H``
        is the model's hidden size (assuming the model returns at least four
        hidden states). Empty ``texts`` yields shape
        ``(0, 4 * model.config.hidden_size)``.
    """
    device = torch.device("cpu") if device is None else torch.device(device)
    model.to(device).eval()

    vecs = []
    with torch.inference_mode():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            toks = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            toks = {k: v.to(device) for k, v in toks.items()}

            # Request every layer's hidden states instead of only the last one.
            outputs = model(**toks, output_hidden_states=True, return_dict=True)
            last_four = outputs.hidden_states[-4:]
            # Join the four layers along the feature axis -> [B, T, 4H]; T is
            # the padded length of this batch (longest sequence, capped by max_length).
            hidden = torch.cat(last_four, dim=-1)

            # Mean-pool token vectors using the attention mask so padding
            # positions do not contribute to the sentence vector.
            mask = toks["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # [B, T, 1]
            # clamp guards against division by zero for an all-padding row.
            sent = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)      # [B, 4H]

            vecs.append(sent.cpu().numpy().astype(np.float32))

    if not vecs:
        # np.vstack raises ValueError on an empty list.
        return np.zeros((0, 4 * model.config.hidden_size), dtype=np.float32)
    return np.vstack(vecs)

# Embedding

## PhoW2V

In [52]:
# Embed the three splits with the same word-vector model, in train/dev/test order.
X_train_phow2v, X_dev_phow2v, X_test_phow2v = (
    get_phow2v_vectors(split_texts, w2v)
    for split_texts in (X_train, X_dev, X_test)
)

# Report the (rows, columns) of each embedding matrix.
print("Train phoW2V shape:", X_train_phow2v.shape)
print("Dev phoW2V shape:", X_dev_phow2v.shape)
print("Test phoW2V shape:", X_test_phow2v.shape)

Train phoW2V shape: (3000, 100)
Dev phoW2V shape: (2000, 100)
Test phoW2V shape: (600, 100)


In [None]:
# Export disabled: uncomment the lines below to write the PhoW2V embeddings plus labels to CSV.
# df_phow2v_train = pd.concat([pd.DataFrame(X_train_phow2v), y_train.reset_index(drop=True)], axis=1)
# df_phow2v_train.to_csv("./VLSP2018_Hotel/Embedding/phoW2V_train.csv", index=False)

# df_phow2v_dev = pd.concat([pd.DataFrame(X_dev_phow2v), y_dev.reset_index(drop=True)], axis=1)
# df_phow2v_dev.to_csv("./VLSP2018_Hotel/Embedding/phoW2V_dev.csv", index=False)

# df_phow2v_test = pd.concat([pd.DataFrame(X_test_phow2v), y_test.reset_index(drop=True)], axis=1)
# df_phow2v_test.to_csv("./VLSP2018_Hotel/Embedding/phoW2V_test.csv", index=False)

## PhoBERT: mean of the last 4 hidden layers

In [None]:
# Encode the three splits with identical settings, in train/dev/test order.
X_train_phoBERT, X_dev_phoBERT, X_test_phoBERT = (
    get_phobert_vectors(split_texts, tokenizer, model, batch_size=16, max_length=128)
    for split_texts in (X_train, X_dev, X_test)
)

# Report the (rows, columns) of each embedding matrix.
print("Train phoBERT shape:", X_train_phoBERT.shape)
print("Dev phoBERT shape:", X_dev_phoBERT.shape)
print("Test phoBERT shape:", X_test_phoBERT.shape)

Train phoBERT shape: (3000, 768)
Dev phoBERT shape: (2000, 768)
Test phoBERT shape: (600, 768)


In [None]:
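# Export disabled: uncomment the lines below to write the PhoBERT mean embeddings plus labels to CSV.
# df_phoBERT_train = pd.concat([pd.DataFrame(X_train_phoBERT), y_train.reset_index(drop=True)], axis=1)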
# df_phoBERT_train.to_csv("./VLSP2018_Hotel/Embedding/phoBERT_mean_train.csv", index=False)

# df_phoBERT_dev = pd.concat([pd.DataFrame(X_dev_phoBERT), y_dev.reset_index(drop=True)], axis=1)
# df_phoBERT_dev.to_csv("./VLSP2018_Hotel/Embedding/phoBERT_mean_dev.csv", index=False)

# df_phoBERT_test = pd.concat([pd.DataFrame(X_test_phoBERT), y_test.reset_index(drop=True)], axis=1)
# df_phoBERT_test.to_csv("./VLSP2018_Hotel/Embedding/phoBERT_mean_test.csv", index=False)

## PhoBERT: concatenation of the last 4 hidden layers

In [8]:
# Encode the three splits with identical settings, in train/dev/test order.
X_train_phoBERT_concat, X_dev_phoBERT_concat, X_test_phoBERT_concat = (
    get_phobert_vectors_concat(split_texts, tokenizer, model, batch_size=16, max_length=128)
    for split_texts in (X_train, X_dev, X_test)
)

# Report the (rows, columns) of each embedding matrix.
print("Train phoBERT shape:", X_train_phoBERT_concat.shape)
print("Dev phoBERT shape:", X_dev_phoBERT_concat.shape)
print("Test phoBERT shape:", X_test_phoBERT_concat.shape)

Train phoBERT shape: (3000, 3072)
Dev phoBERT shape: (2000, 3072)
Test phoBERT shape: (600, 3072)


In [None]:
# For each split, place the embedding columns next to the label columns
# (label index reset so rows align by position) and write the frame to CSV
# without the index column.
df_phoBERT_concat_train = pd.concat(
    [pd.DataFrame(X_train_phoBERT_concat), y_train.reset_index(drop=True)],
    axis="columns",
)
df_phoBERT_concat_train.to_csv(
    "./VLSP2018_Hotel/Embedding/phoBERT_concat_train.csv",
    index=False,
)

df_phoBERT_concat_dev = pd.concat(
    [pd.DataFrame(X_dev_phoBERT_concat), y_dev.reset_index(drop=True)],
    axis="columns",
)
df_phoBERT_concat_dev.to_csv(
    "./VLSP2018_Hotel/Embedding/phoBERT_concat_dev.csv",
    index=False,
)

df_phoBERT_concat_test = pd.concat(
    [pd.DataFrame(X_test_phoBERT_concat), y_test.reset_index(drop=True)],
    axis="columns",
)
df_phoBERT_concat_test.to_csv(
    "./VLSP2018_Hotel/Embedding/phoBERT_concat_test.csv",
    index=False,
)