# **Read reviews data**

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('reviews.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8697 entries, 0 to 8696
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  8697 non-null   int64 
 1   product_id   8697 non-null   int64 
 2   rating       8697 non-null   int64 
 3   content      2928 non-null   object
 4   title        8697 non-null   object
 5   thank_count  8697 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 407.8+ KB


In [4]:
df.head()

Unnamed: 0,customer_id,product_id,rating,content,title,thank_count
0,2119875,12416734,5,,Cực kì hài lòng,0
1,22628730,12416734,4,Đóng gói ok . Chất lượng OK. Nội dung ko hay lắm,Hài lòng,0
2,21351313,12416734,5,cực hài lòng. giao sớm hơn dự kiến. sách chất ...,Cực kì hài lòng,0
3,14148210,12416734,5,,Cực kì hài lòng,0
4,522878,12416734,4,,Hài lòng,0


# **Tokenize 'content' within** *VisoBERT*

In [5]:
!pip install transformers SentencePiece



In [6]:
from transformers import AutoModel, AutoTokenizer
import torch

model= AutoModel.from_pretrained('uitnlp/visobert')
tokenizer = AutoTokenizer.from_pretrained('uitnlp/visobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]

In [9]:
import re
def clean_social(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # drop URLs and standalone @handles, keep emojis/teencode/diacritics
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'(^|\s)@\w+\b', ' ', text)
    return text.strip()

df['content'] = df['content'].apply(lambda x: clean_social(x))

In [14]:
df['content'].isna().count()

np.int64(8697)

In [10]:
MAX_LEN = 128

encoded = tokenizer(
    df["content"].fillna("").tolist(),
    padding="longest",
    truncation=True,
    max_length=MAX_LEN,
    return_attention_mask=True,
)

df["input_ids"] = [ids for ids in encoded["input_ids"]]
df["attention_mask"] = [m for m in encoded["attention_mask"]]

df["tokens"] = [tokenizer.convert_ids_to_tokens(ids) for ids in df["input_ids"]]

In [18]:
#if need later
from torch.utils.data import Dataset

class CommentDataset(Dataset):
    def __init__(self, df, text_col="comment_clean", max_len=128):
        self.texts = df[text_col].fillna("").tolist()
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, i):
        enc = tokenizer(
            self.texts[i],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in enc.items()}

In [20]:
df['thank_count'].unique()

array([ 0,  1,  3,  2,  4, 18,  8, 19, 52, 12,  6, 13, 15])

In [21]:
df['rating'].unique()

array([5, 4, 3, 2, 1])

In [22]:
df.drop(columns='title', inplace=True)

In [23]:
df.head()

Unnamed: 0,customer_id,product_id,rating,content,thank_count,input_ids,attention_mask,tokens
0,2119875,12416734,5,,0,"[0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[<s>, </s>, <pad>, <pad>, <pad>, <pad>, <pad>,..."
1,22628730,12416734,4,Đóng gói ok . Chất lượng OK. Nội dung ko hay lắm,0,"[0, 1422, 462, 387, 140, 1162, 414, 2477, 1466...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[<s>, ▁Đóng, ▁gói, ▁ok, ▁., ▁Chất, ▁lượng, ▁OK..."
2,21351313,12416734,5,cực hài lòng. giao sớm hơn dự kiến. sách chất ...,0,"[0, 788, 743, 510, 14660, 306, 892, 231, 1213,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[<s>, ▁cực, ▁hài, ▁lòng, ., ▁giao, ▁sớm, ▁hơn,..."
3,14148210,12416734,5,,0,"[0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[<s>, </s>, <pad>, <pad>, <pad>, <pad>, <pad>,..."
4,522878,12416734,4,,0,"[0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[<s>, </s>, <pad>, <pad>, <pad>, <pad>, <pad>,..."


# **Save tokenized comment data**

In [24]:
df.to_csv('tokenized_comments.csv', index=False)

In [19]:
# !pip install pandas pyarrow



In [None]:
# df.to_parquet("comments_data.parquet", engine='pyarrow')
# df_loaded = pd.read_parquet("comments_data.parquet", engine='pyarrow')