## Prepare dataset using `datasets`

In [9]:
from datasets import load_dataset, Dataset
import pandas as pd

In [10]:
# Read the review table from parquet, then keep only the rows whose
# 'Rating' is not 2 and renumber the index so it is contiguous again.
data_address = "/home/yejoon/attention/kr3.parquet"
data = pd.read_parquet(data_address)
data = data[data['Rating'].ne(2)].reset_index(drop=True)

# Wrap the filtered frame as a Hugging Face `Dataset`
raw_dataset = Dataset.from_pandas(data)

In [11]:
# Display the converted dataset's repr (feature names and row count)
raw_dataset

Dataset({
    features: ['Rating', 'Review'],
    num_rows: 459207
})

In [13]:
# Hold out 20% of the rows as the test split; rows are shuffled first,
# and the fixed seed keeps the split reproducible across runs.
dataset = raw_dataset.train_test_split(
    seed=217,
    shuffle=True,
    test_size=0.2,
)
dataset


DatasetDict({
    train: Dataset({
        features: ['Rating', 'Review'],
        num_rows: 367365
    })
    test: Dataset({
        features: ['Rating', 'Review'],
        num_rows: 91842
    })
})

In [14]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below
checkpoint = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenize_func(example):
    """Tokenize the 'Review' field of `example` with the notebook-level `tokenizer`.

    Truncation is enabled; no padding is applied here (padding is left
    to a later step).
    """
    reviews = example['Review']
    encoded = tokenizer(reviews, truncation=True)
    return encoded


# Run the tokenizer over the dataset in batches
tokenized_dataset = dataset.map(tokenize_func, batched=True)

  0%|          | 0/368 [00:00<?, ?ba/s]

  0%|          | 0/92 [00:00<?, ?ba/s]

In [11]:
# Drop the raw 'Review' text (its tokenized form already lives in the other
# columns) and rename 'Rating' to 'labels', the parameter name the
# Hugging Face BERT model needs.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['Review'])
    .rename_column('Rating', 'labels')
)

# Switch the dataset's output format to 'torch' (applied in place)
tokenized_dataset.set_format('torch')


In [12]:
# Display the tokenized splits' repr (feature names and row counts)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 367365
    })
    test: Dataset({
        features: ['labels', 'attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 91842
    })
})

In [13]:
# Persist the tokenized dataset under the relative path 'tokenized'
output_dir = 'tokenized'
tokenized_dataset.save_to_disk(output_dir)