In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import transformers
import pandas as pd
from torch.utils.data import Dataset

def preprocess(RAW_DATA, TOKENIZER, MAX_LENGTH=1024):
    """Run ``TOKENIZER`` over ``RAW_DATA`` with fixed-length settings.

    Args:
        RAW_DATA: text input passed to the tokenizer as its single
            positional argument.
        TOKENIZER: callable invoked with truncation enabled,
            ``padding="max_length"``, ``max_length=MAX_LENGTH`` and
            ``return_tensors="pt"``.
        MAX_LENGTH: value forwarded as ``max_length`` (default 1024).

    Returns:
        Whatever ``TOKENIZER`` returns for that call.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
        "return_tensors": "pt",
    }
    return TOKENIZER(RAW_DATA, **encode_options)
    # tokenized_datasets = {}
    # for text in RAW_DATA:
    #     tokenized_datasets[key] = shuffled_datasets[key].map(
    #         lambda x: tokenizer(
    #             x["text"],
    #             truncation=True,
    #             padding="max_length",
    #             max_length=MAX_LENGTH,
    #             return_tensors="pt",
    #         )
    #     )

#     return tokenized_datasets
# def make_supervised_data_module(
#     tokenizer: transformers.PreTrainedTokenizer, data_args
# ) -> Dict:
#     """Make dataset and collator for supervised fine-tuning."""
#     dataset_cls = (
#         LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
#     )
#     rank0_print("Loading data...")

#     train_json = json.load(open(data_args.data_path, "r"))
#     train_dataset = dataset_cls(train_json, tokenizer=tokenizer)

#     if data_args.eval_data_path:
#         eval_json = json.load(open(data_args.eval_data_path, "r"))
#         eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer)
#     else:
#         eval_dataset = None

#     return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    All of ``raw_data`` is tokenized once, up front, through ``preprocess``;
    the resulting tensors are kept on the instance and indexed per example.

    Attributes:
        input_ids: token ids produced by the tokenizer, one row per example.
        attention_mask: tokenizer attention mask matching ``input_ids``.
        labels: copy of ``input_ids`` in which every position with
            ``attention_mask == 0`` is replaced by ``IGNORE_TOKEN_ID``.
    """

    # Target value skipped by the loss (the default ``ignore_index`` of
    # ``torch.nn.CrossEntropyLoss``).
    IGNORE_TOKEN_ID = -100

    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
        """Tokenize ``raw_data`` and build the label tensor.

        Args:
            raw_data: iterable of texts; materialized into a list before
                being handed to ``preprocess``.
            tokenizer: tokenizer forwarded to ``preprocess``.
        """
        super().__init__()

        sources = list(raw_data)
        data_dict = preprocess(RAW_DATA=sources, TOKENIZER=tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.attention_mask = data_dict["attention_mask"]
        # Every example is padded to a fixed length, so labels must not simply
        # alias input_ids: padded positions would be scored as real targets.
        # masked_fill returns a new tensor, leaving input_ids untouched.
        self.labels = self.input_ids.masked_fill(
            self.attention_mask == 0, self.IGNORE_TOKEN_ID
        )

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i):
        """Return the ``input_ids``/``labels``/``attention_mask`` for example ``i``."""
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )

In [2]:
# Load the single test shard on its own for a quick look at the data.
data1 = pd.read_parquet(
    "/home/shawn/nvme/vl_research/jerry-agent/Revisiting-Vicuna/SlimPajama-6B/data/"
    "test-00000-of-00001-9f769cf7ce219017.parquet"
)

In [3]:
# Show the raw "text" column of the test shard as a NumPy array.
data1["text"].to_numpy()

array(['Regional NYC to Embark on Annual Count of Street Homeless\nNYC to Embark on Annual Count of Street Homeless\nMonday, February 8, 2016 at 2:48 am | כ"ט שבט תשע"ו\nA homeless man sleeps on the shuttle train between Times Square and Grand Central Terminal in Manhattan, New York, Jan. 23. (Reuters/Carlo Allegri)\nThousands of volunteers will fan out across New York City for the city\'s annual one-night count of homeless people living on the streets.\nThe canvass on Monday night, known as the HOPE Count, is getting extra attention this year as the nation\'s largest city grapples with an uptick in homelessness that has dominated front pages and consumed Mayor Bill de Blasio\'s City Hall.\nCity officials have lined up more than 3,500 volunteers who will receive a couple of hours training before being dispatched to the city\'s streets and subway stations. A year ago, volunteers counted 3,182 people living unsheltered.\nMany city officials have publicly said they believe the number is h

In [4]:
import os
from tqdm import tqdm

# Directory with the SlimPajama-6B parquet shards (relative to the working directory).
dir_path = "Revisiting-Vicuna/SlimPajama-6B/data"
# os.listdir returns entries in arbitrary order; sort so that the order in
# which shards are concatenated is the same on every run and machine.
data_path = sorted(os.listdir(dir_path))
train_raw_data, valid_raw_data, test_raw_data = [], [], []

for path in tqdm(data_path):
    # Skip anything that is not one of the three splits *before* reading it,
    # so stray files in the directory are neither parsed nor able to crash the loop.
    if not path.startswith(("test", "train", "valid")):
        continue
    # Only the "text" column is used, so avoid loading the other columns.
    data = pd.read_parquet(os.path.join(dir_path, path), columns=["text"])["text"].to_list()
    if path.startswith("test"):
        test_raw_data.extend(data)
    elif path.startswith("train"):
        train_raw_data.extend(data)
    else:
        valid_raw_data.extend(data)

100%|██████████| 50/50 [00:41<00:00,  1.20it/s]


In [5]:
# Number of raw texts in each split, in (train, valid, test) order.
tuple(len(split) for split in (train_raw_data, valid_raw_data, test_raw_data))

(5489000, 9347, 9346)

In [8]:
# Tokenizer shared by all three splits.
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")

# NOTE: despite the "*_data_loader" names these are SupervisedDataset
# instances; each constructor call tokenizes its whole split.
train_data_loader = SupervisedDataset(raw_data=train_raw_data, tokenizer=tokenizer)
valid_data_loader = SupervisedDataset(raw_data=valid_raw_data, tokenizer=tokenizer)
test_data_loader = SupervisedDataset(raw_data=test_raw_data, tokenizer=tokenizer)

In [7]:
# Number of examples in the tokenized test split.
len(test_data_loader)

9346