In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import LlamaConfig, LlamaForCausalLM

# Assuming the model is already defined as per your existing script
import json

with open('configs/config.json', 'r') as file:
    config_dict = json.load(file)

# Create the configuration object
config = LlamaConfig.from_dict(config_dict)

# Initialize the LlamaForCausalLM model
model = LlamaForCausalLM(config)


def tokenize_function(examples, tokenizer, max_length=1024):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

class SlimPajamaDataset(Dataset):
    """Dataset for SlimPajama with lazy loading."""

    def __init__(self, file_paths, tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        data = pd.read_parquet(file_path)['text'].tolist()
        tokenized_data = tokenize_function({"text": data}, self.tokenizer)
        return {key: torch.tensor(val) for key, val in tokenized_data.items()}

def get_file_paths(dataset_path, prefix):
    return [os.path.join(dataset_path, file) for file in os.listdir(dataset_path) if file.startswith(prefix)]

# Main
dataset_path = '/home/shawn/nvme/vl_research/jerry-agent/SlimPajama-6B/data'
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")

# File paths for lazy loading
train_file_paths = [get_file_paths(dataset_path, 'train')[0]]
valid_file_paths = get_file_paths(dataset_path, 'valid')
test_file_paths = get_file_paths(dataset_path, 'test')






  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataset = SlimPajamaDataset(train_file_paths, tokenizer)


In [3]:
valid_dataset = SlimPajamaDataset(valid_file_paths, tokenizer)


In [4]:
test_dataset = SlimPajamaDataset(test_file_paths, tokenizer)

In [5]:
# # Datasets
# train_dataset = SlimPajamaDataset(train_file_paths, tokenizer)
# valid_dataset = SlimPajamaDataset(valid_file_paths, tokenizer)
# test_dataset = SlimPajamaDataset(test_file_paths, tokenizer)

# Huggingface Trainer setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=50,  # Log every 50 steps
    report_to="all"  # or use "none" if you don't want to report metrics
)

In [6]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator
)

# Train
trainer.train()



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwel019[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 806, in forward
    outputs = self.model(
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 623, in forward
    batch_size, seq_length = input_ids.shape
ValueError: too many values to unpack (expected 2)


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f2fd4374340>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f2fd43a5eb0, execution_count=7 error_before_exec=None error_in_exec=Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 806, in forward
    outputs = self.model(
  File "/home/shawn/anaconda3/envs/bliva/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwar

TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [None]:
# Evaluate
trainer.evaluate()