In [1]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("/mnt/cluster_storage/vicuna-13b/")

In [2]:
NUM_WORKERS = 16
BATCH_SIZE_PER_WORKER = 4

In [3]:
import ray
import os

@ray.remote(num_gpus=1)
def download_vicuna_13b(rank):
    if not os.path.exists("/tmp/vicuna-13b"):
        print(f"{rank}: Downloading vicuna model")
        os.system("aws s3 sync s3://large-dl-models-mirror/restricted/models--lmsys--vicuna-13b-delta-v1.1/main-safetensors/ /tmp/vicuna-13b >NUL 2>&1")
    print(f"{rank}: Finished")
    return True


tasks = [download_vicuna_13b.remote(i) for i in range(NUM_WORKERS)]
ray.get(tasks)

2023-06-23 23:38:59,574	INFO worker.py:1426 -- Connecting to existing Ray cluster at address: 10.0.104.253:6379...
2023-06-23 23:38:59,639	INFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-vzyh3916u4zwmf1es6fazmbrgm.i.anyscaleuserdata-staging.com [39m[22m
2023-06-23 23:38:59,642	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_71de2ab32420c3943f2aef46eb349233.zip' (0.05MiB) to Ray cluster...
2023-06-23 23:38:59,643	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_71de2ab32420c3943f2aef46eb349233.zip'.


[2m[36m(download_vicuna_13b pid=5790, ip=10.0.123.219)[0m 3: Finished


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
config = AutoConfig.from_pretrained("/tmp/vicuna-13b")
HIDDEN_SIZE = config.hidden_size

In [5]:
import ray
from ray.air.config import CheckpointConfig, RunConfig, ScalingConfig
import torch
import transformers

from datasets import load_dataset
from ray.data.preprocessors import BatchMapper, Chain
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from deepspeed.ops.adam import DeepSpeedCPUAdam

# MODEL_NAME = "lmsys/vicuna-13b-delta-v1.1"
# MODEL_NAME = "EleutherAI/gpt-j-6B"
# MODEL_NAME = "tiiuae/falcon-7b"
# MODEL_NAME = "mosaicml/mpt-7b"
MODEL_NAME = "/tmp/vicuna-13b"

# hf_dataset = load_dataset("xtreme", "MLQA.en.en")
hf_dataset = load_dataset("cosmos_qa")["train"]
ray_dataset = ray.data.from_huggingface(hf_dataset).limit(8000)

PROMPT_TEMPLATE = """
Context: {context}
Question: {question}
Based on the context, the answer to the question would be: {answer} </s>
"""

def fill_prompt(batch):
    batch["input_sentence"] = batch.apply(
        lambda row: PROMPT_TEMPLATE.format(
            context=row["context"],
            question=row["question"],
            answer=row["answer0"]
        ),
        axis=1,
    )
    return batch[["input_sentence"]]


def tokenize(batch):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["input_sentence"]),
        truncation=True,
        max_length=180,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    return dict(ret)


prompt_mapper = BatchMapper(fill_prompt, batch_format="pandas")
tokenize_mapper = BatchMapper(tokenize, batch_format="pandas")
preprocessor = Chain(prompt_mapper, tokenize_mapper)

import torch
import pytorch_lightning as pl
import transformers.deepspeed


class ZeRO3Config:
    def __init__(self, pl_module):
        self.config = pl_module.trainer.strategy.config

    def __call__(self, *args, **kwargs):
        return self

    def is_zero3(self) -> bool:
        return True


def enable_transformers_pretrained_deepspeed_sharding(
    pl_module: "pl.LightningModule",
) -> None:
    transformers.deepspeed._hf_deepspeed_config_weak_ref = ZeRO3Config(pl_module)

from accelerate import init_empty_weights


class Falcon7BModel(pl.LightningModule):
    def __init__(self, inference=False):
        super().__init__()
        torch.backends.cuda.matmul.allow_tf32 = True
        if inference:
            with init_empty_weights():
                config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
                self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
            self.model.tie_weights()
        # else:
        #     enable_transformers_pretrained_deepspeed_sharding(self)
        #     self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
        #     self.model.tie_weights()
        
    def setup(self, stage) -> None:
        if not hasattr(self, "model"):
            print("Config :", self.trainer.strategy.config)
            enable_transformers_pretrained_deepspeed_sharding(self)
            self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
            print(self.model)

    def forward(self, batch):
        outputs = self.model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        return outputs.loss

    def training_step(self, batch, batch_idx):
        torch.cuda.empty_cache()
        loss = self.forward(batch)
        self.log("train_loss", loss, prog_bar=True, on_step=True, sync_dist=True)
        return loss

    def configure_optimizers(self):
        return DeepSpeedCPUAdam(self.parameters(), lr=1e-5, fp32_optimizer_states=False)

[2023-06-23 23:39:03,430] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Found cached dataset cosmos_qa (/home/ray/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


  0%|          | 0/3 [00:00<?, ?it/s]


Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


In [6]:
from pytorch_lightning.callbacks import TQDMProgressBar


# Create a customized progress bar for LightningTrainer
class FalconProgressBar(TQDMProgressBar):
    def __init__(self, num_iters_per_epoch, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_iters_per_epoch = num_iters_per_epoch

    def on_train_epoch_start(self, trainer, *_):
        super().on_train_epoch_start(trainer, *_)
        self.train_progress_bar.reset(self.num_iters_per_epoch)


total_batches = ray_dataset.count()
num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)
progress_bar = FalconProgressBar(num_iters_per_epoch)

2023-06-23 23:39:06,846	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> LimitOperator[limit=8000]
2023-06-23 23:39:06,847	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-06-23 23:39:06,847	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
from ray.train.lightning import LightningTrainer, LightningConfigBuilder

# GROUP_SIZE = 1.7e7
GROUP_SIZE = 1e8

deepspeed_configs = {
    "zero_allow_untested_optimizer": True,
    # "fp16": {
    #     "enabled": True,
    #     "initial_scale_power": 8,
    # },
    "bf16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        # "offload_param": {
        #     "device": "cpu",
        #     "pin_memory": True
        # },
        "overlap_comm": True,
        "contiguous_gradients": True,
        # "sub_group_size": GROUP_SIZE,
        "reduce_bucket_size": HIDDEN_SIZE * HIDDEN_SIZE,
        "stage3_prefetch_bucket_size": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,
        "stage3_param_persistence_threshold": 10 * HIDDEN_SIZE,
        # "stage3_max_live_parameters": 2e8,
        # "stage3_max_reuse_distance": 2e8,
    },
    # "activation_checkpointing":{
    #     "partition_activations":True,
    #     "cpu_checkpointing":True,
    # },
    # "autotuning": {"enabled": True}
}

lightning_config = (
    LightningConfigBuilder()
    .module(cls=Falcon7BModel)
    .trainer(
        max_epochs=1, accelerator="gpu", precision="bf16-mixed", callbacks=[progress_bar], accumulate_grad_batches=2
    )
    .strategy(
        name="deepspeed",
        # stage=3,
        config=deepspeed_configs
        # offload_optimizer=True,
        # offload_parameters=True,
        # offload_params_device="cpu",
        # offload_optimizer_device="cpu",
        # partition_activations=True,
        # cpu_checkpointing=True,
        # contiguous_gradients=True,
        # reduce_bucket_size=2e8,
        # allgather_bucket_size="auto",
        # sub_group_size="auto",
    )
    .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)
    .build()
)

In [8]:
trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=RunConfig(
        name="vicuna-13b-finetune", 
        storage_path="s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test",
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            _checkpoint_keep_all_ranks=True,
            _checkpoint_upload_from_workers=True
        ),
    ),
    scaling_config=ScalingConfig(num_workers=NUM_WORKERS, use_gpu=True, resources_per_worker={"CPU": 15, "GPU": 1}),
    datasets={"train": ray_dataset},
    datasets_iter_config={"batch_size": BATCH_SIZE_PER_WORKER},
    preprocessor=preprocessor,
)



In [9]:
trainer.fit()

0,1
Current time:,2023-06-24 00:59:43
Running for:,01:20:32.10
Memory:,4.3/62.1 GiB

Trial name,status,loc,iter,total time (s),train_loss,epoch,step
LightningTrainer_d3e90_00000,TERMINATED,10.0.84.83:5998,1,4757.51,1.46094,0,62


[2m[36m(pid=5998, ip=10.0.84.83)[0m [2023-06-23 23:39:15,728] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2m[36m(download_vicuna_13b pid=42002, ip=10.0.121.227)[0m 15: Finished[32m [repeated 15x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m [33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.
[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m 
[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m
[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m Starting distributed worker processes: ['6053 (10.0.84.83)', '6364 (10.0.95.215)', '6074 (10.0.79.156)', 

(pid=5998, ip=10.0.84.83) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=5998, ip=10.0.84.83) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=6074, ip=10.0.79.156)[0m [2023-06-23 23:39:26,513] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/16
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m Missing logger folder: /home/ray/ray_results/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11/rank_all/lightning_logs


[2m[36m(RayTrainWorker pid=5975, ip=10.0.66.239)[0m Config : {'zero_allow_untested_optimizer': True, 'bf16': {'enabled': True}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'reduce_bucket_size': 26214400, 'stage3_prefetch_bucket_size': 23592960.0, 'stage3_param_persistence_threshold': 51200}, 'gradient_accumulation_steps': 2, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 0.0}


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
[2m[36m(RayTrainWorker pid=5975, ip=10.0.66.239)[0m initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/16[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=5975, ip=10.0.66.239)[0m Missing logger folder: /home/ray/ray_results/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11/rank_all/lightning_logs[32m [repeated 15x across cluster][0m
Loading checkpoint shards:  33%|███▎      | 1/3 [00:09<00:18,  9.23s/it]
Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][32m [repeated 15x across cluster][0m
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:23<00:11, 11.95s/it][32m [repeated 16x across cluster][0m
Loading checkpoint shards: 100%|██████████| 3/3 [00:31<00:00, 10.65s/it]
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:28<00:14, 14.13s/it][32m [repeated 15x across cluster][0m


[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m LlamaForCausalLM(
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m   (model): LlamaModel(
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m     (embed_tokens): Embedding(32000, 5120, padding_idx=0)
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m     (layers): ModuleList(
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m       (0-39): 40 x LlamaDecoderLayer(
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m         (self_attn): LlamaAttention(
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m           (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m           (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m           (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m           (o_proj): Li

[2m[36m(RayTrainWorker pid=5941, ip=10.0.117.132)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading checkpoint shards: 100%|██████████| 3/3 [00:32<00:00, 10.68s/it][32m [repeated 14x across cluster][0m


[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m LlamaForCausalLM([32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m   (model): LlamaModel([32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m     (embed_tokens): Embedding(32000, 5120, padding_idx=0)[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m     (layers): ModuleList([32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m       (0-39): 40 x LlamaDecoderLayer([32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m         (self_attn): LlamaAttention([32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m           (q_proj): Linear(in_features=5120, out_features=5120, bias=False)[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m           (k_proj):

[2m[36m(RayTrainWorker pid=6074, ip=10.0.79.156)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m Detected CUDA files, patching ldflags
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m Emitting ninja build file /home/ray/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m Building extension module cpu_adam...
[2m[36m(RayTrainWorker pid=6087, ip=10.0.72.137)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m Loading extension module cpu_adam...


[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m ninja: no work to do.
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m Time to load cpu_adam op: 2.373521566390991 seconds


[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 15x across cluster][0m
Loading checkpoint shards: 100%|██████████| 3/3 [00:38<00:00, 12.86s/it]
[2m[36m(RayTrainWorker pid=5966, ip=10.0.90.111)[0m Building extension module utils...
[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m Loading extension module utils...


[2m[36m(RayTrainWorker pid=5970, ip=10.0.87.254)[0m Time to load utils op: 0.0738983154296875 seconds
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Parameter Offload: Total persistent parameters: 414720 in 81 params
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m ninja: no work to do.[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWorker pid=48011, ip=10.0.121.227)[0m Time to load cpu_adam op: 2.3655214309692383 seconds[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=17232)[0m Time to load utils op: 0.0006968975067138672 seconds[32m [repeated 16x across cluster][0m


[2m[36m(RayTrainWorker pid=17232)[0m No modifications detected for re-loaded extension module utils, skipping build step...
[2m[36m(RayTrainWorker pid=17232)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...[32m [repeated 32x across cluster][0m
[2m[36m(RayTrainWorker pid=48011, ip=10.0.121.227)[0m Detected CUDA files, patching ldflags[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Emitting ninja build file /home/ray/.cache/torch_extensions/py310_cu118/utils/build.ninja...[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWorker pid=48011, ip=10.0.121.227)[0m Building extension module cpu_adam...[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWorker pid=48011, ip=10.0.121.227

Epoch 0:   0%|          | 0/125 [00:00<?, ?it/s])[0m 
Epoch 0:   1%|          | 1/125 [00:38<1:18:32, 38.00s/it, v_num=0, train_loss=4.120]
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Time to load utils op: 0.00030040740966796875 seconds[32m [repeated 15x across cluster][0m
Epoch 0:   2%|▏         | 2/125 [01:15<1:17:29, 37.80s/it, v_num=0, train_loss=4.160]
Epoch 0:   2%|▏         | 3/125 [01:49<1:14:21, 36.57s/it, v_num=0, train_loss=3.750]
Epoch 0:   3%|▎         | 4/125 [02:33<1:17:35, 38.47s/it, v_num=0, train_loss=3.690]
Epoch 0:   4%|▍         | 5/125 [03:08<1:15:35, 37.80s/it, v_num=0, train_loss=2.300]
Epoch 0:   5%|▍         | 6/125 [03:48<1:15:30, 38.07s/it, v_num=0, train_loss=2.480]
Epoch 0:   6%|▌         | 7/125 [04:22<1:13:44, 37.50s/it, v_num=0, train_loss=2.300]
Epoch 0:   6%|▋         | 8/125 [05:03<1:14:02, 37.97s/it, v_num=0, train_loss=2.220]
Epoch 0:   7%|▋         | 9/125 [05:37<1:12:27, 37.48s/it, v_num=0, train_loss=1.980]
Epoch 0:   8%|▊         

[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m No modifications detected for re-loaded extension module utils, skipping build step...[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Loading extension module utils...[32m [repeated 15x across cluster][0m


Epoch 0: 100%|██████████| 125/125 [1:16:55<00:00, 36.92s/it, v_num=0, train_loss=1.460]


[2m[36m(RayTrainWorker pid=6110, ip=10.0.120.86)[0m Uploading checkpoint files from worker rank 3 to cloud URI s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11/checkpoint_000000.
[2m[36m(RayTrainWorker pid=5845, ip=10.0.64.104)[0m Done uploading checkpoint files.
[2m[36m(RayTrainWorker pid=6364, ip=10.0.95.215)[0m Uploading checkpoint files from worker rank 1 to cloud URI s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11/checkpoint_000000.[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m `Trainer.fit` stopped: `max_epochs=1` reached.
[2m[36m(RayTrainWorker pid=6053, ip=10.0.84.83)[0m Done uploading checkpoint files.[32m [repeated 15x across cluster][0m


Epoch 0: 100%|██████████| 125/125 [1:17:48<00:00, 37.35s/it, v_num=0, train_loss=1.460]


[2m[36m(LightningTrainer pid=5998, ip=10.0.84.83)[0m Uploading trial artifacts took 22.242 s, which may be a performance bottleneck. Consider saving fewer/smaller artifacts to the trial log directory, or disable artifact syncing with `SyncConfig(sync_artifacts=False)`.
2023-06-24 00:59:43,285	INFO tune.py:1148 -- Total run time: 4832.17 seconds (4789.95 seconds for the tuning loop).


Result(
  metrics={'_report_on': 'train_epoch_end', 'train_loss': 1.4609375, 'epoch': 0, 'step': 62, 'should_checkpoint': True, 'done': True, 'trial_id': 'd3e90_00000', 'experiment_tag': '0'},
  path='s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11',
  checkpoint=LightningCheckpoint(uri=s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-finetune/LightningTrainer_d3e90_00000_0_2023-06-23_23-39-11/checkpoint_000000)
)