In [1]:
NUM_WORKERS = 16
BATCH_SIZE_PER_WORKER = 8
MODEL_NAME = "/tmp/vicuna-13b"


In [2]:
import ray
import os

@ray.remote(num_gpus=1)
def download_vicuna_13b(rank):
    if not os.path.exists(MODEL_NAME):
        print(f"Rank {rank}: Downloading vicuna model...")
        os.system(
            f"aws s3 sync s3://large-dl-models-mirror/restricted/models--lmsys--vicuna-13b-delta-v1.1/main-safetensors/ {MODEL_NAME} >NUL 2>&1"
        )
    print(f"{rank}: Download finished!")


tasks = [download_vicuna_13b.remote(i) for i in range(NUM_WORKERS)]
ray.get(tasks)


2023-06-27 17:19:54,231	INFO worker.py:1426 -- Connecting to existing Ray cluster at address: 10.0.11.88:6379...
2023-06-27 17:19:54,348	INFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-vzyh3916u4zwmf1es6fazmbrgm.i.anyscaleuserdata-staging.com [39m[22m
2023-06-27 17:19:54,350	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_78cceba9b6954f107800abddcdb68b76.zip' (0.15MiB) to Ray cluster...
2023-06-27 17:19:54,351	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_78cceba9b6954f107800abddcdb68b76.zip'.


[2m[36m(download_vicuna_13b pid=213755)[0m 0: Download finished!


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [3]:
RELATION_TEMPLATE = {
    0: "[0]Cause-Effect({e1},{e2})",
    1: "[1]Cause-Effect({e2},{e1})",
    2: "[2]Component-Whole({e1},{e2})",
    3: "[3]Component-Whole({e2},{e1})",
    4: "[4]Content-Container({e1},{e2})",
    5: "[5]Content-Container({e2},{e1})",
    6: "[6]Entity-Destination({e1},{e2})",
    7: "[7]Entity-Destination({e2},{e1})",
    8: "[8]Entity-Origin({e1},{e2})",
    9: "[9]Entity-Origin({e2},{e1})",
    10: "[10]Instrument-Agency({e1},{e2})",
    11: "[11]Instrument-Agency({e2},{e1})",
    12: "[12]Member-Collection({e1},{e2})",
    13: "[13]Member-Collection({e2},{e1})",
    14: "[14]Message-Topic({e1},{e2})",
    15: "[15]Message-Topic({e2},{e1})",
    16: "[16]Product-Producer({e1},{e2})",
    17: "[17]Product-Producer({e2},{e1})",
    18: "[18]Unknown({e1},{e2})",
}

PROMPT_TEMPLATE = "{sentence}\nIn the above sentence, the relationship between the two tagged entities is: {relation}"


In [4]:
import re
import json
from datasets import load_dataset
from transformers import AutoTokenizer
from ray.data.preprocessors import BatchMapper, Chain

hf_dataset = load_dataset("sem_eval_2010_task_8")
hf_dataset["val"] = hf_dataset["test"].rename_column("sentence", "test_sentence")
ray_dataset = {
    "train": ray.data.from_huggingface(hf_dataset["train"]),
    "val": ray.data.from_huggingface(hf_dataset["val"]),
}


def fill_prompt(batch):
    # Format train data
    if "sentence" in batch:
        # Extract two tagged entities
        batch["e1"] = batch["sentence"].apply(
            lambda x: re.search(r"<e1>(.*?)</e1>", x).group(1)
        )
        batch["e2"] = batch["sentence"].apply(
            lambda x: re.search(r"<e2>(.*?)</e2>", x).group(1)
        )
        batch["input_sentence"] = batch.apply(
            lambda row: PROMPT_TEMPLATE.format(
                sentence=row["sentence"],
                relation=RELATION_TEMPLATE[row["relation"]].format(
                    e1=row["e1"], e2=row["e2"]
                ),
            )
            + "</s>",
            axis=1,
        )
    # Format test data
    else:
        batch["input_sentence"] = batch.apply(
            lambda row: PROMPT_TEMPLATE.format(
                sentence=row["test_sentence"], relation=""
            ),
            axis=1,
        )
    return batch[["input_sentence", "relation"]]


def tokenize(batch):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["input_sentence"]),
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    ret["relation"] = batch["relation"]
    return dict(ret)


prompt_mapper = BatchMapper(fill_prompt, batch_format="pandas")
tokenize_mapper = BatchMapper(tokenize, batch_format="pandas")
preprocessor = Chain(prompt_mapper, tokenize_mapper)


Found cached dataset sem_eval_2010_task_8 (/home/ray/.cache/huggingface/datasets/sem_eval_2010_task_8/default/1.0.0/8545d1995bbbade386acf5c4e2bef5589d8387ae0a93356407dfb54cdb234416)


  0%|          | 0/2 [00:00<?, ?it/s]


Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


In [5]:
import torch
import transformers
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModelForCausalLM
from deepspeed.ops.adam import DeepSpeedCPUAdam


class ZeRO3Config:
    def __init__(self, pl_module):
        self.config = pl_module.trainer.strategy.config

    def __call__(self, *args, **kwargs):
        return self

    def is_zero3(self) -> bool:
        return True


def enable_transformers_pretrained_deepspeed_sharding(
    pl_module: "pl.LightningModule",
) -> None:
    transformers.deepspeed._hf_deepspeed_config_weak_ref = ZeRO3Config(pl_module)


class Vicuna13BModel(pl.LightningModule):
    def __init__(self, inference=False):
        super().__init__()
        torch.backends.cuda.matmul.allow_tf32 = True
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
        if inference:
            with init_empty_weights():
                self.model_config = AutoConfig.from_pretrained(
                    MODEL_NAME, trust_remote_code=True
                )
                self.model = AutoModelForCausalLM.from_config(
                    self.model_config, trust_remote_code=True
                )
            self.model.tie_weights()

        self.predictions = []
        self.corrects = []

    def setup(self, stage) -> None:
        if not hasattr(self, "model"):
            enable_transformers_pretrained_deepspeed_sharding(self)
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME, trust_remote_code=True
            )
        if self.global_rank == 0:
            print("DeepSpeed Configs: ", self.trainer.strategy.config)
            print("Model Archetecture: ", self.model)

    def forward(self, batch):
        outputs = self.model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        return outputs.loss

    def training_step(self, batch, batch_idx):
        torch.cuda.empty_cache()
        loss = self.forward(batch)
        self.log("train_loss", loss, prog_bar=True, on_step=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        output_tokens = self.model.generate(batch["input_ids"], max_new_tokens=16)
        output_sents = self.tokenizer.batch_decode(
            output_tokens, skip_special_tokens=True
        )

        relations = batch["relation"]
        for rid, sent in zip(relations, output_sents):
            correct = f"[{int(rid)}]" in sent
            self.predictions.append({"output": sent, "correct": correct})
            self.corrects.append(correct)

    def on_validation_epoch_end(self):
        # Dump predictions
        with open(f"/tmp/predictions.json", "w") as fout:
            for prediction in self.predictions:
                fout.write(json.dumps(prediction) + "\n")

        # Report aggregated metrics
        self.log("val_acc", sum(self.corrects) / len(self.corrects), sync_dist=True)
        self.corrects.clear()
        self.predictions.clear()

    def configure_optimizers(self):
        return DeepSpeedCPUAdam(self.parameters(), lr=1e-5)


[2023-06-27 17:20:05,117] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [6]:
from pytorch_lightning.callbacks import TQDMProgressBar


# Create a customized progress bar for LightningTrainer
class FalconProgressBar(TQDMProgressBar):
    def __init__(self, num_iters_per_epoch, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_iters_per_epoch = num_iters_per_epoch

    def on_train_epoch_start(self, trainer, *_):
        super().on_train_epoch_start(trainer, *_)
        self.train_progress_bar.reset(self.num_iters_per_epoch)


total_batches = ray_dataset["train"].count()
num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)
progress_bar = FalconProgressBar(num_iters_per_epoch)


In [7]:
from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from transformers import AutoConfig

config = AutoConfig.from_pretrained(MODEL_NAME)
HIDDEN_SIZE = config.hidden_size

# We are using default values from huggingface
deepspeed_configs = {
    "zero_allow_untested_optimizer": True,
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": HIDDEN_SIZE * HIDDEN_SIZE,
        "stage3_prefetch_bucket_size": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,
        "stage3_param_persistence_threshold": 10 * HIDDEN_SIZE,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
}

lightning_config = (
    LightningConfigBuilder()
    .module(cls=Vicuna13BModel)
    .trainer(
        max_epochs=1,
        accelerator="gpu",
        precision="bf16-mixed",
        callbacks=[progress_bar],
        accumulate_grad_batches=2,
        limit_val_batches=1,
        num_sanity_val_steps=0,
    )
    .strategy(name="deepspeed", config=deepspeed_configs)
    .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)
    .build()
)


In [8]:
from ray.air.config import CheckpointConfig, RunConfig, ScalingConfig

trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=RunConfig(
        name="vicuna-13b-relation-extraction",
        storage_path="s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test",
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            _checkpoint_keep_all_ranks=True,
            _checkpoint_upload_from_workers=True,
        ),
    ),
    scaling_config=ScalingConfig(
        num_workers=NUM_WORKERS,
        use_gpu=True,
        resources_per_worker={"CPU": 15, "GPU": 1},
    ),
    datasets=ray_dataset,
    datasets_iter_config={"batch_size": BATCH_SIZE_PER_WORKER},
    preprocessor=preprocessor,
)




In [9]:
result = trainer.fit()


0,1
Current time:,2023-06-27 18:09:19
Running for:,00:49:13.10
Memory:,4.4/62.1 GiB

Trial name,status,loc,iter,total time (s),train_loss,val_acc,epoch
LightningTrainer_88935_00000,TERMINATED,10.0.6.248:98259,1,2708.96,0.691406,0,0


[2m[36m(pid=98259, ip=10.0.6.248)[0m [2023-06-27 17:20:12,362] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2m[36m(download_vicuna_13b pid=99058, ip=10.0.25.123)[0m 13: Download finished![32m [repeated 15x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m [33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m 
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Starting distributed worker processes: ['98320 (10.0.6.248)', '98862 (10.0.20.190)', '98215 (10.0.40

(pid=98259, ip=10.0.6.248) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98259, ip=10.0.6.248) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[2m[36m(RayTrainWorker pid=98862, ip=10.0.20.190)[0m [2023-06-27 17:20:23,557] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m `Trainer(limit_val_batches=1)` was configured so 1 batch will be used.
[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m initializing deepspeed distributed: GLOBAL_RANK: 13, MEMBER: 14/16
[2m[36m(RayTrainWorker pid=98990, ip=10.0.41.124)[0m Missing logger folder: /home/ray/ray_results/vicuna-13b-relation-extraction/LightningTrainer_88935_00000_0_2023-06-27_17-20-06/rank_all/lightning_logs




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
[2m[36m(RayTrainWorker pid=98999, ip=10.0.49.254)[0m initializing deepspeed distributed: GLOBAL_RANK: 12, MEMBER: 13/16[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98679, ip=10.0.48.92)[0m Missing logger folder: /home/ray/ray_results/vicuna-13b-relation-extraction/LightningTrainer_88935_00000_0_2023-06-27_17-20-06/rank_all/lightning_logs[32m [repeated 15x across cluster][0m
Loading checkpoint shards:  33%|███▎      | 1/3 [00:26<00:53, 26.60s/it]
Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][32m [repeated 15x across cluster][0m
Loading checkpoint shards:  33%|███▎      | 1/3 [00:44<01:28, 44.08s/it][32m [repeated 15x across cluster][0m
Loading checkpoint shards:  67%|██████▋   | 2/3 [01:09<00:36, 36.08s/it]
Loading checkpoint shards:  67%|██████▋   | 2/3 [01:09<00:36, 36.10s/it]
Loading checkpoint shards:  67%|██████▋   | 2/3 [01:28<00:44, 44.07s/it][32m [repeated 14x a

[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m DeepSpeed Configs:  {'zero_allow_untested_optimizer': True, 'bf16': {'enabled': True}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'reduce_bucket_size': 26214400, 'stage3_prefetch_bucket_size': 23592960.0, 'stage3_param_persistence_threshold': 51200, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 2, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 0.0}
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Model Archetecture:  LlamaForCausalLM(
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m   (model): LlamaModel(
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m     (embed_tokens): Embedding(32000, 5120, padding_idx=0)
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m     (layers): ModuleList(
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m       (0-39): 40 x Lla

[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=98999, ip=10.0.49.254)[0m Detected CUDA files, patching ldflags
[2m[36m(RayTrainWorker pid=98999, ip=10.0.49.254)[0m Emitting ninja build file /home/ray/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
[2m[36m(RayTrainWorker pid=98999, ip=10.0.49.254)[0m Building extension module cpu_adam...
[2m[36m(RayTrainWorker pid=98999, ip=10.0.49.254)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m Loading extension module cpu_adam...
[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0][32m [repeated 15x across cluster][0m
Loading checkpoint shards: 100%|██████████| 3/3 [01:55<00:00, 38.52s/it]


[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m ninja: no work to do.
[2m[36m(RayTrainWorker pid=98856, ip=10.0.56.182)[0m Time to load cpu_adam op: 2.3617641925811768 seconds


[2m[36m(RayTrainWorker pid=97932, ip=10.0.43.220)[0m Building extension module utils...
[2m[36m(RayTrainWorker pid=98862, ip=10.0.20.190)[0m Loading extension module utils...


[2m[36m(RayTrainWorker pid=98862, ip=10.0.20.190)[0m Time to load utils op: 0.07697224617004395 seconds
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Parameter Offload: Total persistent parameters: 414720 in 81 params
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m ninja: no work to do.[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWorker pid=98250, ip=10.0.19.249)[0m Time to load cpu_adam op: 2.3788340091705322 seconds[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98679, ip=10.0.48.92)[0m Time to load utils op: 0.0003402233123779297 seconds[32m [repeated 16x across cluster][0m


[2m[36m(RayTrainWorker pid=98679, ip=10.0.48.92)[0m No modifications detected for re-loaded extension module utils, skipping build step...
[2m[36m(RayTrainWorker pid=98679, ip=10.0.48.92)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...[32m [repeated 32x across cluster][0m
[2m[36m(RayTrainWorker pid=98862, ip=10.0.20.190)[0m Detected CUDA files, patching ldflags[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Emitting ninja build file /home/ray/.cache/torch_extensions/py310_cu118/utils/build.ninja...[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWorker pid=98862, ip=10.0.20.190)[0m Building extension module cpu_adam...[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)[32m [repeated 31x across cluster][0m
[2m[36m(RayTrainWor

Epoch 0:   0%|          | 0/62 [00:00<?, ?it/s]48)[0m 


[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m   rank_zero_warn(
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m   rank_zero_warn(


Epoch 0:   2%|▏         | 1/62 [00:40<41:00, 40.34s/it, v_num=0, train_loss=4.560]
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Time to load utils op: 0.0008089542388916016 seconds[32m [repeated 15x across cluster][0m
Epoch 0:   3%|▎         | 2/62 [01:20<40:14, 40.23s/it, v_num=0, train_loss=4.590]
Epoch 0:   5%|▍         | 3/62 [01:56<38:01, 38.67s/it, v_num=0, train_loss=3.940]
Epoch 0:   6%|▋         | 4/62 [02:37<38:09, 39.47s/it, v_num=0, train_loss=3.840]
Epoch 0:   8%|▊         | 5/62 [03:12<36:33, 38.48s/it, v_num=0, train_loss=3.810]
Epoch 0:  10%|▉         | 6/62 [03:52<36:10, 38.76s/it, v_num=0, train_loss=3.410]
Epoch 0:  11%|█▏        | 7/62 [04:27<35:01, 38.20s/it, v_num=0, train_loss=1.560]
Epoch 0:  13%|█▎        | 8/62 [05:07<34:32, 38.38s/it, v_num=0, train_loss=1.570]
Epoch 0:  15%|█▍        | 9/62 [05:43<33:42, 38.15s/it, v_num=0, train_loss=1.200]
Epoch 0:  16%|█▌        | 10/62 [06:23<33:15, 38.38s/it, v_num=0, train_loss=1.200]
Epoch 0:  18%|█▊      

[2m[36m(RayTrainWorker pid=214121)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=214121)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=214121)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m No modifications detected for re-loaded extension module utils, skipping build step...[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m Using /home/ray/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...[32m [repeated 15x across clust

(pid=98788, ip=10.0.61.179) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98788, ip=10.0.61.179) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]



(pid=98320, ip=10.0.6.248) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98320, ip=10.0.6.248) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|      …

(pid=98990, ip=10.0.41.124) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98990, ip=10.0.41.124) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98856, ip=10.0.56.182) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98856, ip=10.0.56.182) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99151, ip=10.0.25.123) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99151, ip=10.0.25.123) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99256, ip=10.0.3.211) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99256, ip=10.0.3.211) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98999, ip=10.0.49.254) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98999, ip=10.0.49.254) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98215, ip=10.0.40.238) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98215, ip=10.0.40.238) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98197, ip=10.0.20.181) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98197, ip=10.0.20.181) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98679, ip=10.0.48.92) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98679, ip=10.0.48.92) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98250, ip=10.0.19.249) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98250, ip=10.0.19.249) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99040, ip=10.0.16.85) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=99040, ip=10.0.16.85) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98862, ip=10.0.20.190) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98862, ip=10.0.20.190) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98834, ip=10.0.24.99) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=98834, ip=10.0.24.99) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=214121) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=214121) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=97932, ip=10.0.43.220) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=97932, ip=10.0.43.220) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m 
Validation: 0it [00:00, ?it/s][A0, ip=10.0.6.248)[0m 
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A0m 
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A


[2m[36m(RayTrainWorker pid=98834, ip=10.0.24.99)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder][32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98834, ip=10.0.24.99)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)[32m [repeated 15x across cluster][0m
[2m[36m(RayTrainWorker pid=98834, ip=10.0.24.99)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`[32m [repeated 15x across cluster][0m


[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m 
Epoch 0: : 63it [40:38, 38.71s/it, v_num=0, train_loss=0.691]5.66s/it][A
Epoch 0: : 63it [40:38, 38.71s/it, v_num=0, train_loss=0.691]         [A


[2m[36m(RayTrainWorker pid=99256, ip=10.0.3.211)[0m Uploading checkpoint files from worker rank 5 to cloud URI s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-relation-extraction/LightningTrainer_88935_00000_0_2023-06-27_17-20-06/checkpoint_000000.
[2m[36m(RayTrainWorker pid=97932, ip=10.0.43.220)[0m Done uploading checkpoint files.
[2m[36m(RayTrainWorker pid=214121)[0m Uploading checkpoint files from worker rank 11 to cloud URI s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-relation-extraction/LightningTrainer_88935_00000_0_2023-06-27_17-20-06/checkpoint_000000.[32m [repeated 14x across cluster][0m
[2m[36m(RayTrainWorker pid=99256, ip=10.0.3.211)[0m Done uploading checkpoint files.[32m [repeated 13x across cluster][0m
[2m[36m(RayTrainWorker pid=99151, ip=10.0.25.123)[0m Uploading checkpoint files from worker rank 15 to cloud URI s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjx

Epoch 0: : 63it [42:19, 40.30s/it, v_num=0, train_loss=0.691]


[2m[36m(RayTrainWorker pid=98320, ip=10.0.6.248)[0m `Trainer.fit` stopped: `max_epochs=1` reached.
[2m[36m(LightningTrainer pid=98259, ip=10.0.6.248)[0m Uploading trial artifacts took 23.895 s, which may be a performance bottleneck. Consider saving fewer/smaller artifacts to the trial log directory, or disable artifact syncing with `SyncConfig(sync_artifacts=False)`.
2023-06-27 18:09:19,438	INFO tune.py:1148 -- Total run time: 2953.18 seconds (2812.50 seconds for the tuning loop).


In [13]:
!cat /tmp/prediction_5.json

{"output": "The most common <e1>audits</e1> were about <e2>waste</e2> and recycling.\nIn the above sentence, the relationship between the two tagged entities is: ", "correct": false}
{"output": "The <e1>company</e1> fabricates plastic <e2>chairs</e2>.\nIn the above sentence, the relationship between the two tagged entities is: ", "correct": false}
{"output": "The school <e1>master</e1> teaches the lesson with a <e2>stick</e2>.\nIn the above sentence, the relationship between the two tagged entities is: ", "correct": false}
{"output": "The suspect dumped the dead <e1>body</e1> into a local <e2>reservoir</e2>.\nIn the above sentence, the relationship between the two tagged entities is: ", "correct": false}
{"output": "Avian <e1>influenza</e1> is an infectious disease of birds caused by type A strains of the influenza <e2>virus</e2>.\nIn the above sentence, the relationship between the two tagged entities is: ", "correct": false}
{"output": "The <e1>ear</e1> of the African <e2>elephant</e

In [11]:
# from ray.train.lightning import LightningCheckpoint

# ckpt = LightningCheckpoint.from_uri("s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-relation-extraction/LightningTrainer_22972_00000_0_2023-06-26_16-06-51/checkpoint_000000")


In [12]:
# !aws s3 sync s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/yunxuanx-test/vicuna-test/vicuna-13b-relation-extraction/LightningTrainer_22972_00000_0_2023-06-26_16-06-51/checkpoint_000000 /tmp/vicuna-re-ckpt
