In [2]:
    !pip install accelerate
    !pip install -i https://pypi.org/simple/ bitsandbytes
    !pip install peft transformers trl datasets
    !pip install deepspeed

Looking in indexes: https://pypi.org/simple/
Collecting deepspeed
  Downloading deepspeed-0.14.2.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (from deepspeed)
  Downloading ninja-1.11.1.1-py2.py3-none-macosx_10_9_universal2.macosx_10_9_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl.metadata (5.3 kB)
Collecting pynvml (from deepspeed)
  Downloading pynvml-11.5.0-py3-none-any.whl.metadata (7.8 kB)
Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-macosx_10_9_universal2.macosx_10_9_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl (270 kB)
[2K   

In [None]:
!pip install flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.5.8.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.5.8-cp310-cp310-linux_x86_64.whl size=120853537 sha256=53979129f883680327bf5d13027cd014e2d054f4fb5b8856916686ae315e57d6
  Stored in directory: /root/.cache/pip/wheels/9b/5b/2b/dea8af4e954161c49ef1941938afcd91bb93689371ed12a226
Successfully built flash-attn
Installing collected packages: einops, flash-attn
Successfully installed einops-0.8.0 flash-attn-2.5.8


In [4]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

"""
A simple example on using SFTTrainer and Accelerate to finetune Phi-3 models. For
a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
This example has utilized DeepSpeed ZeRO3 offload to reduce the memory usage. The
script can be run on V100 or later generation GPUs. Here are some suggestions on
further reducing memory consumption:
    - reduce batch size
    - decrease lora dimension
    - restrict lora target modules
Please follow these steps to run the script:
1. Install dependencies:
    conda install -c conda-forge accelerate
    pip3 install -i https://pypi.org/simple/ bitsandbytes
    pip3 install peft transformers trl datasets
    pip3 install deepspeed
2. Setup accelerate and deepspeed config based on the machine used:
    accelerate config
Here is a sample config for deepspeed zero3:
    compute_environment: LOCAL_MACHINE
    debug: false
    deepspeed_config:
      gradient_accumulation_steps: 1
      offload_optimizer_device: none
      offload_param_device: none
      zero3_init_flag: true
      zero3_save_16bit_model: true
      zero_stage: 3
    distributed_type: DEEPSPEED
    downcast_bf16: 'no'
    enable_cpu_affinity: false
    machine_rank: 0
    main_training_function: main
    mixed_precision: bf16
    num_machines: 1
    num_processes: 4
    rdzv_backend: static
    same_network: true
    tpu_env: []
    tpu_use_cluster: false
    tpu_use_sudo: false
    use_cpu: false
3. check accelerate config:
    accelerate env
4. Run the code:
    accelerate launch sample_finetune.py
"""

# Logger for this notebook; its level is set once the training arguments exist.
logger = logging.getLogger(__name__)


###################
# Hyper-parameters
###################
# Keyword arguments that are unpacked into `TrainingArguments`.
training_config = dict(
    bf16=False,
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=20,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
)

# Keyword arguments that are unpacked into `LoraConfig`.
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)



  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


ImportError: cannot import name 'top_k_top_p_filtering' from 'transformers' (/opt/anaconda3/lib/python3.11/site-packages/transformers/__init__.py)

In [None]:
# Turn the plain hyper-parameter dictionaries into their config objects.
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

###############
# Setup logging
###############
# Root logger writes timestamped records to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Apply the log level chosen by the training arguments to this notebook's
# logger and to the `datasets` / `transformers` loggers.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
hf_logging = transformers.utils.logging
hf_logging.set_verbosity(log_level)
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    f" distributed training: {(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")


################
# Model Loading
################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

# Keyword arguments forwarded to `AutoModelForCausalLM.from_pretrained`.
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    "attn_implementation": "flash_attention_2",  # load the model with flash-attention support
    "torch_dtype": torch.bfloat16,
    "device_map": None,
}

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params':

In [None]:

# Load the causal-LM weights from `checkpoint_path`, forwarding the options in `model_kwargs`.
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# Same 2048 limit that is passed to SFTTrainer as `max_seq_length`.
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
# Set the pad token id from the pad token string assigned on the previous line.
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Right padding is set here; the evaluation section later switches it to 'left'.
tokenizer.padding_side = 'right'



[INFO|configuration_utils.py:726] 2024-05-10 06:27:59,827 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/config.json
[INFO|configuration_utils.py:726] 2024-05-10 06:27:59,880 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/config.json
[INFO|configuration_utils.py:789] 2024-05-10 06:27:59,887 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "si

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-10 06:28:36,600 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-10 06:28:36,607 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

[INFO|configuration_utils.py:883] 2024-05-10 06:28:36,734 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/generation_config.json
[INFO|configuration_utils.py:928] 2024-05-10 06:28:36,735 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2087] 2024-05-10 06:28:37,350 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/tokenizer.model
[INFO|tokenization_utils_base.py:2087] 2024-05-10 06:28:37,353 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/tokenizer.json
[INFO|tokenization_utils_base.py:2087] 2024-05-10 06:28:37,356 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/added_tokens.json
[INFO|tokenization_utils_base.py:2087] 2024-05-10 06:28:37,359 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/920b6cf52a79ecff578cc33f61922b23cbc88115/special_tokens_map

In [None]:

##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one conversation into a single training string.

    Args:
        example: Mapping with a ``"messages"`` entry holding a list of
            ``{"role": ..., "content": ...}`` dictionaries.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        The same ``example`` mapping with an added ``"text"`` entry containing
        the rendered conversation.
    """
    # Work on a shallow copy so the caller's message list is not modified.
    messages = list(example["messages"])
    # Add an empty system message if there is none (this also covers an empty
    # conversation, which would otherwise raise IndexError on messages[0]).
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

# Load the dataset and pick the two SFT splits used for training and evaluation.
raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
column_names = list(train_dataset.features)

# Options common to both `map` calls: pass the tokenizer to
# `apply_chat_template` and drop the original columns.
shared_map_kwargs = {
    "fn_kwargs": {"tokenizer": tokenizer},
    "num_proc": 10,
    "remove_columns": column_names,
}

processed_train_dataset = train_dataset.map(
    apply_chat_template,
    desc="Applying chat template to train_sft",
    **shared_map_kwargs,
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    desc="Applying chat template to test_sft",
    **shared_map_kwargs,
)


###########
# Training
###########
# Fine-tune with the LoRA config on the rendered "text" column, packing sequences.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=True,
)

train_result = trainer.train()
metrics = train_result.metrics
# Report the training metrics, then persist them and the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


#############
# Evaluation
#############
tokenizer.padding_side = "left"
# Run evaluation and record how many test examples were available.
metrics = {**trainer.evaluate(), "eval_samples": len(processed_test_dataset)}
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


#############
# Save model
#############
trainer.save_model(train_conf.output_dir)

https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72.incomplete
INFO:datasets.utils.file_utils:https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72.incomplete


Downloading readme:   0%|          | 0.00/4.44k [00:00<?, ?B/s]

storing https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md in cache at /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
INFO:datasets.utils.file_utils:storing https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md in cache at /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
creating metadata file for /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783a

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00000-of-00003-a3ecf92756993583.parquet in cache at /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00000-of-00003-a3ecf92756993583.parquet in cache at /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
creating metadata file for /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
creating metadata file for /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
creating metadata file for /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.

Downloading data:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.parquet in cache at /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.parquet in cache at /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
creating metadata file for /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.p

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet in cache at /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet in cache at /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
creating metadata file for /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca.parquet in cache at /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca.parquet in cache at /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
creating metadata file for /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd.parquet in cache at /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd.parquet in cache at /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
creating metadata file for /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.

Downloading data:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.parquet in cache at /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.parquet in cache at /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
creating metadata file for /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
Downloading took 0.0 min
INFO:datasets.download.download_manager:Downloading took 0.0 min
Checksum Computation took 0.0 min
INFO:dat

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split
INFO:datasets.builder:Generating test_sft split


Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split
INFO:datasets.builder:Generating train_gen split


Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

KeyboardInterrupt: 