ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/constitutional-ai/sft/config_{grok,anthropic}.yaml


In [10]:
import logging
import random
import sys

import datasets
import torch
import transformers
from transformers import AutoModelForCausalLM, set_seed

from alignment.configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig
from alignment.data import apply_chat_template, get_datasets
from alignment.decontaminate import decontaminate_humaneval
from alignment.model_utils import (
    get_checkpoint,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
    get_tokenizer,
    is_adapter_model,
)

from trl import SFTTrainer, setup_chat_format

logger = logging.getLogger(__name__)


In [26]:
# Stand in for the command line: the first entry is a placeholder program
# name, the second is the YAML recipe to load.
# NOTE(review): presumably H4ArgumentParser.parse() reads sys.argv - confirm.
sys.argv = ["notebook", "configs/sft_config.yaml"]

# One parser yields the three argument groups used by the rest of the notebook.
parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
model_args, data_args, training_args = parser.parse()

# Seed the RNGs up front so repeated runs are reproducible.
set_seed(training_args.seed)

[INFO|training_args.py:2169] 2024-11-17 21:41:10,513 >> PyTorch: setting up devices


In [16]:
# Display the parsed model arguments for inspection.
model_args

ModelArguments(base_model_revision=None, model_name_or_path='mistralai/Mistral-7B-Instruct-v0.1', model_revision='main', model_code_revision=None, torch_dtype='bfloat16', tokenizer_name_or_path=None, trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bnb_4bit_quant_storage='uint8')

In [7]:
# Display the parsed data arguments (chat template, dataset mixer, splits).
data_args

DataArguments(chat_template="{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", dataset_mixer={'HuggingFaceH4/cai-conversation-harmless': 1.0, 'HuggingFaceH4/ultrachat_200k': 1.0}, text_column='text', dataset_splits=['train_sft', 'test_sft'], dataset_configs=None, preprocessing_num_workers=12, truncation_side=None, auto_insert_empty_system_msg=True)

In [8]:
# Display the parsed training arguments for inspection.
training_args



In [11]:
###############
# Setup logging
###############
# Root configuration: timestamped records written to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Apply the level reported by the training arguments to this module's logger
# and to the `datasets` and `transformers` loggers.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Short per-process summary, emitted at WARNING level.
logger.warning(
    f"Process rank: {training_args.local_rank}, "
    f"device: {training_args.device}, "
    f"n_gpu: {training_args.n_gpu}"
    f" distributed training: {bool(training_args.local_rank != -1)}, "
    f"16-bits training: {training_args.fp16}"
)

# Full dump of the three argument groups at INFO level.
logger.info(f"Model parameters {model_args}")
logger.info(f"Data parameters {data_args}")
logger.info(f"Training/evaluation parameters {training_args}")

2024-11-17 21:01:06 - INFO - __main__ - Model parameters ModelArguments(base_model_revision=None, model_name_or_path='mistralai/Mistral-7B-v0.1', model_revision='main', model_code_revision=None, torch_dtype='bfloat16', tokenizer_name_or_path=None, trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bnb_4bit_quant_storage='uint8')
2024-11-17 21:01:06 - INFO - __main__ - Data parameters DataArguments(chat_template="{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_genera

In [12]:
# Ask get_checkpoint for an existing checkpoint for these training arguments;
# the result is compared against None below.
last_checkpoint = get_checkpoint(training_args)

# Only log the detected checkpoint when no explicit `resume_from_checkpoint`
# was configured.
if training_args.resume_from_checkpoint is None and last_checkpoint is not None:
    logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")

In [54]:
# Print the "text" field of the last training example.
# NOTE(review): presumably requires the chat-template cell to have run, since
# that is where the "text" column appears to come from - confirm.
print(raw_datasets["train"][-1]["text"])

<|system|>
</s>
<|user|>
What is the historical significance of the pink stone extracted from Zacatecas quarry in Mexican architecture?</s>
<|assistant|>
The pink stone extracted from Zacatecas quarry in Mexican architecture is historically significant because it was widely used in the construction of colonial architecture in Mexico during the 16th century. This stone is a type of sandstone called cantera rosa, which has a pinkish color and is relatively easy to cut and carve. It was popular among Spanish architects and builders because of its beauty, durability, and availability in the region.

Zacatecas became an important source of this pink stone during the colonial period, and it was used extensively in the construction of cathedrals, churches, government buildings, and private residences throughout Mexico. Some notable examples of cantera rosa architecture include the Cathedral of Our Lady of the Assumption in Mexico City, the Cathedral of Puebla, and the Templo de San Francisco 

In [56]:
# Load the dataset mixture again with the same arguments as the main loading
# cell, but into a separate variable so `raw_datasets` is left untouched.
test_data = get_datasets(
    data_args,
    columns_to_keep=["messages", "chosen", "rejected", "prompt", "completion", "label"],
    configs=data_args.dataset_configs,
    splits=data_args.dataset_splits,
)

Overwrite dataset info from restored data version if exists.


2024-11-17 22:27:53 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 22:27:53 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


2024-11-17 22:27:53 - INFO - datasets.builder - Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 22:27:53 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


Overwrite dataset info from restored data version if exists.


2024-11-17 22:27:54 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 22:27:54 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


2024-11-17 22:27:55 - INFO - datasets.builder - Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 22:27:55 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


Overwrite dataset info from restored data version if exists.


2024-11-17 22:27:56 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 22:27:56 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


2024-11-17 22:27:56 - INFO - datasets.builder - Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 22:27:56 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Overwrite dataset info from restored data version if exists.


2024-11-17 22:27:58 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 22:27:58 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


2024-11-17 22:27:58 - INFO - datasets.builder - Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 22:27:58 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


2024-11-17 22:27:58 - INFO - datasets.arrow_dataset - Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


Loading cached shuffled indices for dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-ed939b0339b874d1.arrow


2024-11-17 22:27:58 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-ed939b0339b874d1.arrow


Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


2024-11-17 22:27:58 - INFO - datasets.arrow_dataset - Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


Loading cached shuffled indices for dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-e4c1e667b15edf7c.arrow


2024-11-17 22:27:58 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-e4c1e667b15edf7c.arrow


In [13]:
###############
# Load datasets
###############
# Build the dataset mixture described by data_args, keeping only the listed
# columns.
raw_datasets = get_datasets(
    data_args,
    columns_to_keep=["messages", "chosen", "rejected", "prompt", "completion", "label"],
    configs=data_args.dataset_configs,
    splits=data_args.dataset_splits,
)

# Report the number of rows in each resulting split.
logger.info(
    "Training on the following datasets and their proportions: "
    f"{[f'{split} : {dset.num_rows}' for split, dset in raw_datasets.items()]}"
)

# Column names of the train split, captured before any further processing;
# the chat-template cell passes them as `remove_columns`.
column_names = list(raw_datasets["train"].features)


README.md:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Generating dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


2024-11-17 21:01:42 - INFO - datasets.builder - Generating dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


Downloading and preparing dataset cai-conversation-harmless/default to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464...


2024-11-17 21:01:42 - INFO - datasets.builder - Downloading and preparing dataset cai-conversation-harmless/default to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464...


(…)-00000-of-00001-7f69edecc397f342.parquet:   0%|          | 0.00/35.3M [00:00<?, ?B/s]

(…)-00000-of-00001-32cbb1e30771a5e7.parquet:   0%|          | 0.00/35.5M [00:00<?, ?B/s]

(…)-00000-of-00001-96844167dbb0a822.parquet:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

(…)-00000-of-00001-f0c5cd137d4c1d93.parquet:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading took 0.0 min


2024-11-17 21:01:47 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-17 21:01:47 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train_sft split


2024-11-17 21:01:47 - INFO - datasets.builder - Generating train_sft split


Generating train_sft split:   0%|          | 0/21268 [00:00<?, ? examples/s]

Generating train_prefs split


2024-11-17 21:01:50 - INFO - datasets.builder - Generating train_prefs split


Generating train_prefs split:   0%|          | 0/21269 [00:00<?, ? examples/s]

Generating test_sft split


2024-11-17 21:01:54 - INFO - datasets.builder - Generating test_sft split


Generating test_sft split:   0%|          | 0/1156 [00:00<?, ? examples/s]

Generating test_prefs split


2024-11-17 21:01:54 - INFO - datasets.builder - Generating test_prefs split


Generating test_prefs split:   0%|          | 0/1156 [00:00<?, ? examples/s]

All the splits matched successfully.


2024-11-17 21:01:54 - INFO - datasets.utils.info_utils - All the splits matched successfully.


Dataset cai-conversation-harmless downloaded and prepared to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464. Subsequent calls will reuse this data.


2024-11-17 21:01:54 - INFO - datasets.builder - Dataset cai-conversation-harmless downloaded and prepared to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464. Subsequent calls will reuse this data.


Overwrite dataset info from restored data version if exists.


2024-11-17 21:01:56 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 21:01:56 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


2024-11-17 21:01:56 - INFO - datasets.builder - Found cached dataset cai-conversation-harmless (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


2024-11-17 21:01:56 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464


README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Generating dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


2024-11-17 21:02:00 - INFO - datasets.builder - Generating dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


Downloading and preparing dataset ultrachat_200k/default to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a...


2024-11-17 21:02:00 - INFO - datasets.builder - Downloading and preparing dataset ultrachat_200k/default to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a...


(…)-00000-of-00003-a3ecf92756993583.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00001-of-00003-0a1804bcb6ae68c6.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00002-of-00003-ee46ed25cfae92c6.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00000-of-00001-f7dfac4afe5b93f4.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

(…)-00000-of-00003-a6c9fb894be3e50b.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

(…)-00001-of-00003-d6a0402e417f35ca.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

(…)-00002-of-00003-c0db75b92a2f48fd.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

(…)-00000-of-00001-3d4cd8309148a71f.parquet:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Downloading took 0.0 min


2024-11-17 21:02:55 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-17 21:02:55 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train_sft split


2024-11-17 21:02:55 - INFO - datasets.builder - Generating train_sft split


Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split


2024-11-17 21:03:47 - INFO - datasets.builder - Generating test_sft split


Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split


2024-11-17 21:03:52 - INFO - datasets.builder - Generating train_gen split


Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split


2024-11-17 21:04:53 - INFO - datasets.builder - Generating test_gen split


Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

All the splits matched successfully.


2024-11-17 21:05:00 - INFO - datasets.utils.info_utils - All the splits matched successfully.


Dataset ultrachat_200k downloaded and prepared to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a. Subsequent calls will reuse this data.


2024-11-17 21:05:00 - INFO - datasets.builder - Dataset ultrachat_200k downloaded and prepared to /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a. Subsequent calls will reuse this data.


Overwrite dataset info from restored data version if exists.


2024-11-17 21:05:03 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 21:05:03 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


2024-11-17 21:05:03 - INFO - datasets.builder - Found cached dataset ultrachat_200k (/arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a)


Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


2024-11-17 21:05:03 - INFO - datasets.info - Loading Dataset info from /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/8049631c405ae6576f93f445c6b8166f76f5505a


Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


2024-11-17 21:05:03 - INFO - datasets.arrow_dataset - Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


Caching indices mapping at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-ed939b0339b874d1.arrow


2024-11-17 21:05:03 - INFO - datasets.arrow_dataset - Caching indices mapping at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-ed939b0339b874d1.arrow


Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


2024-11-17 21:05:03 - INFO - datasets.arrow_dataset - Some of the datasets have disparate format. Resetting the format of the concatenated dataset.


Caching indices mapping at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-e4c1e667b15edf7c.arrow


2024-11-17 21:05:03 - INFO - datasets.arrow_dataset - Caching indices mapping at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-e4c1e667b15edf7c.arrow
2024-11-17 21:05:03 - INFO - __main__ - Training on the following datasets and their proportions: ['train : 229133', 'test : 24266']


In [17]:
################
# Load tokenizer
################
tokenizer = get_tokenizer(model_args, data_args)

#######################
# Load pretrained model
#######################
logger.info("*** Load pretrained model ***")
# "auto" and None are passed through unchanged; any other value is looked up
# as an attribute of torch (e.g. "bfloat16" -> torch.bfloat16).
torch_dtype = (
    model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
)
quantization_config = get_quantization_config(model_args)

model_kwargs = dict(
    revision=model_args.model_revision,
    trust_remote_code=model_args.trust_remote_code,
    attn_implementation=model_args.attn_implementation,
    torch_dtype=torch_dtype,
    # The cache is disabled whenever gradient checkpointing is enabled.
    use_cache=not training_args.gradient_checkpointing,
    # A k-bit device map is only requested when a quantization config exists.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

# By default only the model identifier is kept.
# NOTE(review): presumably the trainer instantiates the model from this
# identifier together with `model_kwargs` - confirm against the trainer call.
model = model_args.model_name_or_path
# For ChatML we need to add special tokens and resize the embedding layer.
# `tokenizer.chat_template` can be None when no template is configured; fall
# back to an empty string so the membership test cannot raise TypeError.
if "<|im_start|>" in (tokenizer.chat_template or "") and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
    model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
    model, tokenizer = setup_chat_format(model, tokenizer)
    # The model object already exists at this point, so the init kwargs are
    # cleared.
    model_kwargs = None


[INFO|tokenization_utils_base.py:2211] 2024-11-17 21:07:48,538 >> loading file tokenizer.model from cache at /arc/home/obriaint/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/tokenizer.model
[INFO|tokenization_utils_base.py:2211] 2024-11-17 21:07:48,539 >> loading file tokenizer.json from cache at /arc/home/obriaint/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/tokenizer.json
[INFO|tokenization_utils_base.py:2211] 2024-11-17 21:07:48,540 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2211] 2024-11-17 21:07:48,540 >> loading file special_tokens_map.json from cache at /arc/home/obriaint/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/special_tokens_map.json
[INFO|tokenization_utils_base.py:2211] 2024-11-17 21:07:48,541 >> loading file tokenizer_config.json from

2024-11-17 21:07:48 - INFO - __main__ - *** Load pretrained model ***


In [18]:
#####################
# Apply chat template
#####################
# Run apply_chat_template over every split in parallel worker processes and
# drop the columns captured in `column_names` from the result.
raw_datasets = raw_datasets.map(
    apply_chat_template,
    desc="Applying chat template",
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    fn_kwargs=dict(
        tokenizer=tokenizer,
        task="sft",
        auto_insert_empty_system_msg=data_args.auto_insert_empty_system_msg,
    ),
)

Process #0 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00000_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #0 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00000_of_00012.arrow


Process #1 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00001_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #1 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00001_of_00012.arrow


Process #2 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00002_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #2 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00002_of_00012.arrow


Process #3 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00003_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #3 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00003_of_00012.arrow


Process #4 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00004_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #4 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00004_of_00012.arrow


Process #5 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00005_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #5 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00005_of_00012.arrow


Process #6 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00006_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #6 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00006_of_00012.arrow


Process #7 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00007_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #7 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00007_of_00012.arrow


Process #8 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00008_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #8 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00008_of_00012.arrow


Process #9 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00009_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #9 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00009_of_00012.arrow


Process #10 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00010_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #10 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00010_of_00012.arrow


Process #11 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00011_of_00012.arrow


2024-11-17 21:08:13 - INFO - datasets.arrow_dataset - Process #11 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00011_of_00012.arrow


Spawning 12 processes


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Spawning 12 processes


Applying chat template (num_proc=12):   0%|          | 0/229133 [00:00<?, ? examples/s]

Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00000_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00000_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00001_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00001_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00002_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00002_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00003_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00003_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00004_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00004_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00005_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00005_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00006_of_00012.arrow


2024-11-17 21:08:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00006_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00007_of_00012.arrow


2024-11-17 21:08:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00007_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00009_of_00012.arrow


2024-11-17 21:08:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00009_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00008_of_00012.arrow


2024-11-17 21:08:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00008_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00010_of_00012.arrow


2024-11-17 21:08:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00010_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00011_of_00012.arrow


2024-11-17 21:08:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-930193f3e0e437e9_00011_of_00012.arrow


Concatenating 12 shards


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Concatenating 12 shards


Process #0 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00000_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #0 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00000_of_00012.arrow


Process #1 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00001_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #1 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00001_of_00012.arrow


Process #2 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00002_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #2 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00002_of_00012.arrow


Process #3 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00003_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #3 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00003_of_00012.arrow


Process #4 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00004_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #4 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00004_of_00012.arrow


Process #5 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00005_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #5 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00005_of_00012.arrow


Process #6 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00006_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #6 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00006_of_00012.arrow


Process #7 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00007_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #7 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00007_of_00012.arrow


Process #8 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00008_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #8 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00008_of_00012.arrow


Process #9 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00009_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #9 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00009_of_00012.arrow


Process #10 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00010_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #10 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00010_of_00012.arrow


Process #11 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00011_of_00012.arrow


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Process #11 will write at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00011_of_00012.arrow


Spawning 12 processes


2024-11-17 21:08:30 - INFO - datasets.arrow_dataset - Spawning 12 processes


Applying chat template (num_proc=12):   0%|          | 0/24266 [00:00<?, ? examples/s]

Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00001_of_00012.arrow
Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00000_of_00012.arrow
Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00002_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00001_of_00012.arrow
2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00002_of_00012.arrow
2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00000_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00003_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00003_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00004_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00004_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00005_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00005_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00006_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00006_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00007_of_00012.arrow


2024-11-17 21:08:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00007_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00008_of_00012.arrow


2024-11-17 21:08:32 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00008_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00009_of_00012.arrow


2024-11-17 21:08:32 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00009_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00010_of_00012.arrow


2024-11-17 21:08:32 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00010_of_00012.arrow


Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00011_of_00012.arrow


2024-11-17 21:08:32 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-642247be3e8a328a_00011_of_00012.arrow


Concatenating 12 shards


2024-11-17 21:08:34 - INFO - datasets.arrow_dataset - Concatenating 12 shards


In [23]:
# Spot-check one training example after the chat template has been applied.
print(raw_datasets['train'][10]['text'])

<|system|>
</s>
<|user|>
People who inject drugs support the use of new, safer ‘low dead space’ syringes, our research has found.
A low dead space syringe has less space between the needle and the plunger when it’s fully pushed in, compared to traditional injecting equipment. It also has a detachable needle. The ‘dead’ space in a syringe holds blood after it’s been used. Previous research has found that low dead space syringes could reduce the chance of spreading infections, such as HIV and hepatitis C, if they’re re-used or shared.
This diagram shows how syringe design can affect the amount of blood collected and transmitted when sharing needles. The far left shows a very high dead space syringe, and the far right is the lowest.
Needle exchanges supply sterile equipment to people who inject drugs, to stop people having to share with others. But sometimes equipment still gets re-used or shared. If exchanges switch to this new type of syringe, it would help protect people from infection

In [24]:
##########################
# Decontaminate benchmarks
##########################
# Run every split through `decontaminate_humaneval` and report how many training
# rows were removed by the filter.
num_raw_train_samples = len(raw_datasets["train"])
raw_datasets = raw_datasets.filter(decontaminate_humaneval, batched=True, batch_size=10_000, num_proc=1)
num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"])
# An empty training split would otherwise raise ZeroDivisionError in the log line.
filtered_pct = num_filtered_train_samples / num_raw_train_samples * 100 if num_raw_train_samples else 0.0
logger.info(
    f"Decontaminated {num_filtered_train_samples} ({filtered_pct:.2f}%) samples from the training set."
)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]

with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
    # random.sample raises ValueError when asked for more items than the population
    # holds, so cap the sample count at the size of the training split.
    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
        logger.info(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")

Filter:   0%|          | 0/229133 [00:00<?, ? examples/s]

Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-b4d769b59b776994.arrow


2024-11-17 21:11:25 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-b4d769b59b776994.arrow


Filter:   0%|          | 0/24266 [00:00<?, ? examples/s]

Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-2a98584ccc9136fc.arrow


2024-11-17 21:18:51 - INFO - datasets.arrow_dataset - Caching processed dataset at /arc/home/obriaint/huggingface/datasets/HuggingFaceH4___cai-conversation-harmless/default/0.0.0/51954823511ffd96d15ed6d1004949277e7af464/cache-2a98584ccc9136fc.arrow
2024-11-17 21:19:16 - INFO - __main__ - Decontaminated 1 (0.00%) samples from the training set.
2024-11-17 21:19:16 - INFO - __main__ - Sample 167621 of the processed training set:

<|system|>
</s>
<|user|>
Vinyl Premium Plastic Protector! The clear plastic protective case allows you to easily display your Pop vinyl character anywhere with style, while giving it a nice protective outer shell! Vinyl Figures in their original packaging can be protected the way collectibles are meant to be protected. The lid easily comes off to allow placement of the Pop! Vinyl Figure inside.
Funko 3.75-Inch Vinyl Plastic POP Protector, Standard Packaging is rated 5.0 out of 5 by 3.
Rated 5 out of 5 by Honeywest from Perfect Protector for A Loved Pop Bought to 

In [29]:
# Clear the attention-implementation entry before model_kwargs is handed to
# SFTTrainer as model_init_kwargs.
# NOTE(review): presumably done because the configured 'flash_attention_2' is not
# usable in this environment — confirm.
model_kwargs['attn_implementation'] = None

In [30]:
# ---------------------------------------------------------------
# Build the SFT trainer
# ---------------------------------------------------------------
trainer = SFTTrainer(
    # Model, how it is instantiated, and optional PEFT adaptation.
    model=model,
    model_init_kwargs=model_kwargs,
    peft_config=get_peft_config(model_args),
    tokenizer=tokenizer,
    # Training configuration.
    args=training_args,
    max_seq_length=training_args.max_seq_length,
    packing=True,
    # Data: train/eval splits, read from the "text" column.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    dataset_kwargs=training_args.dataset_kwargs,
)

[INFO|configuration_utils.py:679] 2024-11-17 21:42:06,479 >> loading configuration file config.json from cache at /arc/home/obriaint/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/config.json
[INFO|configuration_utils.py:746] 2024-11-17 21:42:06,484 >> Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.2",
  "use_cache": false,
  "vocab_

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4800] 2024-11-17 21:42:07,558 >> All model checkpoint weights were used when initializing MistralForCausalLM.

[INFO|modeling_utils.py:4808] 2024-11-17 21:42:07,560 >> All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
[INFO|configuration_utils.py:1051] 2024-11-17 21:42:07,672 >> loading configuration file generation_config.json from cache at /arc/home/obriaint/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/2dcff66eac0c01dc50e4c41eea959968232187fe/generation_config.json
[INFO|configuration_utils.py:1096] 2024-11-17 21:42:07,673 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

Using custom data configuration default-da0fc66cc2bc9a67


2024-11-17 21:42:07 - INFO - datasets.builder - Using custom data configuration default-da0fc66cc2bc9a67


Loading Dataset Infos from /arc/home/obriaint/.local/lib/python3.11/site-packages/datasets/packaged_modules/generator


2024-11-17 21:42:07 - INFO - datasets.info - Loading Dataset Infos from /arc/home/obriaint/.local/lib/python3.11/site-packages/datasets/packaged_modules/generator


Generating dataset generator (/arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0)


2024-11-17 21:42:07 - INFO - datasets.builder - Generating dataset generator (/arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0)


Downloading and preparing dataset generator/default to /arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0...


2024-11-17 21:42:07 - INFO - datasets.builder - Downloading and preparing dataset generator/default to /arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0...


Generating train split


2024-11-17 21:42:07 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-17 21:58:38 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to /arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0. Subsequent calls will reuse this data.


2024-11-17 21:58:38 - INFO - datasets.builder - Dataset generator downloaded and prepared to /arc/home/obriaint/huggingface/datasets/generator/default-da0fc66cc2bc9a67/0.0.0. Subsequent calls will reuse this data.


Using custom data configuration default-4e38acd28db1e147


2024-11-17 21:58:38 - INFO - datasets.builder - Using custom data configuration default-4e38acd28db1e147


Loading Dataset Infos from /arc/home/obriaint/.local/lib/python3.11/site-packages/datasets/packaged_modules/generator


2024-11-17 21:58:38 - INFO - datasets.info - Loading Dataset Infos from /arc/home/obriaint/.local/lib/python3.11/site-packages/datasets/packaged_modules/generator


Generating dataset generator (/arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0)


2024-11-17 21:58:38 - INFO - datasets.builder - Generating dataset generator (/arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0)


Downloading and preparing dataset generator/default to /arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0...


2024-11-17 21:58:38 - INFO - datasets.builder - Downloading and preparing dataset generator/default to /arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0...


Generating train split


2024-11-17 21:58:38 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-17 22:00:26 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset generator downloaded and prepared to /arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0. Subsequent calls will reuse this data.


2024-11-17 22:00:26 - INFO - datasets.builder - Dataset generator downloaded and prepared to /arc/home/obriaint/huggingface/datasets/generator/default-4e38acd28db1e147/0.0.0. Subsequent calls will reuse this data.


[INFO|trainer.py:699] 2024-11-17 22:00:27,514 >> Using cpu_amp half precision backend


In [45]:
# Inspect the dataset kwargs that are forwarded to SFTTrainer.
training_args.dataset_kwargs

{}

In [44]:
# Peek at the first example of the text-only training split.
train_dataset[0]

{'text': "<|system|>\n</s>\n<|user|>\nSome commenters suggest that Ashley should make amends by contributing to a cancer society or center. Do you think this would be a fitting form of restitution? Why or why not?: Last year, Toronto’s press was all over the bizarre story of Ashley Anne Kirilow, the young woman from Burlington who shaved her head and eyebrows to fake the appearance of someone undergoing intense chemotherapy, and who managed to bilk around $12,000 from well-wishers. That story came to a close yesterday as Kirilow pleaded guilty to the last counts against her in court. The conditions of her sentence are, appropriately enough, as “interesting” as the crime itself.\nKirilow, 23, who is now living in the psychiatric ward of Joseph Brant Memorial Hospital in Burlington, pleaded guilty Thursday to outstanding fraud charges and was given a 15-month conditional sentence without jail time. She had previously pleaded guilty to one charge of fraud over $5,000….\nJointly submitted 

In [37]:
# Number of input_ids in one example of the trainer's processed training set.
len(trainer.train_dataset[1]['input_ids'])

2048

In [40]:
# Number of labels in the same example of the trainer's processed training set.
len(trainer.train_dataset[1]['labels'])

2048

In [None]:
# ---------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------
logger.info("*** Train ***")

# An explicitly configured checkpoint takes priority; otherwise use whatever
# `last_checkpoint` holds (which may be None).
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
else:
    checkpoint = last_checkpoint

train_result = trainer.train(resume_from_checkpoint=checkpoint)

# Attach the training-set size to the reported metrics, then log and persist them.
metrics = train_result.metrics
metrics.update(train_samples=len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# ---------------------------------------------------------------
# Save model and create model card
# ---------------------------------------------------------------
logger.info("*** Save model ***")
trainer.save_model(training_args.output_dir)
logger.info(f"Model saved to {training_args.output_dir}")

# Model-card metadata; the dataset names come from the keys of the dataset mixer.
kwargs = dict(
    finetuned_from=model_args.model_name_or_path,
    dataset=list(data_args.dataset_mixer),
    dataset_tags=list(data_args.dataset_mixer),
    tags=["alignment-handbook"],
)

# Everything below is written by the main process only.
if trainer.accelerator.is_main_process:
    trainer.create_model_card(**kwargs)
    # Turn the k,v cache back on in the saved config for fast inference.
    trainer.model.config.use_cache = True
    trainer.model.config.save_pretrained(training_args.output_dir)


In [None]:

# ---------------------------------------------------------------
# Evaluate
# ---------------------------------------------------------------
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    metrics = trainer.evaluate()
    # Attach the eval-set size to the reported metrics before logging/saving.
    metrics.update(eval_samples=len(eval_dataset))
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

# Upload only when push_to_hub is exactly True.
if training_args.push_to_hub is True:
    logger.info("Pushing to hub...")
    trainer.push_to_hub(**kwargs)

logger.info("*** Training complete ***")