In [1]:
from rosemary import jpt_setup; jpt_setup()
import os; os.environ['CUDA_VISIBLE_DEVICES'] = '0'

  warn(f'Install `torch` for functionalities dependent on torch')


In [2]:

import logging
import math
from pathlib import Path
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import json
import numpy as np
import pickle
from functools import partial

import pyarrow
import datasets
import evaluate
import torch
from datasets import load_dataset, IterableDataset

import numpy as np

import transformers
from transformers import (
    CONFIG_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    DataCollatorForLanguageModeling,
    is_torch_tpu_available,
    set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.trainer_callback import TrainerState
from transformers.trainer import TRAINER_STATE_NAME
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers.models.gpt2.configuration_gpt2 import GPT2Config

from doremi.training_args import ModelArguments, DataTrainingArguments, FullTrainingArguments
import doremi.dataloader as data_utils
from doremi.trainer import DoReMiTrainer
from doremi.dataloader import determine_skip_per_domain
from doremi.dataloader import interleave_datasets



try:
    import doremi.models as doremi_models
except Exception:
    
    pass
try:
    from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config
except Exception:
    pass


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.27.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

logger = logging.getLogger(__name__)

[2023-08-01 23:13:47,210] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
package_dir = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi"
cache_dir = '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache'
preprocessed_data = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed"

envs = {
    "CACHE": cache_dir,
    "DOREMI_DIR": package_dir,
    "PILE_DIR": os.path.join(package_dir, "data", 'raw'),
    "PREPROCESSED_PILE_DIR": preprocessed_data,
    "MODEL_OUTPUT_DIR": os.path.join(package_dir, 'results'),
    "PARTITION": "el8",
    "HF_HOME": cache_dir,
    "TRANSFORMERS_CACHE": cache_dir,
    "HF_DATASETS_CACHE": cache_dir,
    "HF_DATASETS_IN_MEMORY_MAX_SIZE": "0",
    "TORCH_EXTENSIONS_DIR": cache_dir,
    "TMPDIR": cache_dir,
    "WANDB_DIR": os.path.join(cache_dir, "wandb"),
    "PREPROCESSED_DATA": preprocessed_data,
    'PREPROCESSED_CACHE': os.path.join(cache_dir, 'preprocessed_cache', 'perdomain_pile_preprocessed'),

}

for k, v in envs.items():
    os.environ[k] = v
    
os.makedirs(cache_dir, exist_ok=True)


In [4]:
print(';'.join([f'export {k}={v}' for k, v in envs.items()]))

export CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export DOREMI_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi;export PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/raw;export PREPROCESSED_PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed;export MODEL_OUTPUT_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results;export PARTITION=el8;export HF_HOME=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export TRANSFORMERS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export HF_DATASETS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export HF_DATASETS_IN_MEMORY_MAX_SIZE=0;export TORCH_EXTENSIONS_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export TMPDIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export WANDB_DIR=/gpfs/

In [5]:
# populate domain weight config
import json
domain_config_path = '../configs/humanmix_baseline_50kvocab.json'
domain_weights = {"cot": .25, "flan_v2": .25, "dolly": .25, "oasst1": .25}
domain_weights = {'cot': 0.5, 'flan_v2': 0.25, 'dolly': 0.12, 'oasst1': 0.13}

domain_config = {"train_domain_weights": domain_weights, "eval_domain_weights": domain_weights}
with open(domain_config_path, 'w') as f:
    json.dump(domain_config, f)

In [6]:
# # move domain datasets to the right folder.
# src_dir = '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/data/processed'
# tgt_dir = preprocessed_data

# def copyanything(src, dst):
#     import shutil, errno
#     try:
#         shutil.copytree(src, dst)
#     except OSError as exc: # python >2.5
#         if exc.errno in (errno.ENOTDIR, errno.EINVAL):
#             shutil.copy(src, dst)
#         else: raise

# for domain in  domain_weights.keys():
#     src_domain_dir = os.path.join(src_dir, domain)
#     tgt_domain_dir = os.path.join(tgt_dir, domain)
#     print(src_domain_dir, os.path.isdir(src_domain_dir))


In [25]:
job_name = 'train_baseline'

nodes = 1
num_gpus = 1

model_name_or_path = 'gpt2'
cache_dir = envs['CACHE']
domain_config_path = '../configs/humanmix_baseline_50kvocab.json'
output_dir = os.path.join(envs['MODEL_OUTPUT_DIR'], job_name)
dataset_dir = envs['PREPROCESSED_CACHE']
dataset_dir = preprocessed_data
 
total_batch_size = 128 # # 64*8=512
per_device_train_batch_size = 2
gradient_accumulation_steps = 1
gradient_accumulation_steps = int(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size)

max_steps = 200000; save_steps = 5 # 200k steps.

# use `dataset_dir` instead of `dataset_name` to specify `preprocessed_dir`
# --dataset_name=pile \

## learning rate for pretraining, substituted with finetuning hyperparameters
# --learning_rate 1e-3 \
# --lr_end 1e-4 \
# --adam_epsilon 1e-8 \

## don't need cosine scheduling for finetuning
# --weight_decay 0.01 \
# --lr_scheduler_name linear_warmup_cosine \
# --warmup_ratio 0.06 \

## avoids grad scaling error
# --fp16 \
## for training model from scratch
# --config_overrides="n_positions=1024,n_embd=1024,n_layer=18,n_head=16" \

## added the following
# add_domain_id: for non-pile preprocessed dataset
# do_padding: true for variable size sequences, as in instruction tuning datasets.


cmd = f"""
--model_type={model_name_or_path} \
--tokenizer_name=gpt2 \
--do_train \
--cache_dir={cache_dir} \
--dataset_dir={dataset_dir} \
--domain_config_path={domain_config_path} \
--output_dir={output_dir} \
--max_token_length=1024 \
--per_device_train_batch_size={per_device_train_batch_size} \
--gradient_accumulation_steps={gradient_accumulation_steps} \
--dataloader_num_workers=1 \
--learning_rate=2e-5 \
--lr_scheduler_type=linear \
--warmup_ratio=0.03 \
--weight_decay=0. \
--max_grad_norm=1.0 \
--max_steps={max_steps} \
--evaluation_strategy=no \
--save_strategy=steps \
--save_steps={save_steps} \
--save_total_limit=1 \
--run_name={job_name} \
--seed=1111 \
--logging_strategy=steps \
--logging_steps=10 \
--logging_first_step \
--report_to=tensorboard \
--optim=adamw_hf \
--adam_beta1=0.9 \
--adam_beta2=0.99 \
--add_domain_id True \
--do_padding True \
--overwrite_output_dir \
--max_train_samples 1000
"""

import shlex
args = shlex.split(cmd)

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FullTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args)
model_args, data_args, training_args

[INFO|training_args.py:1301] 2023-08-01 23:22:54,278 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1716] 2023-08-01 23:22:54,279 >> PyTorch: setting up devices


(ModelArguments(model_name_or_path=None, model_type='gpt2', config_overrides=None, config_name=None, tokenizer_name='gpt2', cache_dir='/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache', use_fast_tokenizer=True, model_revision='main', use_auth_token=False, torch_dtype=None),
 DataTrainingArguments(dataset_dir='/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed', dataset_name='', max_train_samples=1000, max_eval_samples=None, max_token_length=1024, block_size=None, overwrite_cache=False, do_padding=True, add_domain_id=True, preprocessing_num_workers=None, shuffle=True),

In [26]:

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

if training_args.should_log:
    # The default of training_args.log_level is passive, so we set log level at info here to have that default.
    transformers.utils.logging.set_verbosity_info()
    

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()


# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

08/01/2023 23:22:54 - INFO - __main__ - Training/evaluation parameters FullTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.99,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
domain_config_path=../configs/humanmix_baseline_50kvocab.json,
doremi_optimizer=doremiv1,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_st

In [27]:

# Detecting last checkpoint.
last_checkpoint = None
num_skip_examples = 0
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )
        state = TrainerState.load_from_json(str(Path(last_checkpoint) / TRAINER_STATE_NAME))
        global_batch_size = training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
        num_skip_examples = state.global_step * global_batch_size
        logger.info(f"Skipping {num_skip_examples} examples")
        
last_checkpoint, num_skip_examples

(None, 0)

In [28]:

# Set seed before initializing model.
set_seed(training_args.seed)

In [29]:

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

config_kwargs = {
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.config_name:
    config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    if model_args.model_type == 'gpt_neox_flash':
        config = gpt_neox_config_to_gpt2_config(config)
        config.use_flash_attn = True
        config.fused_mlp = True
        config.fused_bias_fc = True
        config.fused_dropout_add_ln = True
        config.pad_vocab_size_multiple = 8
        config.activation_function = 'gelu_new'
        config.n_inner = None
        # disable absolute
        config.max_position_embeddings = 0
else:
    if model_args.model_type == 'gpt_flash': 
        config = GPT2Config(
                vocab_size=50257, n_positions=2048, n_embd=2048,
                n_layer=24, n_head=16, 
                scale_attn_by_inverse_layer_idx=True, 
                rotary_emb_fraction=0.5,
                use_flash_attn=True, fused_mlp=True,
                fused_bias_fc=True, fused_dropout_add_ln=True, 
                pad_vocab_size_multiple=8)
        # disable absolute
        config.max_position_embeddings = 0
    elif model_args.model_type == 'gpt_neox_flash':
        # convert to GPT2 config
        config = CONFIG_MAPPING['gpt_neox']() 
        config = gpt_neox_config_to_gpt2_config(config)
        config.use_flash_attn = True
        config.fused_mlp = True
        config.fused_bias_fc = True
        config.fused_dropout_add_ln = True
        config.pad_vocab_size_multiple = 8
        config.activation_function = 'gelu_new'
        config.n_inner = None
        # disable absolute
        config.max_position_embeddings = 0
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")
    if model_args.config_overrides is not None:
        logger.info(f"Overriding config: {model_args.config_overrides}")
        config.update_from_string(model_args.config_overrides)
        logger.info(f"New config: {config}")




In [30]:

tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    "use_auth_token": True if model_args.use_auth_token else None,
}

if model_args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)

elif model_args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )
    
tokenizer

[INFO|tokenization_auto.py:512] 2023-08-01 23:23:01,811 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:712] 2023-08-01 23:23:01,835 >> loading configuration file config.json from cache at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
[INFO|configuration_utils.py:768] 2023-08-01 23:23:01,837 >> Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [31]:

if model_args.model_name_or_path:
    torch_dtype = (
        model_args.torch_dtype
        if model_args.torch_dtype in ["auto", None]
        else getattr(torch, model_args.torch_dtype)
    )
    if model_args.model_type in {'gpt_flash', 'gpt_neox_flash'}:
        model = doremi_models.GPTFlashAttnLMHeadModel.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
            torch_dtype=torch_dtype,
        )
else:
    if model_args.model_type in {'gpt_flash', 'gpt_neox_flash'}:
        model = doremi_models.GPTFlashAttnLMHeadModel(config)
    else:
        model = AutoModelForCausalLM.from_config(config)

    n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
    logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
    
model

[INFO|configuration_utils.py:603] 2023-08-01 23:23:04,013 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.32.0.dev0"
}



08/01/2023 23:23:12 - INFO - __main__ - Training new model from scratch - Total size=118.68M params


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [32]:
from transformers import GPT2LMHeadModel
from doremi.models import CausalLMOutputWithDomainIDs

class GPTLMHeadModelDoReMi(GPT2LMHeadModel):

    def __init__(self, config):
        super().__init__(config)
        self.ignore_index = -100
        self.loss_fct = CrossEntropyLoss(reduction='mean', ignore_index=self.ignore_index)
        self.pertoken_loss_fct = CrossEntropyLoss(reduction='none', ignore_index=self.ignore_index)
        self.reference_model = None

    def forward(
        self,
        ## wpq: re-order to match transformers's gpt2 args order.
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ## wpq: doremi specific
        domain_ids: Optional[torch.LongTensor] = None,
        return_pertoken_losses: Optional[bool] = False,
        inference_params: Optional[dict] = None,
        last_token_only: Optional[bool] = False,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithDomainIDs]:
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if not return_pertoken_losses:
            lm_logits = super().forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    inference_params=inference_params,
                    last_token_only=last_token_only).logits

            if labels is not None:
                # move labels to correct device to enable model parallelism
                labels = labels.to(lm_logits.device)
                # Shift so that tokens < n predict n
                shift_logits = lm_logits[:, :-1, :].contiguous()
                shift_labels = labels[:, 1:].contiguous()
                # Flatten the tokens
                loss = self.loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            else:
                loss = None

            if not return_dict:
                output = (lm_logits, None, None, None, domain_ids, None, None, None) 
                return ((loss,) + output) if loss is not None else output

            return CausalLMOutputWithDomainIDs(
                loss=loss,
                logits=lm_logits,
                past_key_values=None,
                hidden_states=None,
                attentions=None,
                domain_ids=domain_ids)
        else:
            lm_logits = super().forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    inference_params=inference_params,
                    last_token_only=last_token_only).logits

            loss = None
            pertoken_loss = None
            reference_pertoken_loss = None
            if labels is not None:
                # move labels to correct device to enable model parallelism
                labels = labels.to(lm_logits.device)
                # Shift so that tokens < n predict n
                shift_logits = lm_logits[:, :-1, :].contiguous()
                shift_labels = labels[:, 1:].contiguous()
                # Flatten the tokens
                ignore_index = -100
                pertoken_loss = self.pertoken_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                pertoken_loss = pertoken_loss.view(shift_labels.size(0), shift_labels.size(1))
                token_mask = shift_labels.ne(ignore_index).float()

                loss = pertoken_loss.sum() / token_mask.sum()

                # run reference model forward to get pertoken_loss
                if self.reference_model is not None:
                    self.reference_model.eval()
                    reference_outputs = self.reference_model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        inputs_embeds=inputs_embeds,
                        head_mask=head_mask,
                        past_key_values=past_key_values,
                        labels=labels,
                        use_cache=use_cache,
                        output_attentions=output_attentions,
                        output_hidden_states=output_hidden_states,
                        return_dict=return_dict,
                        domain_ids=domain_ids,
                        return_pertoken_losses=True,
                        position_ids=position_ids,
                        inference_params=inference_params,
                        last_token_only=last_token_only,
                    )
                    reference_pertoken_loss = reference_outputs['pertoken_loss']

            if not return_dict:
                output = (lm_logits, None, None, None, domain_ids, pertoken_loss, reference_pertoken_loss, token_mask) 
                return ((loss,) + output) if loss is not None else output

            return CausalLMOutputWithDomainIDs(
                loss=loss,
                logits=lm_logits,
                past_key_values=None,
                hidden_states=None,
                attentions=None,
                domain_ids=domain_ids,
                pertoken_loss=pertoken_loss,
                reference_pertoken_loss=reference_pertoken_loss,
                token_mask=token_mask)
        
        

#             model_args.model_name_or_path,
#             from_tf=bool(".ckpt" in model_args.model_name_or_path),
#             config=config,
#             cache_dir=model_args.cache_dir,
#             revision=model_args.model_revision,
#             use_auth_token=True if model_args.use_auth_token else None,
#             torch_dtype=torch_dtype,

model = doremi_models.GPTFlashAttnLMHeadModel.from_pretrained(
    model_args.model_name_or_path, config=config)

        
GPTLMHeadModelDoReMi


GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.32.0.dev0",
  "use_cache": true,
  "vocab_size": 50257
}

In [34]:
model_args.model_name_or_path

In [16]:

with open(training_args.domain_config_path, 'r') as f:
    domain_config = json.load(f)

train_domain_weights_dict = domain_config['train_domain_weights']
eval_domain_weights_dict = domain_config['eval_domain_weights']
# whenever we convert dict to array, we sort by key
domain_list = list(sorted(train_domain_weights_dict.keys()))
num_domains = len(domain_list)

domain_list, num_domains, train_domain_weights_dict

(['cot', 'dolly', 'flan_v2', 'oasst1'],
 4,
 {'cot': 0.5, 'flan_v2': 0.25, 'dolly': 0.12, 'oasst1': 0.13})

In [17]:

if training_args.do_train:
    # data script could change tokenizer shape
    train_dataset = data_utils.get_preprocessed_mixed_dataset(
            preprocessed_dir=data_args.dataset_dir,
            domain_weights_dict=train_domain_weights_dict,
            dataset_name=data_args.dataset_name,
            cache_dir=model_args.cache_dir,
            split='train',
            max_samples=data_args.max_train_samples,
            add_domain_id=data_args.add_domain_id,
            tmp_file=None,
            seed=training_args.seed,
            tokenizer=tokenizer,
            shuffle=data_args.shuffle,
            num_skip_examples=num_skip_examples,
            shard_reversal=training_args.reweight_domains,
            training_args=training_args,
    )

if training_args.do_eval:
    eval_dataset = data_utils.get_preprocessed_mixed_dataset(
            preprocessed_dir=data_args.dataset_dir,
            domain_weights_dict=eval_domain_weights_dict,
            dataset_name=data_args.dataset_name,
            cache_dir=model_args.cache_dir,
            split='validation',
            add_domain_id=data_args.add_domain_id,
            max_samples=data_args.max_eval_samples,
            tokenizer=tokenizer,
            no_interleave=True,
            training_args=training_args,
    )

08/01/2023 23:14:21 - INFO - datasets.builder - Using custom data configuration default-20e8d8f5df875937
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/json
08/01/2023 23:14:21 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset info from /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-20e8d8f5df875937/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset info from /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-20e8d8f5df875937/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96


  0%|          | 0/1 [00:00<?, ?it/s]

08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #0 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-20e8d8f5df875937/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dac4998a00f3727e_00000_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #1 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-20e8d8f5df875937/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dac4998a00f3727e_00001_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #2 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-20e8d8f5df875937/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dac4998a00f3727e_00002_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #3 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mi

  0%|          | 0/1 [00:00<?, ?it/s]

08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #0 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-68fd04897f8e942c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-ee2bcd7dac9daed9_00000_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #1 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-68fd04897f8e942c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-ee2bcd7dac9daed9_00001_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #2 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-68fd04897f8e942c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-ee2bcd7dac9daed9_00002_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #3 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mi

  0%|          | 0/1 [00:00<?, ?it/s]

08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #0 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dd3397b5949d01cc_00000_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #1 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dd3397b5949d01cc_00001_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #2 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-dd3397b5949d01cc_00002_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #3 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mi

  0%|          | 0/1 [00:00<?, ?it/s]

08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #0 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-d43591dedd2996a1/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9d43238af1e5752b_00000_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #1 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-d43591dedd2996a1/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9d43238af1e5752b_00001_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #2 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-d43591dedd2996a1/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9d43238af1e5752b_00002_of_00008.arrow
08/01/2023 23:14:21 - INFO - datasets.arrow_dataset - Process #3 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mi

[ERROR|tokenization_utils_base.py:1056] 2023-08-01 23:14:21,926 >> Using pad_token, but it is not set yet.


08/01/2023 23:14:21 - INFO - datasets.builder - Using custom data configuration default-2cb43153e98ab920
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/generator
08/01/2023 23:14:21 - INFO - datasets.builder - Using custom data configuration default-ecd9b34c777a15d8
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/generator
08/01/2023 23:14:21 - INFO - datasets.builder - Using custom data configuration default-4984ce55f649fc18
08/01/2023 23:14:21 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/generator
08/01/2023 23:14:21 - INFO - datasets.builder - Using custom data configuration default-7f33c1

In [18]:

# preprocessed_dir=data_args.dataset_dir
# domain_weights_dict=train_domain_weights_dict
# dataset_name=data_args.dataset_name
# cache_dir=model_args.cache_dir
# split='train'
# max_samples=data_args.max_train_samples
# add_domain_id=data_args.add_domain_id
# tmp_file=None
# seed=training_args.seed
# tokenizer=tokenizer
# shuffle=data_args.shuffle
# num_skip_examples=num_skip_examples
# shard_reversal=training_args.reweight_domains
# no_interleave=False

# print(preprocessed_dir)
# print(domain_weights_dict)
# print(dataset_name)
# print(cache_dir)
# print(split)
# print(max_samples)
# print(add_domain_id)
# print(seed)
# print(shuffle)
# print(num_skip_examples)
# print(shard_reversal)


# domain_names = list(sorted(domain_weights_dict.keys()))
# domain_to_idx = {domain_names[i]: i for i in range(len(domain_names))}
# domain_weights = np.asarray([domain_weights_dict[domain_name] for domain_name in domain_names])
# domain_weights = domain_weights / domain_weights.sum()

# print()
# print(json.dumps({'domain_names': domain_names, 
#                   'domain_to_idx': domain_to_idx, 
#                   'domain_weights': list(domain_weights)},
#                 indent=4))


# # write domain weights to file if tmp_file is set
# if tmp_file is not None:
#     probabilities_tmp_file = tmp_file

#     with open(str(probabilities_tmp_file), 'wb') as f:
#         pickle.dump(domain_weights, f)
#     probabilities = None
# else:
#     probabilities = domain_weights
#     probabilities_tmp_file = None


# print()
# print(json.dumps({'probabilities': list(probabilities)}, indent=4))

# # from doremi.dataloader import get_perdomain_datasets
# # all_ds = get_perdomain_datasets(
# #     preprocessed_dir, 
# #     domain_weights_dict,
# #     cache_dir=cache_dir,
# #     split=split,
# #     seed=seed,
# #     domain_weights=domain_weights,
# #     domain_names=domain_names,
# #     num_skip_examples=num_skip_examples,
# #     shuffle=shuffle,
# #     shard_reversal=shard_reversal
# # )

# domain_name_to_skip_num = determine_skip_per_domain(num_skip_examples, seed, domain_weights, domain_names)

# preprocessed_dir = Path(preprocessed_dir)
# if split is not None and (preprocessed_dir / split).exists():
#     preprocessed_dir = preprocessed_dir / split
# else:
#     logger.warn(f"No split used or split directory not found: using same data for all splits.")

# domains = list(sorted(domain_weights_dict.keys()))

# print(preprocessed_dir)
# print(domain_name_to_skip_num)
# print()
# print(json.dumps({'preprocessed_dir': str(preprocessed_dir), 
#                   'domain_name_to_skip_num': domain_name_to_skip_num}, indent=4))


# all_ds = {}
# for domain in domains:
#     domain_dir = preprocessed_dir / domain
    
#     ## wpq: read instruction tuning dataset off `jsonl` files
#     if (domain_dir / f'{domain}_data.jsonl').exists():
#         from datasets import load_dataset
#         from functools import partial
#         from open_instruct.finetune_trainer import encode_with_prompt_completion_format, encode_with_messages_format
#         from doremi.dataloader import skippable_data_gen_dataset

#         data_files = {'train': str(domain_dir / f'{domain}_data.jsonl')}
#         raw_datasets = load_dataset(
#             "json",
#             data_files=data_files,
#             cache_dir=cache_dir,
#             use_auth_token=True if model_args.use_auth_token else None,
#         )
#         # Preprocessing the datasets.
#         if "prompt" in raw_datasets["train"].column_names and "completion" in raw_datasets["train"].column_names:
#             encode_function = partial(
#                 encode_with_prompt_completion_format,
#                 tokenizer=tokenizer,
#                 max_seq_length=1024,
#             )
#         elif "messages" in raw_datasets["train"].column_names:
#             encode_function = partial(
#                 encode_with_messages_format,
#                 tokenizer=tokenizer,
#                 max_seq_length=1024,
#             )
#         else:
#             raise ValueError("You need to have either 'prompt'&'completion' or 'messages' in your column names.")

#         with training_args.main_process_first(local=False, desc="Processing instruction data"):
#             lm_datasets = raw_datasets.map(
#                 encode_function,
#                 num_proc=16,
#                 batched=False,
#             )
#             lm_datasets.set_format(type="pt")
#         ds = lm_datasets['train']
#         ds = IterableDataset.from_generator(
#                 skippable_data_gen_dataset,
#                 gen_kwargs={'ds': ds,
#                             'num_skip_examples': domain_name_to_skip_num[domain],
#                             'loop': (split == 'train'),
#                             'seed': seed,
#                             'shuffle': shuffle}
#                 )
#         seed += 1
#     elif (domain_dir / 'dataset_info.json').exists():
#         ds = load_from_disk(dataset_path=str(domain_dir))
#         logger.info(f"Loaded {domain_dir}. Length: {len(ds)}")
#     else:
#         curr_shards = list(domain_dir.iterdir())
#         if shard_reversal:
#             curr_shards = list(reversed(curr_shards))
#         # shuffle shard order
#         random.Random(seed).shuffle(curr_shards)
#         ds = IterableDataset.from_generator(
#                 skippable_data_gen,
#                 gen_kwargs={'shards': curr_shards,
#                             'num_skip_examples': domain_name_to_skip_num[domain],
#                             'loop': (split == 'train'),
#                             'seed': seed,
#                             'shuffle': shuffle}
#                 )
#         seed += 1
#     all_ds[domain] = ds
    

# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# def add_domain_id_generator(ds, domain_idx):
#     for ex in ds:
#         ex['domain_id'] = domain_idx
#         yield ex
        
# domain_ds_ls = []
# for domain_name in domain_names:
#     domain_idx = domain_to_idx[domain_name]
#     domain_ds = all_ds[domain_name]
#     # add domain_id if necessary
#     if add_domain_id:
#         domain_ds = IterableDataset.from_generator(
#             add_domain_id_generator, 
#             gen_kwargs={'ds': domain_ds, 'domain_idx': domain_idx})
#     domain_ds_ls.append(domain_ds)

# if no_interleave:
#     # instead of interleaving, run through each dataset
#     def data_generator(shards):
#         for shard in shards:
#             for ex in shard:
#                 yield ex
#     ds = IterableDataset.from_generator(data_generator, gen_kwargs={'shards': domain_ds_ls})
#     logger.info("Not interleaving dataset - will not sample according to domain weights")

# else:
#     ds = interleave_datasets(
#             domain_ds_ls,
#             probabilities=probabilities,
#             probabilities_file=probabilities_tmp_file,
#             seed=seed)
    

# def take_data_generator(ds, max_samples):
#     idx = 0
#     for ex in ds:
#         yield ex
#         idx += 1
#         if max_samples is not None and idx >= max_samples:
#             return

# ds = IterableDataset.from_generator(take_data_generator, gen_kwargs={'ds': ds, 'max_samples': max_samples})
# train_dataset = ds


In [19]:
# test_ds = load_dataset(
#     "json",
#     data_files='test.jsonl',
#     cache_dir=model_args.cache_dir)['train']
# test_ds[0]

# for x in test_ds.to_iterable_dataset():
#     print(x)
# for i, v in enumerate(ds):
#     if i == 10:
#         break
#     print(v)

In [20]:

# turn off find unused parameters
training_args.ddp_find_unused_parameters = False

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
# embedding_size = model.get_input_embeddings.weight.shape[0]
# if len(tokenizer) > embedding_size:
#     model.resize_token_embeddings(len(tokenizer))

torch.cuda.empty_cache()

# Initialize our Trainer
trainer = DoReMiTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_utils.get_data_collator(tokenizer, do_padding=data_args.do_padding),
)

[INFO|trainer.py:565] 2023-08-01 23:14:23,845 >> max_steps is given, it will override any value given in num_train_epochs


In [21]:
print(trainer.args.max_grad_norm, \
    trainer.sharded_ddp, \
    trainer.args.half_precision_backend, \
    trainer.do_grad_scaling)

1.0 None auto False


In [22]:

# Training
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics

    if training_args.reweight_domains:
        avg_domain_weights_dict = {}
        for i in range(len(model.avg_domain_weights)):
            domain_name = domain_list[i]
            metrics[f'avg_domain_weight:{domain_name}'] = model.avg_domain_weights[i].item()
            avg_domain_weights_dict[domain_name] = model.avg_domain_weights[i].item()

        # save avg domain weights to json
        avg_domain_weights_file = Path(training_args.output_dir) / 'avg_domain_weights.json'
        with open(avg_domain_weights_file, 'w') as f:
            json.dump(avg_domain_weights_dict, f, indent=2)

        # also save to configs dir
        config_dict = {"train_domain_weights": avg_domain_weights_dict,
                       "eval_domain_weights": avg_domain_weights_dict}
        config_dict_file = Path(__file__).parent.parent / 'configs' / f"{Path(training_args.output_dir).name}.json"
        with open(config_dict_file, 'w') as f:
            json.dump(config_dict, f, indent=2)

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

[INFO|trainer.py:1682] 2023-08-01 23:14:24,401 >> ***** Running training *****
[INFO|trainer.py:1683] 2023-08-01 23:14:24,401 >>   Num examples = 25,600,000
[INFO|trainer.py:1684] 2023-08-01 23:14:24,402 >>   Num Epochs = 9,223,372,036,854,775,807
[INFO|trainer.py:1685] 2023-08-01 23:14:24,403 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:1688] 2023-08-01 23:14:24,404 >>   Total train batch size (w. parallel, distributed & accumulation) = 128
[INFO|trainer.py:1689] 2023-08-01 23:14:24,404 >>   Gradient Accumulation steps = 32
[INFO|trainer.py:1690] 2023-08-01 23:14:24,405 >>   Total optimization steps = 200,000
[INFO|trainer.py:1691] 2023-08-01 23:14:24,406 >>   Number of trainable parameters = 124,439,808




[INFO|trainer_utils.py:696] 2023-08-01 23:14:24,482 >> The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: id, dataset, messages, domain_id. If id, dataset, messages, domain_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.




Step,Training Loss
1,10.9927
10,10.9949


[INFO|trainer.py:2807] 2023-08-01 23:15:00,299 >> Saving model checkpoint to /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-5
[INFO|configuration_utils.py:458] 2023-08-01 23:15:00,303 >> Configuration saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-5/config.json
[INFO|configuration_utils.py:379] 2023-08-01 23:15:00,304 >> Configuration saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-5/generation_config.json
[INFO|modeling_utils.py:1855] 2023-08-01 23:15:00,760 >> Model weights saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-5/pytorch_model.bin
[INFO|tokenization_utils_base.py:2210] 2023-08-01 23:15:00,763 >> tokenizer config file saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-5/tokenizer_config.json
[IN



[INFO|trainer_utils.py:696] 2023-08-01 23:15:20,848 >> The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: id, dataset, messages, domain_id. If id, dataset, messages, domain_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.




[INFO|trainer.py:2807] 2023-08-01 23:15:36,394 >> Saving model checkpoint to /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-10
[INFO|configuration_utils.py:458] 2023-08-01 23:15:36,397 >> Configuration saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-10/config.json
[INFO|configuration_utils.py:379] 2023-08-01 23:15:36,399 >> Configuration saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-10/generation_config.json
[INFO|modeling_utils.py:1855] 2023-08-01 23:15:36,865 >> Model weights saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-10/pytorch_model.bin
[INFO|tokenization_utils_base.py:2210] 2023-08-01 23:15:36,868 >> tokenizer config file saved in /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/train_baseline/checkpoint-10/tokenizer_config.jso

AttributeError: 'GPT2LMHeadModel' object has no attribute 'perdomain_scores'