32 changes: 29 additions & 3 deletions README.md
@@ -89,6 +89,15 @@ python pretrained/GPT-NeoX-20B/prepare.py

The weights for this model will be in the `pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b` directory.

If you want to fine-tune other GPT-NeoX models, e.g. [the Pythia model suite](https://huggingface.co/models?sort=downloads&search=pythia), you can pass the HF model name via `--model-name`, for example:

```shell
python pretrained/GPT-NeoX-20B/prepare.py --model-name EleutherAI/pythia-6.9b-deduped
```

The weights for this model will then be in the `pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped` directory.


# Training and Finetuning

## (Optional) 8bit Adam
@@ -113,21 +122,38 @@ As the training loop runs, checkpoints are saved to the `model_ckpts` directory

Please see [the training README](training/README.md) for more details about customizing the training run.

The `training/finetune_Pythia-Chat-Base-7B.sh` script is another example, which fine-tunes a 7B Pythia (GPT-NeoX) model. It launches 8 processes with a pipeline-parallel degree of 4 and a data-parallel degree of 2.
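
As a quick sanity check on those numbers (a sketch, not part of the repo; the layer count comes from the Pythia-6.9B config):

```python
# How the script's parallelism flags fit together (values from finetune_Pythia-Chat-Base-7B.sh).
pipeline_group_size = 4       # --pipeline-group-size
data_group_size = 2           # --data-group-size
world_size = pipeline_group_size * data_group_size
assert world_size == 8        # --world-size: one process (rank) per GPU

total_layers = 32             # Pythia-6.9B has 32 transformer layers
layers_per_stage = total_layers // pipeline_group_size
assert layers_per_stage == 8  # --num-layers given to each process
```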

# Converting Weights to Huggingface Format

Before you can use this model for inference, it must be converted to the Hugging Face format. Run this command from the root of the repo to do so.

```shell
mkdir huggingface_models \
&& python tools/convert_to_hf_gptneox.py \
    --ckpt-path model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100 \
    --save-path huggingface_models/GPT-NeoXT-Chat-Base-20B \
    --n-stages 8 \
    --n-layer-per-stage 6 \
    --fp16
```
where the `--fp16` flag loads and stores the model weights in fp16.

Make sure to replace `model_ckpts/GPT-Neo-XT-Chat-Base-20B/checkpoint_100` with the latest checkpoint in the `model_ckpts/GPT-Neo-XT-Chat-Base-20B` directory.
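
If you script this step, one way to pick the newest checkpoint automatically (a minimal sketch, assuming checkpoint directories are named `checkpoint_<step>` as above; requires Python 3.8+):

```python
import os
import re

ckpt_root = 'model_ckpts/GPT-Neo-XT-Chat-Base-20B'
steps = [int(m.group(1)) for d in os.listdir(ckpt_root)
         if (m := re.fullmatch(r'checkpoint_(\d+)', d))]
print(os.path.join(ckpt_root, f'checkpoint_{max(steps)}'))  # the latest checkpoint
```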

If you need to convert checkpoints of other GPT-NeoX variants, make sure to specify the correct config name for your variant.
For example, to convert a checkpoint fine-tuned from `EleutherAI/pythia-6.9b-deduped`, pass it as the config name:
```shell
python tools/convert_to_hf_gptneox.py \
    --config-name EleutherAI/pythia-6.9b-deduped \
    --ckpt-path model_ckpts/Pythia-Chat-Base-7B/checkpoint_100 \
    --save-path huggingface_models/Pythia-Chat-Base-7B \
    --n-stages 4 \
    --n-layer-per-stage 8 \
    --fp16
```
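
Here `--n-stages` matches the pipeline-parallel degree used at training time (4 in `finetune_Pythia-Chat-Base-7B.sh`), and the stages together must cover every transformer layer. A sketch of the arithmetic, reading the layer count from the HF config (an illustration, not a repo utility):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained('EleutherAI/pythia-6.9b-deduped')
n_stages = 4                                              # pipeline-parallel degree at training time
n_layer_per_stage = config.num_hidden_layers // n_stages  # 32 // 4 == 8
assert n_stages * n_layer_per_stage >= config.num_hidden_layers
```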


# Inference

To help you test the model, we provide a simple command-line test harness to interact with the bot.
9 changes: 8 additions & 1 deletion pretrained/GPT-NeoX-20B/prepare.py
@@ -22,28 +22,35 @@
if not os.path.exists(save_path):
    os.mkdir(save_path)

print('loading model from HF...')
config = AutoConfig.from_pretrained(args.model_name)
config.save_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.save_pretrained(save_path)

# offload model from memory to disk if offload-dir is specified
if args.offload_dir is not None:
    if not os.path.exists(args.offload_dir):
        os.mkdir(args.offload_dir)
    model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16, device_map="auto", offload_folder=args.offload_dir)
else:
    model = AutoModelForCausalLM.from_pretrained(args.model_name, torch_dtype=torch.float16)
print('loaded model from HF.')

print('converting the embedding layer...')
item = {}
item['embed_in.weight'] = model.gpt_neox.embed_in.weight
torch.save(item, os.path.join(save_path, 'pytorch_embs.pt'))
print('converted the embedding layer.')

for i in range(len(model.gpt_neox.layers)):
    print(f'converting the {i}-th transformer layer...')
    torch.save(model.gpt_neox.layers[i].state_dict(), os.path.join(save_path, f'pytorch_{i}.pt'))
    print(f'converted the {i}-th transformer layer.')

print('converting the lm_head layer...')
item = {}
item['embed_out.weight'] = model.embed_out.weight
item['final_layer_norm.weight'] = model.gpt_neox.final_layer_norm.weight
item['final_layer_norm.bias'] = model.gpt_neox.final_layer_norm.bias
torch.save(item, os.path.join(save_path, 'pytorch_lm_head.pt'))
print('converted the lm_head layer.')
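
The prepare script thus shards the HF checkpoint into per-layer files that the training code loads stage by stage. For `EleutherAI/pythia-6.9b-deduped` (32 layers) the output directory looks roughly like:

```
pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped/
├── config.json and tokenizer files   # from save_pretrained above
├── pytorch_embs.pt                   # embed_in.weight
├── pytorch_0.pt … pytorch_31.pt      # one state_dict per transformer layer
└── pytorch_lm_head.pt                # final_layer_norm + embed_out
```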
29 changes: 23 additions & 6 deletions tools/convert_to_hf_gptneox.py
@@ -56,13 +56,14 @@ def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_pe

        elif i == n_stages - 1:
            for j in range(n_layer_per_stage):
                _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")}
                if len(_tmp) == 0:
                    break
                # torch.save(_tmp, os.path.join(output_path, f'pytorch_{i*n_layer_per_stage + j}.pt'))
                model.gpt_neox.layers[i*n_layer_per_stage + j].load_state_dict(_tmp)
                if i*n_layer_per_stage + j == len(model.gpt_neox.layers) - 1:
                    j += 1
                    break

            _tmp = {k[len(f"{j}."):]:v for k,v in checkpoint.items() if k.startswith(f"{j}.")}
            if len(_tmp) == 0:
@@ -88,14 +89,17 @@ def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_pe
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Convert HF checkpoints')
    parser.add_argument('--config-name', type=str, default='EleutherAI/gpt-neox-20b',
                        help='config-name')
    parser.add_argument('--ckpt-path', type=str, default=None,
                        help='ckpt-path')
    parser.add_argument('--save-path', type=str, default=None,
                        help='save-path')
    parser.add_argument('--n-stages', type=int, default=8,
                        help='pipeline group size')
    parser.add_argument('--n-layer-per-stage', type=int, default=6,
                        help='n layers per GPU device')
    parser.add_argument('--fp16', default=False, action='store_true')
    args = parser.parse_args()

    assert args.ckpt_path is not None
@@ -104,13 +108,26 @@ def load_decentralized_checkpoint(model, checkpoint_path, n_stages=2, n_layer_pe
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    print('loading config...')
    config = AutoConfig.from_pretrained(args.config_name)
    print('loaded config.')
    print('loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(args.config_name)
    print('loaded tokenizer.')
    print('creating empty model...')
    model = create_empty_gptneox(config)
    if args.fp16:
        model = model.half()
    print('created empty model.')
    print('loading model ckpt...')
    load_decentralized_checkpoint(
        model, args.ckpt_path, n_stages=args.n_stages, n_layer_per_stage=args.n_layer_per_stage,
    )
    print('loaded model ckpt.')

    print('saving HF model...')
    model.save_pretrained(args.save_path)
    print(f'saved HF model to `{args.save_path}`')
    config.save_pretrained(args.save_path)
    tokenizer.save_pretrained(args.save_path)

83 changes: 83 additions & 0 deletions training/finetune_Pythia-Chat-Base-7B.sh
@@ -0,0 +1,83 @@
DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

netif=lo
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
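# NB: `lo` pins GLOO/NCCL traffic to the loopback interface, which suits this
# single-node, 8-process launch; on a multi-node run this would instead be the
# NIC that connects the machines (e.g. eth0).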
export MODEL_NAME=Pythia-Chat-Base-7B

export SHOW_DATA=0

BASE_MODEL="${DIR}/../pretrained/GPT-NeoX-20B/EleutherAI_pythia-6.9b-deduped/"
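# (directory produced by `python pretrained/GPT-NeoX-20B/prepare.py --model-name EleutherAI/pythia-6.9b-deduped`)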

CHECKPOINT_STEPS=100
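# i.e. save a checkpoint every 100 steps (passed to --checkpoint-steps below)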

DATASETS="\
${DIR}/../data/OIG/files/unified_ni.jsonl:0.2,\
${DIR}/../data/OIG/files/unified_p3.jsonl:0.5,\
${DIR}/../data/OIG/files/unified_flan.jsonl:0.2,\
${DIR}/../data/OIG/files/unified_chip2.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_rallio_safety_and_prosocial.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_soda_dialog.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_unifiedskg_instructions.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_merged_code_xp3.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_oscar_en_sample_dialog.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_ul2_plus_oscar_en_sample_dialog.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_multi_news.jsonl:0.05,\
${DIR}/../data/OIG/files/unified_openai_summarize_tldr.jsonl:0.05,\
${DIR}/../data/OIG/files/unified_squad_v2.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_nq.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_poetry_instructions.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_sqlv2.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_unnatural_instructions.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_conv_finqa.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_essays.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_plot_screenplay_books_dialog.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_grade_school_math_instructions.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_mathqa_flanv2_kojma_cot.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_joke_explanations.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_cuad.jsonl:0.01,\
${DIR}/../data/OIG/files/unified_abstract_infill.jsonl:0.1,\
${DIR}/../data/OIG/files/unified_image_prompts_instructions.jsonl:0.01 \
"

ARGS="--model-name ${BASE_MODEL} \
--tokenizer-name ${BASE_MODEL} \
--project-name together \
--model-type gptneox \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name \
"${DATASETS}" \
--checkpoint-path ${DIR}/../model_ckpts/${MODEL_NAME} \
--total-steps 20000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps ${CHECKPOINT_STEPS} \
--lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1 \
--dist-url tcp://127.0.0.1:7033 \
--num-layers 8 --embedding-dim 4096 \
--world-size 8 --pipeline-group-size 4 --data-group-size 2 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"


(trap 'kill 0' SIGINT; \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \
& \
python ${DIR}/dist_clm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \
& \
wait)