
Commit

Fix and update notebook

chiragjn committed Mar 14, 2024
1 parent fb745af commit ab185bb
Showing 3 changed files with 93 additions and 73 deletions.
158 changes: 89 additions & 69 deletions finetune.ipynb
@@ -50,7 +50,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Data needs to be in `jsonl` format with each line containing a json encoded string containing two keys `prompt` and `completion`\n",
"We support two different data formats:\n",
"\n",
"### `Chat`\n",
"\n",
"Data needs to be in `jsonl` format with each line containing a whole conversation in OpenAI chat format i.e. each line contains a key called `messages`. Each `messages` key contains a list of messages, where each message is a dictionary with `role` and `content` keys. The `role` key can be either `user`, `assistant` or `system` and the `content` key contains the message content.\n",
"\n",
"```jsonl\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"What's the capital of France?\"}, {\"role\": \"assistant\", \"content\": \"Paris\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Paris, as if everyone doesn't know that already.\"}]}\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"Who wrote 'Romeo and Juliet'?\"}, {\"role\": \"assistant\", \"content\": \"William Shakespeare\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Oh, just some guy named William Shakespeare. Ever heard of him?\"}]}\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"How far is the Moon from Earth?\"}, {\"role\": \"assistant\", \"content\": \"384,400 kilometers\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Around 384,400 kilometers. Give or take a few, like that really matters.\"}]}\n",
"...\n",
"```\n",
"\n",
"\n",
"Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n",
"\n",
"\n",
"### `Completion`\n",
"Data needs to be in `jsonl` format with each line containing a json encoded string containing two keys `prompt` and `completion`.\n",
"\n",
"```jsonl\n",
"{\"prompt\": \"What is 2 + 2?\", \"completion\": \"The answer to 2 + 2 is 4\"}\n",
@@ -59,7 +77,11 @@
"...\n",
"```\n",
"\n",
"Once you have your data on `.jsonl` files, you can upload them to the file tree on the left and change the `train_data` and `eval_data` variables in the `Data Parameters` section\n",
"Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n",
"\n",
"### Uploading data to notebook\n",
"\n",
"Once you have your data on `.jsonl` files, you can upload them to the file tree on the left and change the `train_data_uri` and `eval_data_uri` variables in the `Data Parameters` section\n",
"\n",
"![Upload Data](./assets/upload-data.png)\n",
"\n",
@@ -97,33 +119,34 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: Only first 100 data points will be used. This is okay for quick testing. To use all data points please set `max_num_samples` to 0\n"
]
}
],
"outputs": [],
"source": [
"from typing import List, Dict, Optional, Any\n",
"from data_utils import DatasetType\n",
"\n",
"# Type of dataset - Either `completion` or `chat`\n",
"dataset_type = DatasetType.completion\n",
"\n",
"# URI to training data. Can be a file on disk or an mlfoundry artifact fqn\n",
"train_data = \"./standford_alpaca_train_49k.jsonl\"\n",
"train_data_uri: str = \"./standford_alpaca_train_49k.jsonl\"\n",
"\n",
"# URI to evaluation data. Can be a file on disk or an mlfoundry artifact fqn. \n",
"# Set to \"NA\" if you want to split from train data\n",
"eval_data = \"./standford_alpaca_test_2k.jsonl\"\n",
"# Set to \"None\" if you want to split from train data\n",
"eval_data_uri: Optional[str] = \"./standford_alpaca_test_2k.jsonl\"\n",
"\n",
"# When eval_data is set to \"NA\", use this portion of the train_data to use as eval\n",
"# When eval_data is set to `None`, use this portion of the train_data to use as eval\n",
"eval_size = 0.1\n",
"\n",
"# How many samples to use for training. 0 means all data. Useful to test quickly\n",
"max_num_samples = 0\n",
"# If your dataset is small (< 10 examples), set this to False\n",
"sample_packing = True\n",
"\n",
"if max_num_samples != 0:\n",
" print(f\"Note: Only first {max_num_samples} data points will be used. This is okay for quick testing. To use all data points please set `max_num_samples` to 0\")"
"# How many steps to use for training. None means all data. Useful to test quickly\n",
"max_steps: Optional[int] = None\n",
"\n",
"if max_steps is not None:\n",
" print(f\"Note: max_steps is set, this might not use the entire training data. This is okay for quick testing. To use all data points please set `max_steps` to `None`\")"
]
},
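As a quick sanity check on the values above, one might count examples and peek at the first record; a small sketch, assuming the training file is on local disk:

```python
import json

# Peek at the first record and count examples in the training file.
with open(train_data_uri) as f:
    first = json.loads(f.readline())
    total = 1 + sum(1 for _ in f)

print(f"{total} examples; first record keys: {list(first)}")
if eval_data_uri is None:
    print(f"~{int(total * eval_size)} examples would be split off for eval")
```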
{
@@ -158,9 +181,6 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"\n",
"# Huggingface hub model id to finetune e.g. \"stas/tiny-random-llama-2\"\n",
"# If you created this notebook instance from Truefoundry's Model Catalogue, the model id will be set in `launch_parameters`\n",
"model_id = launch_parameters.model_id\n",
@@ -182,21 +202,19 @@
"metadata": {},
"outputs": [],
"source": [
"# Enable LoRa with Quantization\n",
"use_qlora = True\n",
"adapter = \"qlora\"\n",
"\n",
"# If you want to disable quantization, set `use_qlora` to False and set `use_lora` to True\n",
"use_lora = False\n",
"\n",
"# qlora r. Increasing this will increase GPU memory requirement and training time but can give better results\n",
"# lora r. Increasing this will increase GPU memory requirement and training time but can give better results\n",
"lora_r = 32\n",
"\n",
"# qlora alpha\n",
"# lora alpha\n",
"lora_alpha = max(16, 2 * lora_r)\n",
"\n",
"# Whether to apply Lora to all linear layers\n",
"lora_target_linear = True\n",
"\n",
"if use_qlora and use_lora:\n",
" raise ValueError(\"Both `use_qlora` and `use_lora` cannot be True at the same time!\")"
"# The names of the modules to apply Lora to. These will be added to modules found by `lora_target_linear` if that is enabled\n",
"lora_target_modules: Optional[List[str]] = None"
]
},
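For intuition on what `lora_r` costs: a LoRA adapter on a linear layer of shape `(d_in, d_out)` adds two low-rank matrices, `A (d_in x r)` and `B (r x d_out)`. A back-of-the-envelope sketch with assumed, illustrative dimensions (not read from any model):

```python
# LoRA adds r * (d_in + d_out) trainable parameters per adapted linear layer.
d_in, d_out = 4096, 4096  # assumed hidden sizes, roughly a 7B-scale attention projection
r = 32                    # lora_r from the cell above

adapter_params = r * (d_in + d_out)  # 262,144 trainable parameters
full_params = d_in * d_out           # 16,777,216 frozen parameters in the base layer
print(f"adapter is {adapter_params / full_params:.2%} of the layer")  # ~1.56%
```

Doubling `lora_r` doubles the adapter size; the `lora_alpha = max(16, 2 * lora_r)` default keeps the effective adapter scaling (`lora_alpha / lora_r`) at 2 for any `lora_r` of 8 or more.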
{
@@ -213,31 +231,31 @@
"outputs": [],
"source": [
"# Where to dump checkpoints and model\n",
"output_dir = \"./model\"\n",
"output_dir = \"./outputs\"\n",
"\n",
"# If to delete `output_dir` before starting\n",
"cleanup_output_dir_on_start = False\n",
"\n",
"# Max Sequence Length. \n",
"# Increasing this will allow longer sequences but will significantly increase GPU memory requirement and training time.\n",
"# This cannot be greater than model's max sequence length\n",
"max_length = launch_parameters.max_length\n",
"max_sequence_length = launch_parameters.max_length\n",
"\n",
"# Max batch size per GPU. \n",
"# Batch size per GPU. \n",
"# Increasing this will increase GPU memory requirement and training time\n",
"per_device_train_batch_size = launch_parameters.batch_size\n",
"micro_batch_size = launch_parameters.batch_size\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.00003\n",
"\n",
"# How many epochs to run training for\n",
"num_train_epochs = 10\n",
"num_epochs = 10\n",
"\n",
"# How often to evaluate. Value less than 1 denotes every X% of total run\n",
"eval_steps = 0.05\n",
"eval_steps = 0.1\n",
"\n",
"# How often to save checkpoints. Value less than 1 denotes every X% of total run\n",
"save_steps = 0.05"
"save_steps = 0.1"
]
},
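The global batch size follows from `micro_batch_size` together with the `--gradient_accumulation_steps 4` passed in the launch command below; a small sketch, assuming a single GPU:

```python
# Effective batch size = per-GPU micro batch x gradient accumulation steps x number of GPUs.
gradient_accumulation_steps = 4  # matches the --gradient_accumulation_steps flag below
num_gpus = 1                     # assumption; accelerate uses all visible GPUs

effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_gpus
print(f"Effective batch size: {effective_batch_size}")
```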
{
@@ -253,10 +271,11 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from mlfoundry_utils import generate_run_name, get_or_create_run\n",
"\n",
"# Enable reporting metrics to mlfoundry\n",
"mlfoundry_enable_reporting = True\n",
"mlfoundry_enable_reporting = False\n",
"\n",
"# Which ML Repo to log metrics and checkpoints to. \n",
"# You can create new ML Repos from the https://<your-org>.truefoundry.cloud/mlfoundry page\n",
Expand All @@ -267,7 +286,7 @@
"mlfoundry_log_checkpoints = True\n",
"\n",
"# Run to which metrics and checkpoints will be logged\n",
"mlfoundry_run_name = generate_run_name(model_id)\n",
"mlfoundry_run_name = generate_run_name(model_id, seed=os.getpid())\n",
"\n",
"# If to upload checkpoints to ML Repo when they are saved\n",
"mlfoundry_checkpoint_artifact_name = f\"ckpt-{mlfoundry_run_name}\"\n",
@@ -295,7 +314,7 @@
" from urllib.parse import urljoin\n",
" from tensorboard import notebook\n",
"\n",
" tb_logs = os.path.join(\".\", \"tensorboard_logs\")\n",
" tb_logs = os.path.join(os.path.abspath(output_dir), \"model\", \"runs\")\n",
" os.makedirs(tb_logs, exist_ok=True)\n",
" os.environ[\"TENSORBOARD_PROXY_URL\"] = urljoin(os.getenv(\"NB_PREFIX\", \"/\"), \"proxy/%PORT%/\")\n",
" notebook.start(f\"--logdir {tb_logs} --reload_interval 30.0 --reload_multifile True\")\n",
@@ -317,55 +336,56 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"\n",
"# Mixed Precision Training. We automatically select the precision based on GPU capability\n",
"mixed_precision = \"bf16\" if torch.cuda.is_bf16_supported() else \"fp16\"\n",
"bf16 = (mixed_precision == \"bf16\")\n",
"fp16 = (mixed_precision == \"fp16\")\n",
"\n",
"COMMAND = f\"\"\"\n",
"accelerate launch \\\n",
"--mixed_precision {mixed_precision} \\\n",
"--use_deepspeed \\\n",
"train.py \\\n",
"config-base.yaml \\\n",
"--deepspeed ./deepspeed_configs/3_ds_z2_config.json \\\n",
"--bf16 {bf16} \\\n",
"--fp16 {fp16} \\\n",
"--model_id {model_id} \\\n",
"--flash_attention True \\\n",
"--gradient_checkpointing True \\\n",
"--base_model {model_id} \\\n",
"--output_dir {output_dir} \\\n",
"--train_data {train_data} \\\n",
"--eval_data {eval_data} \\\n",
"--eval_size {eval_size} \\\n",
"--max_num_samples {max_num_samples} \\\n",
"--train_on_prompt False \\\n",
"--max_length {max_length} \\\n",
"--use_qlora {use_qlora} \\\n",
"--use_lora {use_lora} \\\n",
"--qlora_bit_length 4 \\\n",
"--lora_target_modules auto \\\n",
"--dataset_type {dataset_type} \\\n",
"--train_data_uri {train_data_uri} \\\n",
"--val_data_uri {eval_data_uri} \\\n",
"--val_set_size {eval_size} \\\n",
"--max_steps {max_steps} \\\n",
"--sequence_len {max_sequence_length} \\\n",
"--train_on_inputs False \\\n",
"--sample_packing {sample_packing} \\\n",
"--pad_to_sequence_len True \\\n",
"--num_epochs {num_epochs} \\\n",
"--micro_batch_size {micro_batch_size} \\\n",
"--learning_rate {learning_rate} \\\n",
"--warmup_ratio 0.1 \\\n",
"--gradient_accumulation_steps 4 \\\n",
"--early_stopping_patience 10 \\\n",
"--adapter qlora \\\n",
"--lora_target_linear {lora_target_linear} \\\n",
"--lora_target_modules {lora_target_modules} \\\n",
"--lora_r {lora_r} \\\n",
"--lora_alpha {lora_alpha} \\\n",
"--lora_dropout 0.05 \\\n",
"--lora_bias none \\\n",
"--num_train_epochs {num_train_epochs} \\\n",
"--early_stopping_patience 10 \\\n",
"--early_stopping_threshold 0.0 \\\n",
"--auto_find_batch_size false \\\n",
"--per_device_train_batch_size {per_device_train_batch_size} \\\n",
"--per_device_eval_batch_size {per_device_train_batch_size} \\\n",
"--gradient_accumulation_steps 4 \\\n",
"--learning_rate {learning_rate} \\\n",
"--logging_strategy steps \\\n",
"--logging_steps 5 \\\n",
"--evaluation_strategy steps \\\n",
"--eval_steps {eval_steps} \\\n",
"--save_strategy steps \\\n",
"--save_steps {save_steps} \\\n",
"--seed 42 \\\n",
"--mlfoundry_enable_reporting {mlfoundry_enable_reporting} \\\n",
"--mlfoundry_ml_repo {mlfoundry_ml_repo} \\\n",
"--mlfoundry_run_name {mlfoundry_run_name} \\\n",
"--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n",
"--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n",
"--cleanup_output_dir_on_start False \\\n",
"--cleanup_output_dir_on_start {cleanup_output_dir_on_start} \\\n",
"--resume_from_checkpoint True \\\n",
"| tee train.log\n",
"\"\"\"\n",
@@ -382,7 +402,7 @@
},
"outputs": [],
"source": [
"!{COMMAND} "
"!{COMMAND}"
]
}
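Since `COMMAND` is built by f-string interpolation, printing the resolved tokens before launching can catch empty or misquoted values; a minimal standard-library sketch:

```python
import shlex

# Preview the fully interpolated launch command, one token per line.
for token in shlex.split(COMMAND):
    print(token)
```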
],
6 changes: 3 additions & 3 deletions mlfoundry_utils.py
@@ -184,12 +184,12 @@ def sanitize_name(value):
return re.sub(rf"[{re.escape(string.punctuation)}]+", "-", value.encode("ascii", "ignore").decode("utf-8"))


def generate_run_name(model_id):
def generate_run_name(model_id, seed: Optional[int] = None):
*_, model_name = model_id.split("/", 1)
sanitized_model_name = sanitize_name(model_name)
alphabet = string.ascii_lowercase + string.digits
random.choices(alphabet, k=8)
random_id = "".join(random.choices(alphabet, k=6))
rng = random.Random(seed) if seed is not None else random
random_id = "".join(rng.choices(alphabet, k=6))
run_name = f"ft-{sanitized_model_name}-{random_id}"
return run_name

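With the optional seed threaded through a local `random.Random`, the run name becomes reproducible without touching the global RNG state; an illustrative usage (the printed suffix is made up):

```python
# Same model_id and seed -> same run name; the suffix is 6 lowercase/digit characters.
name_a = generate_run_name("stas/tiny-random-llama-2", seed=1234)
name_b = generate_run_name("stas/tiny-random-llama-2", seed=1234)
assert name_a == name_b  # deterministic for a fixed seed

print(name_a)  # e.g. "ft-tiny-random-llama-2-k3x9p2" (illustrative)
```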
2 changes: 1 addition & 1 deletion utils.py
@@ -110,7 +110,7 @@ class Config:
extra = "ignore"

model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
max_length: Optional[int] = None
max_length: Optional[int] = 2048
batch_size: int = 1


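The changed default means launch parameters that omit `max_length` now fall back to a 2048-token context instead of `None`. A minimal sketch, assuming `Config` is a pydantic-style model (the inner `extra = "ignore"` suggests as much); direct construction here is hypothetical:

```python
cfg = Config()                              # hypothetical direct construction
print(cfg.max_length)                       # 2048, the new default
print(Config(max_length=4096).max_length)   # explicitly passed values still win -> 4096
```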
