
Commit

Fix and update notebook

chiragjn committed Mar 14, 2024
1 parent fb745af commit ab185bb
Showing 3 changed files with 93 additions and 73 deletions.
158 changes: 89 additions & 69 deletions finetune.ipynb
@@ -50,7 +50,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Data needs to be in `jsonl` format with each line containing a json encoded string containing two keys `prompt` and `completion`\n",
"We support two different data formats:\n",
"\n",
"### `Chat`\n",
"\n",
"Data needs to be in `jsonl` format with each line containing a whole conversation in OpenAI chat format i.e. each line contains a key called `messages`. Each `messages` key contains a list of messages, where each message is a dictionary with `role` and `content` keys. The `role` key can be either `user`, `assistant` or `system` and the `content` key contains the message content.\n",
"\n",
"```jsonl\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"What's the capital of France?\"}, {\"role\": \"assistant\", \"content\": \"Paris\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Paris, as if everyone doesn't know that already.\"}]}\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"Who wrote 'Romeo and Juliet'?\"}, {\"role\": \"assistant\", \"content\": \"William Shakespeare\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Oh, just some guy named William Shakespeare. Ever heard of him?\"}]}\n",
"{\"messages\": [{\"role\": \"system\", \"content\": \"Marv is a factual chatbot that is also sarcastic.\"}, {\"role\": \"user\", \"content\": \"How far is the Moon from Earth?\"}, {\"role\": \"assistant\", \"content\": \"384,400 kilometers\"}, {\"role\": \"user\", \"content\": \"Can you be more sarcastic?\"}, {\"role\": \"assistant\", \"content\": \"Around 384,400 kilometers. Give or take a few, like that really matters.\"}]}\n",
"...\n",
"```\n",
"\n",
"\n",
"Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n",
"\n",
"\n",
"### `Completion`\n",
"Data needs to be in `jsonl` format with each line containing a json encoded string containing two keys `prompt` and `completion`.\n",
"\n",
"```jsonl\n",
"{\"prompt\": \"What is 2 + 2?\", \"completion\": \"The answer to 2 + 2 is 4\"}\n",
@@ -59,7 +77,11 @@
"...\n",
"```\n",
"\n",
"Once you have your data on `.jsonl` files, you can upload them to the file tree on the left and change the `train_data` and `eval_data` variables in the `Data Parameters` section\n",
"Reference: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset\n",
"\n",
"### Uploading data to notebook\n",
"\n",
"Once you have your data on `.jsonl` files, you can upload them to the file tree on the left and change the `train_data_uri` and `eval_data_uri` variables in the `Data Parameters` section\n",
"\n",
"![Upload Data](./assets/upload-data.png)\n",
"\n",
@@ -97,33 +119,34 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: Only first 100 data points will be used. This is okay for quick testing. To use all data points please set `max_num_samples` to 0\n"
]
}
],
"outputs": [],
"source": [
"from typing import List, Dict, Optional, Any\n",
"from data_utils import DatasetType\n",
"\n",
"# Type of dataset - Either `completion` or `chat`\n",
"dataset_type = DatasetType.completion\n",
"\n",
"# URI to training data. Can be a file on disk or an mlfoundry artifact fqn\n",
"train_data = \"./standford_alpaca_train_49k.jsonl\"\n",
"train_data_uri: str = \"./standford_alpaca_train_49k.jsonl\"\n",
"\n",
"# URI to evaluation data. Can be a file on disk or an mlfoundry artifact fqn. \n",
"# Set to \"NA\" if you want to split from train data\n",
"eval_data = \"./standford_alpaca_test_2k.jsonl\"\n",
"# Set to \"None\" if you want to split from train data\n",
"eval_data_uri: Optional[str] = \"./standford_alpaca_test_2k.jsonl\"\n",
"\n",
"# When eval_data is set to \"NA\", use this portion of the train_data to use as eval\n",
"# When eval_data is set to `None`, use this portion of the train_data to use as eval\n",
"eval_size = 0.1\n",
"\n",
"# How many samples to use for training. 0 means all data. Useful to test quickly\n",
"max_num_samples = 0\n",
"# If your dataset is small (< 10 examples), set this to False\n",
"sample_packing = True\n",
"\n",
"if max_num_samples != 0:\n",
" print(f\"Note: Only first {max_num_samples} data points will be used. This is okay for quick testing. To use all data points please set `max_num_samples` to 0\")"
"# How many steps to use for training. None means all data. Useful to test quickly\n",
"max_steps: Optional[int] = None\n",
"\n",
"if max_steps is not None:\n",
" print(f\"Note: max_steps is set, this might not use the entire training data. This is okay for quick testing. To use all data points please set `max_steps` to `None`\")"
]
},
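As a quick sanity check on the values above, one might count examples and peek at the first record; a small sketch, assuming the training file is on local disk:

```python
import json

# Peek at the first record and count examples in the training file.
with open(train_data_uri) as f:
    first = json.loads(f.readline())
    total = 1 + sum(1 for _ in f)

print(f"{total} examples; first record keys: {list(first)}")
if eval_data_uri is None:
    print(f"~{int(total * eval_size)} examples would be split off for eval")
```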
{
@@ -158,9 +181,6 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"\n",
"# Huggingface hub model id to finetune e.g. \"stas/tiny-random-llama-2\"\n",
"# If you created this notebook instance from Truefoundry's Model Catalogue, the model id will be set in `launch_parameters`\n",
"model_id = launch_parameters.model_id\n",
@@ -182,21 +202,19 @@
"metadata": {},
"outputs": [],
"source": [
"# Enable LoRa with Quantization\n",
"use_qlora = True\n",
"adapter = \"qlora\"\n",
"\n",
"# If you want to disable quantization, set `use_qlora` to False and set `use_lora` to True\n",
"use_lora = False\n",
"\n",
"# qlora r. Increasing this will increase GPU memory requirement and training time but can give better results\n",
"# lora r. Increasing this will increase GPU memory requirement and training time but can give better results\n",
"lora_r = 32\n",
"\n",
"# qlora alpha\n",
"# lora alpha\n",
"lora_alpha = max(16, 2 * lora_r)\n",
"\n",
"# Whether to apply Lora to all linear layers\n",
"lora_target_linear = True\n",
"\n",
"if use_qlora and use_lora:\n",
" raise ValueError(\"Both `use_qlora` and `use_lora` cannot be True at the same time!\")"
"# The names of the modules to apply Lora to. These will be added to modules found by `lora_target_linear` if that is enabled\n",
"lora_target_modules: Optional[List[str]] = None"
]
},
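For intuition on what `lora_r` costs: a LoRA adapter on a linear layer of shape `(d_in, d_out)` adds two low-rank matrices, `A (d_in x r)` and `B (r x d_out)`. A back-of-the-envelope sketch with assumed, illustrative dimensions (not read from any model):

```python
# LoRA adds r * (d_in + d_out) trainable parameters per adapted linear layer.
d_in, d_out = 4096, 4096  # assumed hidden sizes, roughly a 7B-scale attention projection
r = 32                    # lora_r from the cell above

adapter_params = r * (d_in + d_out)  # 262,144 trainable parameters
full_params = d_in * d_out           # 16,777,216 frozen parameters in the base layer
print(f"adapter is {adapter_params / full_params:.2%} of the layer")  # ~1.56%
```

Doubling `lora_r` doubles the adapter size; the `lora_alpha = max(16, 2 * lora_r)` default keeps the effective adapter scaling (`lora_alpha / lora_r`) at 2 for any `lora_r` of 8 or more.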
{
@@ -213,31 +231,31 @@
"outputs": [],
"source": [
"# Where to dump checkpoints and model\n",
"output_dir = \"./model\"\n",
"output_dir = \"./outputs\"\n",
"\n",
"# If to delete `output_dir` before starting\n",
"cleanup_output_dir_on_start = False\n",
"\n",
"# Max Sequence Length. \n",
"# Increasing this will allow longer sequences but will significantly increase GPU memory requirement and training time.\n",
"# This cannot be greater than model's max sequence length\n",
"max_length = launch_parameters.max_length\n",
"max_sequence_length = launch_parameters.max_length\n",
"\n",
"# Max batch size per GPU. \n",
"# Batch size per GPU. \n",
"# Increasing this will increase GPU memory requirement and training time\n",
"per_device_train_batch_size = launch_parameters.batch_size\n",
"micro_batch_size = launch_parameters.batch_size\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.00003\n",
"\n",
"# How many epochs to run training for\n",
"num_train_epochs = 10\n",
"num_epochs = 10\n",
"\n",
"# How often to evaluate. Value less than 1 denotes every X% of total run\n",
"eval_steps = 0.05\n",
"eval_steps = 0.1\n",
"\n",
"# How often to save checkpoints. Value less than 1 denotes every X% of total run\n",
"save_steps = 0.05"
"save_steps = 0.1"
]
},
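The global batch size follows from `micro_batch_size` together with the `--gradient_accumulation_steps 4` passed in the launch command below; a small sketch, assuming a single GPU:

```python
# Effective batch size = per-GPU micro batch x gradient accumulation steps x number of GPUs.
gradient_accumulation_steps = 4  # matches the --gradient_accumulation_steps flag below
num_gpus = 1                     # assumption; accelerate uses all visible GPUs

effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_gpus
print(f"Effective batch size: {effective_batch_size}")
```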
{
@@ -253,10 +271,11 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from mlfoundry_utils import generate_run_name, get_or_create_run\n",
"\n",
"# Enable reporting metrics to mlfoundry\n",
"mlfoundry_enable_reporting = True\n",
"mlfoundry_enable_reporting = False\n",
"\n",
"# Which ML Repo to log metrics and checkpoints to. \n",
"# You can create new ML Repos from the https://<your-org>.truefoundry.cloud/mlfoundry page\n",
Expand All @@ -267,7 +286,7 @@
"mlfoundry_log_checkpoints = True\n",
"\n",
"# Run to which metrics and checkpoints will be logged\n",
"mlfoundry_run_name = generate_run_name(model_id)\n",
"mlfoundry_run_name = generate_run_name(model_id, seed=os.getpid())\n",
"\n",
"# If to upload checkpoints to ML Repo when they are saved\n",
"mlfoundry_checkpoint_artifact_name = f\"ckpt-{mlfoundry_run_name}\"\n",
@@ -295,7 +314,7 @@
" from urllib.parse import urljoin\n",
" from tensorboard import notebook\n",
"\n",
" tb_logs = os.path.join(\".\", \"tensorboard_logs\")\n",
" tb_logs = os.path.join(os.path.abspath(output_dir), \"model\", \"runs\")\n",
" os.makedirs(tb_logs, exist_ok=True)\n",
" os.environ[\"TENSORBOARD_PROXY_URL\"] = urljoin(os.getenv(\"NB_PREFIX\", \"/\"), \"proxy/%PORT%/\")\n",
" notebook.start(f\"--logdir {tb_logs} --reload_interval 30.0 --reload_multifile True\")\n",
@@ -317,55 +336,56 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"\n",
"# Mixed Precision Training. We automatically select the precision based on GPU capability\n",
"mixed_precision = \"bf16\" if torch.cuda.is_bf16_supported() else \"fp16\"\n",
"bf16 = (mixed_precision == \"bf16\")\n",
"fp16 = (mixed_precision == \"fp16\")\n",
"\n",
"COMMAND = f\"\"\"\n",
"accelerate launch \\\n",
"--mixed_precision {mixed_precision} \\\n",
"--use_deepspeed \\\n",
"train.py \\\n",
"config-base.yaml \\\n",
"--deepspeed ./deepspeed_configs/3_ds_z2_config.json \\\n",
"--bf16 {bf16} \\\n",
"--fp16 {fp16} \\\n",
"--model_id {model_id} \\\n",
"--flash_attention True \\\n",
"--gradient_checkpointing True \\\n",
"--base_model {model_id} \\\n",
"--output_dir {output_dir} \\\n",
"--train_data {train_data} \\\n",
"--eval_data {eval_data} \\\n",
"--eval_size {eval_size} \\\n",
"--max_num_samples {max_num_samples} \\\n",
"--train_on_prompt False \\\n",
"--max_length {max_length} \\\n",
"--use_qlora {use_qlora} \\\n",
"--use_lora {use_lora} \\\n",
"--qlora_bit_length 4 \\\n",
"--lora_target_modules auto \\\n",
"--dataset_type {dataset_type} \\\n",
"--train_data_uri {train_data_uri} \\\n",
"--val_data_uri {eval_data_uri} \\\n",
"--val_set_size {eval_size} \\\n",
"--max_steps {max_steps} \\\n",
"--sequence_len {max_sequence_length} \\\n",
"--train_on_inputs False \\\n",
"--sample_packing {sample_packing} \\\n",
"--pad_to_sequence_len True \\\n",
"--num_epochs {num_epochs} \\\n",
"--micro_batch_size {micro_batch_size} \\\n",
"--learning_rate {learning_rate} \\\n",
"--warmup_ratio 0.1 \\\n",
"--gradient_accumulation_steps 4 \\\n",
"--early_stopping_patience 10 \\\n",
"--adapter qlora \\\n",
"--lora_target_linear {lora_target_linear} \\\n",
"--lora_target_modules {lora_target_modules} \\\n",
"--lora_r {lora_r} \\\n",
"--lora_alpha {lora_alpha} \\\n",
"--lora_dropout 0.05 \\\n",
"--lora_bias none \\\n",
"--num_train_epochs {num_train_epochs} \\\n",
"--early_stopping_patience 10 \\\n",
"--early_stopping_threshold 0.0 \\\n",
"--auto_find_batch_size false \\\n",
"--per_device_train_batch_size {per_device_train_batch_size} \\\n",
"--per_device_eval_batch_size {per_device_train_batch_size} \\\n",
"--gradient_accumulation_steps 4 \\\n",
"--learning_rate {learning_rate} \\\n",
"--logging_strategy steps \\\n",
"--logging_steps 5 \\\n",
"--evaluation_strategy steps \\\n",
"--eval_steps {eval_steps} \\\n",
"--save_strategy steps \\\n",
"--save_steps {save_steps} \\\n",
"--seed 42 \\\n",
"--mlfoundry_enable_reporting {mlfoundry_enable_reporting} \\\n",
"--mlfoundry_ml_repo {mlfoundry_ml_repo} \\\n",
"--mlfoundry_run_name {mlfoundry_run_name} \\\n",
"--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n",
"--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n",
"--cleanup_output_dir_on_start False \\\n",
"--cleanup_output_dir_on_start {cleanup_output_dir_on_start} \\\n",
"--resume_from_checkpoint True \\\n",
"| tee train.log\n",
"\"\"\"\n",
@@ -382,7 +402,7 @@
},
"outputs": [],
"source": [
"!{COMMAND} "
"!{COMMAND}"
]
}
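Since `COMMAND` is built by f-string interpolation, printing the resolved tokens before launching can catch empty or misquoted values; a minimal standard-library sketch:

```python
import shlex

# Preview the fully interpolated launch command, one token per line.
for token in shlex.split(COMMAND):
    print(token)
```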
],
6 changes: 3 additions & 3 deletions mlfoundry_utils.py
@@ -184,12 +184,12 @@ def sanitize_name(value):
return re.sub(rf"[{re.escape(string.punctuation)}]+", "-", value.encode("ascii", "ignore").decode("utf-8"))


def generate_run_name(model_id):
def generate_run_name(model_id, seed: Optional[int] = None):
*_, model_name = model_id.split("/", 1)
sanitized_model_name = sanitize_name(model_name)
alphabet = string.ascii_lowercase + string.digits
random.choices(alphabet, k=8)
random_id = "".join(random.choices(alphabet, k=6))
rng = random.Random(seed) if seed is not None else random
random_id = "".join(rng.choices(alphabet, k=6))
run_name = f"ft-{sanitized_model_name}-{random_id}"
return run_name

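With the optional seed threaded through a local `random.Random`, the run name becomes reproducible without touching the global RNG state; an illustrative usage (the printed suffix is made up):

```python
# Same model_id and seed -> same run name; the suffix is 6 lowercase/digit characters.
name_a = generate_run_name("stas/tiny-random-llama-2", seed=1234)
name_b = generate_run_name("stas/tiny-random-llama-2", seed=1234)
assert name_a == name_b  # deterministic for a fixed seed

print(name_a)  # e.g. "ft-tiny-random-llama-2-k3x9p2" (illustrative)
```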
2 changes: 1 addition & 1 deletion utils.py
@@ -110,7 +110,7 @@ class Config:
extra = "ignore"

model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
max_length: Optional[int] = None
max_length: Optional[int] = 2048
batch_size: int = 1


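The changed default means launch parameters that omit `max_length` now fall back to a 2048-token context instead of `None`. A minimal sketch, assuming `Config` is a pydantic-style model (the inner `extra = "ignore"` suggests as much); direct construction here is hypothetical:

```python
cfg = Config()                              # hypothetical direct construction
print(cfg.max_length)                       # 2048, the new default
print(Config(max_length=4096).max_length)   # explicitly passed values still win -> 4096
```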
