# Fine-tuning an LLM on the MLRun documentation

In [1]:
import mlrun
import os

In [3]:
# Load the MLRun project (or create it on first use). With user_project=True the
# resolved project name carries the user as a suffix; it appears as
# "learn-docs-dev-admin" in the run tables below.
# TODO: Create project with better name:
project = mlrun.get_or_create_project("learn-docs-dev", user_project=True)
# Image used by every function in this project that does not set its own.
# TODO: Change to more exact image with appropriate tag:
project.set_default_image("yonishelach/mlrun:ds-debug-u")
# Function code is taken from the `main` branch of this git repository.
# NOTE(review): pull_at_runtime=True presumably fetches the source when a run
# starts rather than baking it into the image - confirm against the MLRun docs.
project.set_source("git://github.com/yonishelach/learn-docs.git#main", pull_at_runtime=True)

> 2023-05-07 10:10:44,743 [info] loaded project learn-docs-dev from MLRun DB


In [4]:
# Absolute path of the project context directory. The data, training-output and
# DeepSpeed-config paths used later in the notebook are built from it.
base_path = os.path.abspath(project.context)
base_path

'/User/learn-docs'

## Prepare MLRun documentation data

### Download docs

In [22]:
# Register the documentation downloader as a job function of the project.
# Its default handler is `download_all_files` from src/download_docs.py.
fetch_docs = project.set_function(
    "src/download_docs.py",
    name="docs-downloader",
    handler="download_all_files",
    kind="job",
)
# NOTE(review): auto_mount() presumably attaches the platform's shared storage so
# the job can write under `base_path` - confirm for the target cluster.
fetch_docs.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fa5b6f05b50>

In [27]:
# Directory that receives the downloaded documentation files.
target_dir = os.path.join(base_path, "data/mlrun_docs")
# Root page whose links are downloaded. An alternative documentation source
# that was left disabled:
# url = 'https://pandas.pydata.org/docs/user_guide/'
url = 'https://docs.mlrun.org/en/stable/'

In [28]:
# Run the downloader job; the resulting documentation archive is logged under
# the "docs_dir" output key.
fetch_docs_run = mlrun.run_function(
    function="docs-downloader",
    outputs=["docs_dir"],
    params={
        "target_dir": target_dir,
        "url": url,
        "html_to_text": True,
    },
)

> 2023-05-07 10:39:12,048 [info] Storing function: {'name': 'docs-downloader-download-all-files', 'uid': 'd1d24bb9fa8447f4aa1c16139166204c', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 10:39:12,289 [info] Job is running in the background, pod: docs-downloader-download-all-files-dqsnt
Found 127 links in https://docs.mlrun.org/en/stable/
wrote file: architecture.txt
wrote file: install.txt
wrote file: tutorial_index.txt
wrote file: tutorial_07-batch-infer.txt
wrote file: install_remote.txt
wrote file: install_kubernetes.txt
wrote file: projects_run-build-deploy.txt
wrote file: projects_project.txt
wrote file: tutorial_03-model-serving.txt
wrote file: install_local-docker.txt
wrote file: projects_ci-integration.txt
wrote file: projects_build-run-workflows-pipelines.txt
wrote file: runtimes_functions.txt
wrote file: projects_create-project.txt
wrote file: runtimes_functions-architecture.txt
wrote file: runtimes_dask-pipeline.txt
wrote file: runtimes_dask-overview.txt
wrote file: runtimes_s

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...9166204c,0,May 07 10:39:28,completed,docs-downloader-download-all-files,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=docs-downloader-download-all-files-dqsnt,,target_dir=/User/learn-docs/data/mlrun_docsurl=https://docs.mlrun.org/en/stable/html_to_text=True,,docs_dir





> 2023-05-07 10:39:46,497 [info] run executed, status=completed: {'name': 'docs-downloader-download-all-files'}


In [29]:
# Show the artifact URI under which the downloaded docs archive was stored.
fetch_docs_run.outputs["docs_dir"]

'v3io:///projects/learn-docs-dev-admin/artifacts/docs-downloader-download-all-files/0/docs_dir.zip'

In [35]:
# Take the archive URI from the download run itself instead of pasting it in.
# The pasted value embedded the user-specific project name
# ("learn-docs-dev-admin"), so it only resolved for that one user.
docs_dir = fetch_docs_run.outputs["docs_dir"]

### Prepare docs dataset

In [38]:
# Register the dataset-preparation job function. Its default handler is
# `prepare_dataset` from src/data_prep.py.
data_preparation = project.set_function(
    "src/data_prep.py",
    name="data-prep",
    handler="prepare_dataset",
    kind="job",
)
# NOTE(review): auto_mount() presumably attaches the platform's shared storage so
# the job can write under `base_path` - confirm for the target cluster.
data_preparation.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fa5b66ce610>

In [39]:
# NOTE: `target_dir` is rebound here; from this cell on it refers to the dataset
# directory, no longer to the downloaded-docs directory.
target_dir = os.path.join(base_path, "data/mlrun_dataset")
# Build the train/test datasets from the downloaded documentation archive.
# NOTE(review): "additional_params" is requested as an output, but the run table
# below lists only train_dataset and test_dataset as artifacts - confirm whether
# the handler still produces it.
prepare_dataset_run = mlrun.run_function(
    function="data-prep",
    params={
        "target_dir": target_dir,
        "ignored_files": ["contents", "genindex"],
        "from_text": True,
    },
    inputs={"docs_source": docs_dir},
    outputs=["train_dataset", "test_dataset", "additional_params"],
)

> 2023-05-07 10:57:32,672 [info] Storing function: {'name': 'data-prep-prepare-dataset', 'uid': 'feb75f95a1174d2aa00b3fb92cac0736', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 10:57:32,931 [info] Job is running in the background, pod: data-prep-prepare-dataset-mc6st
Added to dataset: api_index
Added to dataset: api_mlrun.artifacts
Added to dataset: api_mlrun.config
Added to dataset: api_mlrun.datastore
Added to dataset: api_mlrun.db
Added to dataset: api_mlrun.execution
Added to dataset: api_mlrun.feature_store
Added to dataset: api_mlrun.frameworks_index
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.auto_mlrun
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.lgbm
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.pytorch
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.sklearn
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.tf_keras
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.xgboost
Added to dataset: api_mlrun.model
Added to dataset: a

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...2cac0736,0,May 07 10:57:48,completed,data-prep-prepare-dataset,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=data-prep-prepare-dataset-mc6st,docs_source,"target_dir=/User/learn-docs/data/mlrun_datasetignored_files=['contents', 'genindex']from_text=True",,train_datasettest_dataset





> 2023-05-07 10:57:57,129 [info] run executed, status=completed: {'name': 'data-prep-prepare-dataset'}


In [40]:
# Use the datasets produced by the data-prep run above. The previously pasted
# URIs pointed at a different project ("deepspeed-admin"), so training did not
# consume the artifacts this notebook had just created.
train_dataset = prepare_dataset_run.outputs["train_dataset"]
test_dataset = prepare_dataset_run.outputs["test_dataset"]

## Test a pretrained LLM from the Hugging Face Hub without fine-tuning

In [41]:
# Register the model-testing job function. Its default handler is
# `load_my_model` from src/test_model.py.
test_pretrained_llm = project.set_function(
    "src/test_model.py",
    name="test-llm",
    handler="load_my_model",
    kind="job",
)
# NOTE(review): auto_mount() presumably attaches the platform's shared storage to
# the job pod - confirm for the target cluster.
test_pretrained_llm.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fa5b66ae940>

In [42]:
# Baseline: run the test job on the stock "gpt2" model, before any fine-tuning.
mlrun.run_function(function="test-llm", params={"model_name": "gpt2"})

> 2023-05-07 10:59:36,018 [info] Storing function: {'name': 'test-llm-load-my-model', 'uid': 'ad0700a3c0bf40a7b7a4bc4967368818', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 10:59:36,268 [info] Job is running in the background, pod: test-llm-load-my-model-wt9km
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 20.3MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 12.8MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 225kB/s]
Downloading pytorch_model.bin: 100%|██████████| 548M/548M [00:01<00:00, 482MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 17.3kB/s]
Generated response: What is mlrun?

mlrun is a Python library for running Python code in a Python interpreter. It is a Python wrapper around the standard library.

It is a Python wrapper around the standard library. It is a Python wrapper around the standard library. It is a Python wrapper around the standa

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...67368818,0,May 07 10:59:51,completed,test-llm-load-my-model,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=test-llm-load-my-model-wt9km,,model_name=gpt2,,





> 2023-05-07 11:01:02,758 [info] run executed, status=completed: {'name': 'test-llm-load-my-model'}


<mlrun.model.RunObject at 0x7fa5b6e714f0>

## Fine-tune this LLM with the MLRun dataset

In [43]:
# Register the trainer as an MPI job so that training runs on several workers.
# NOTE: MLRun warns that underscores in names are being deprecated and stores
# this function as "hugging-face-classifier-trainer" (see the output below).
# The run cells still refer to it by the underscore name, so rename all of
# them together if this name is ever changed.
trainer = project.set_function(
    "src/trainer.py",
    name="hugging_face_classifier_trainer",
    kind="mpijob",
    with_repo=True,
)
# Four replicas; the job log below shows workers worker-0 .. worker-3.
trainer.spec.replicas = 4
# Resource requests and limits for the job (one GPU in the limits).
# NOTE(review): presumably applied per worker pod - confirm.
trainer.with_requests(cpu=5, mem="5Gi")
trainer.with_limits(gpus=1, cpu=8, mem="50Gi")


trainer.apply(mlrun.auto_mount())
# Persist the function definition; returns its DB URI.
trainer.save()

Names with underscore '_' are about to be deprecated, use dashes '-' instead. Replacing underscores with dashes.


'db://learn-docs-dev-admin/hugging-face-classifier-trainer'

In [44]:
# Parameters for fine-tuning GPT-2 on the MLRun documentation dataset.
# NOTE(review): keys prefixed with "TRAIN_" are presumably forwarded by
# src/trainer.py to the Hugging Face training arguments - confirm there.
params = {
    # Model / tokenizer selection.
    "model_name": "gpt2-ft-mlrun",
    "pretrained_model": "gpt2",
    "pretrained_tokenizer": "gpt2",
    "model_class": "transformers.GPT2LMHeadModel",
    "tokenizer_class": "transformers.GPT2Tokenizer",
    "random_state": 42,
    # Training output directory; the same path is reused later as `temp_model`.
    "TRAIN_output_dir": os.path.join(base_path, "finetuning-mlrun-data"),
    "TRAIN_overwrite_output_dir": True,
    "TRAIN_num_train_epochs": 2,
    # fp16 is enabled and bf16 is disabled for this run.
    "TRAIN_fp16": True,
    "TRAIN_bf16": False,
    "TRAIN_per_device_train_batch_size": 4,
    "TRAIN_per_device_eval_batch_size": 4,
    "TRAIN_save_steps": 10000,
    "TRAIN_save_total_limit": 2,
    # DeepSpeed configuration file kept under the project directory.
    "TRAIN_deepspeed": os.path.join(base_path, "deepspeed_config/stage3.json"),
}

In [45]:
# Launch fine-tuning on the registered MPI trainer function, calling its
# `train` handler with the MLRun-docs train and test datasets.
training_run = mlrun.run_function(
    function="hugging_face_classifier_trainer",
    name="trainer",
    handler="train",
    params=params,
    inputs={"dataset": train_dataset, "test_set": test_dataset},
)

> 2023-05-07 11:01:02,902 [info] Storing function: {'name': 'trainer', 'uid': 'c254cab12e9c447190295db829ca58e2', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 11:01:31,536 [info] MpiJob trainer-f8a2010b launcher pod trainer-f8a2010b-launcher state active
+ POD_NAME=trainer-f8a2010b-worker-0
+ shift
+ /opt/kube/kubectl exec trainer-f8a2010b-worker-0 -- /bin/sh -c  orted -mca ess "env" -mca ess_base_jobid "3238461440" -mca ess_base_vpid 1 -mca ess_base_num_procs "5" -mca orte_node_regex "trainer-f[1:8]a2010b-launcher,trainer-f[1:8]a2010b-worker-0,trainer-f[1:8]a2010b-worker-1,trainer-f[1:8]a2010b-worker-2,trainer-f[1:8]a2010b-worker-3@0(5)" -mca orte_hnp_uri "3238461440.0;tcp://192.168.139.248:39719" -mca plm "rsh" --tree-spawn -mca routed "radix" -mca orte_parent_uri "3238461440.0;tcp://192.168.139.248:39719" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca pmix "^s1,s2,cray,isolated"
+ POD_NAME=trainer-f8a2010b-worker-3
+ shift
+ /opt/kube/kub

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...29ca58e2,0,May 07 11:01:36,completed,trainer,v3io_user=adminkind=mpijobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16mlrun/job=trainer-f8a2010bhost=trainer-f8a2010b-worker-0,datasettest_set,model_name=gpt2-ft-mlrunpretrained_model=gpt2pretrained_tokenizer=gpt2model_class=transformers.GPT2LMHeadModeltokenizer_class=transformers.GPT2Tokenizerrandom_state=42TRAIN_output_dir=/User/learn-docs/finetuning-mlrun-dataTRAIN_overwrite_output_dir=TrueTRAIN_num_train_epochs=2TRAIN_fp16=TrueTRAIN_bf16=FalseTRAIN_per_device_train_batch_size=4TRAIN_per_device_eval_batch_size=4TRAIN_save_steps=10000TRAIN_save_total_limit=2TRAIN_deepspeed=/User/learn-docs/deepspeed_config/stage3.json,train_runtime=16.752train_samples_per_second=16.118train_steps_per_second=1.075total_flos=190381457408.0train_loss=2.9049479166666665,tokenizermodel





> 2023-05-07 11:04:34,780 [info] run executed, status=completed: {'name': 'trainer'}


### Test our fine-tuned model

In [46]:
# Resolve the fine-tuned model and tokenizer from the training run instead of
# pasting URIs. The pasted values pointed at a different project
# ("deepspeed-admin"). The keys below are the artifact names listed for the
# "trainer" run in the run table above (tokenizer, model).
model_name = training_run.outputs["model"]
tokenizer_name = training_run.outputs["tokenizer"]

In [46]:
# Local directory of the fine-tuned model: the same path that was passed to the
# docs fine-tuning run as "TRAIN_output_dir".
temp_model = os.path.join(base_path, "finetuning-mlrun-data")

In [27]:
# Test the fine-tuned model. This cell used to pass the stock "gpt2" again,
# which only repeated the baseline test instead of exercising the fine-tuned
# weights saved under `temp_model`.
# NOTE(review): assumes `load_my_model` accepts a local model directory in
# "model_name" the same way it accepts a hub id - confirm in src/test_model.py.
mlrun.run_function(function="test-llm", params={"model_name": temp_model})

> 2023-05-07 09:34:08,918 [info] Storing function: {'name': 'test-llm-load-my-model', 'uid': '9dd0a9d8dd634797a6e3779115cbdcec', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 09:34:09,167 [info] Job is running in the background, pod: test-llm-load-my-model-xjzwf


KeyboardInterrupt: 

## Train on the Databricks Dolly dataset

In [None]:
# Register the Dolly preprocessing job function. Its default handler is
# `preprocess_dolly` from src/data_prep_dolly.py.
prepare_dolly_dataset = project.set_function(
    "src/data_prep_dolly.py",
    name="data-prep-dolly",
    handler="preprocess_dolly",
    kind="job",
)
# NOTE(review): auto_mount() presumably attaches the platform's shared storage to
# the job pod - confirm for the target cluster.
prepare_dolly_dataset.apply(mlrun.auto_mount())

In [28]:
# Preprocess the Dolly dataset with the GPT-2 tokenizer. The run logs the
# train_dolly_dataset and test_dolly_dataset artifacts (see the run table below).
# The handler given here is the same one the function was registered with.
prepare_dolly_run = mlrun.run_function(
    function="data-prep-dolly",
    params={"pretrained_tokenizer_name_or_path": "gpt2"},
    handler="preprocess_dolly",
)

> 2023-05-07 09:34:25,375 [info] Storing function: {'name': 'data-prep-dolly-preprocess-dolly', 'uid': 'd9a282660b304544bbcf86c3c19fee25', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 09:34:25,510 [info] Job is running in the background, pod: data-prep-dolly-preprocess-dolly-pcjfp
> 2023-05-07 09:34:42,739 [info] Loading tokenizer for gpt2
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 93.8kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 20.4MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 6.04MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 22.5MB/s]
> 2023-05-07 09:34:43,680 [info] Loading dataset from databricks/databricks-dolly-15k
Downloading readme: 100%|██████████| 7.70k/7.70k [00:00<00:00, 7.47MB/s]
Downloading and preparing dataset json/databricks--databricks-dolly-15k to /root/.cache/huggingface/datasets/databricks___json/databricks--databri

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...c19fee25,0,May 07 09:34:41,completed,data-prep-dolly-preprocess-dolly,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=data-prep-dolly-preprocess-dolly-pcjfp,,pretrained_tokenizer_name_or_path=gpt2,,train_dolly_datasettest_dolly_dataset





> 2023-05-07 09:35:33,977 [info] run executed, status=completed: {'name': 'data-prep-dolly-preprocess-dolly'}


In [29]:
# Show the artifact URIs produced by the Dolly preprocessing run.
prepare_dolly_run.outputs

{'train_dolly_dataset': 'store://artifacts/learn-docs-dev-admin/data-prep-dolly-preprocess-dolly_train_dolly_dataset:d9a282660b304544bbcf86c3c19fee25',
 'test_dolly_dataset': 'store://artifacts/learn-docs-dev-admin/data-prep-dolly-preprocess-dolly_test_dolly_dataset:d9a282660b304544bbcf86c3c19fee25'}

In [47]:
# Read the Dolly dataset URIs from the preprocessing run. The pasted values were
# pinned to one user's project and one specific run uid, so they went stale as
# soon as the preprocessing step was re-run or another user ran the notebook.
train_dolly_dataset = prepare_dolly_run.outputs["train_dolly_dataset"]
test_dolly_dataset = prepare_dolly_run.outputs["test_dolly_dataset"]

In [50]:
# Parameters for the second fine-tuning stage on the Dolly dataset. It starts
# from the model saved by the docs fine-tuning run (`temp_model`).
# NOTE: this rebinds `params`, replacing the docs fine-tuning parameters.
params= {
    "pretrained_model": temp_model,
    "pretrained_tokenizer": temp_model,
    "model_class": "transformers.AutoModelForCausalLM",
    "tokenizer_class": "transformers.AutoTokenizer",
    "require_tokenization": False,
    # NOTE(review): the config file name suggests bf16, but "TRAIN_bf16" below is
    # False - confirm the two agree.
    "TRAIN_deepspeed": os.path.join(base_path, "deepspeed_config/ds_z3_bf16_config.json"),
    "TRAIN_num_train_epochs": 2,
    # Relative path, unlike the docs run, which used a directory under base_path.
    "TRAIN_output_dir": "dolly_output",
    "TRAIN_per_device_train_batch_size": 6,
    "TRAIN_per_device_eval_batch_size": 6,
    "TRAIN_logging_steps": 10,
    "TRAIN_save_steps": 200,
    "TRAIN_save_total_limit": 20,
    "TRAIN_eval_steps": 50,
    "TRAIN_warmup_steps": 50,
    "TRAIN_learning_rate": 5e-6,
    "TRAIN_gradient_checkpointing": True,
    # NOTE(review): a boolean is passed here; if this is forwarded to an integer
    # rank argument, True acts as rank 1 - confirm this is intended.
    "TRAIN_local_rank": True,
    "TRAIN_bf16": False,
    "CLASS_ignore_mismatched_sizes": True,
    "data_collator": "src.data_collator.DataCollatorForCompletionOnlyLM",
    "DC_mlm": False,
    "DC_return_tensors": "pt",
    # NOTE(review): unlike the other data-collator options this key has no "DC_"
    # prefix - confirm src/trainer.py routes it to the collator.
    "pad_to_multiple_of": 8,

}

In [None]:
 mlrun.run_function(
    function="hugging_face_classifier_trainer",
    name="trainer",
    inputs={
            "dataset": train_dolly_dataset,  # prepare_dolly_run.outputs["train_dolly_dataset"],
            "test_set": test_dataset  # prepare_dolly_run.outputs["test_dolly_dataset"],
        },
    params=params,
    handler="train",
)

### Test complete model