# Fine-tuning an LLM on the MLRun documentation

In [1]:
import mlrun
import os

In [2]:
# TODO: choose a more descriptive project name.
# The log output of this cell shows the project saved as "learn-docs-dev-admin".
project = mlrun.get_or_create_project(
    "learn-docs-dev",
    user_project=True,
)

# TODO: replace this image with one pinned to an appropriate tag.
project.set_default_image("yonishelach/mlrun:ds-debug-u")


> 2023-05-07 05:04:52,844 [info] Created and saved project learn-docs-dev-admin: {'from_template': None, 'overwrite': False, 'context': './', 'save': True}
> 2023-05-07 05:04:52,846 [info] created project learn-docs-dev and saved in MLRun DB


In [3]:
# Absolute path of the project's context directory. Later cells join data,
# output and DeepSpeed config paths onto it.
base_path = os.path.abspath(project.context)
base_path

'/User/learn-docs'

## Prepare MLRun documentation data

### Download docs

In [40]:
# Register the docs downloader script on the project as a job whose entry
# point is `download_all_files`.
fetch_docs = project.set_function(
    "src/download_docs.py",
    kind="job",
    handler="download_all_files",
    name="docs-downloader",
)

# Apply MLRun's auto-mount modifier (presumably so the job can write under the
# /User paths seen in the run log - confirm). The returned runtime object is
# this cell's displayed value.
fetch_docs.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc676b0d880>

In [41]:
# Root of the documentation site to download.
# Alternative docs root (disabled): https://pandas.pydata.org/docs/user_guide/
url = "https://docs.mlrun.org/en/stable/"

# Directory, under the project context, passed to the downloader as its target.
target_dir = os.path.join(base_path, "data/mlrun_docs")

In [42]:
# Run the registered downloader and keep the run object so the "docs_dir"
# output can be read from it afterwards.
fetch_docs_run = mlrun.run_function(
    function="docs-downloader",
    outputs=["docs_dir"],
    params={
        "target_dir": target_dir,
        "url": url,
    },
)

> 2023-05-07 06:20:53,643 [info] Storing function: {'name': 'docs-downloader-download-all-files', 'uid': 'd4ff024cf5e340739e006ce9139bee7c', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 06:20:53,893 [info] Job is running in the background, pod: docs-downloader-download-all-files-nd86b
Found 127 links in https://docs.mlrun.org/en/stable/
wrote file: /User/learn-docs/data/mlrun_docs/architecture.html
wrote file: /User/learn-docs/data/mlrun_docs/tutorial_04-pipeline.html
wrote file: /User/learn-docs/data/mlrun_docs/install_local-docker.html
wrote file: /User/learn-docs/data/mlrun_docs/install_remote.html
wrote file: /User/learn-docs/data/mlrun_docs/projects_project.html
wrote file: /User/learn-docs/data/mlrun_docs/install_aws-install.html
wrote file: /User/learn-docs/data/mlrun_docs/projects_load-project.html
wrote file: /User/learn-docs/data/mlrun_docs/tutorial_index.html
wrote file: /User/learn-docs/data/mlrun_docs/projects_ci-integration.html
wrote file: /User/learn-docs/data/mlrun_docs

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...139bee7c,0,May 07 06:21:09,completed,docs-downloader-download-all-files,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=docs-downloader-download-all-files-nd86b,,target_dir=/User/learn-docs/data/mlrun_docsurl=https://docs.mlrun.org/en/stable/,,docs_dir





> 2023-05-07 06:21:21,107 [info] run executed, status=completed: {'name': 'docs-downloader-download-all-files'}


In [24]:
# Display the URI of the "docs_dir" output recorded by the downloader run.
fetch_docs_run.outputs["docs_dir"]

'v3io:///projects/learn-docs-dev-admin/artifacts/docs-downloader-download-all-files/0/docs_dir.zip'

In [43]:
# Read the docs archive URI from the downloader run instead of pasting a
# literal, so it stays correct when the project name, user or iteration changes.
docs_dir = fetch_docs_run.outputs["docs_dir"]

### Prepare docs dataset

In [44]:
# Register the dataset preparation script on the project as a job whose entry
# point is `prepare_dataset`.
data_preparation = project.set_function(
    "src/data_prep.py",
    kind="job",
    handler="prepare_dataset",
    name="data-prep",
)

# Apply MLRun's auto-mount modifier; the returned runtime object is this
# cell's displayed value.
data_preparation.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc676b01670>

In [45]:
# NOTE: `target_dir` is rebound here; from this point on it refers to the
# dataset directory rather than the downloaded docs directory.
target_dir = os.path.join(base_path, "data/mlrun_dataset")

# Build the datasets from the downloaded docs archive. The pages named in
# "ignored_files" are passed to the handler to be excluded.
prepare_dataset_run = mlrun.run_function(
    function="data-prep",
    inputs={"docs_source": docs_dir},
    params={
        "target_dir": target_dir,
        "ignored_files": ["contents", "genindex"],
    },
    outputs=["train_dataset", "test_dataset", "additional_params"],
)

> 2023-05-07 06:21:21,259 [info] Storing function: {'name': 'data-prep-prepare-dataset', 'uid': '4275bc1e3c8845eb9871d8330ad5a2fa', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 06:21:21,499 [info] Job is running in the background, pod: data-prep-prepare-dataset-l7w72
Added to dataset: api_index
Added to dataset: api_mlrun.artifacts
Added to dataset: api_mlrun.config
Added to dataset: api_mlrun.datastore
Added to dataset: api_mlrun.db
Added to dataset: api_mlrun.execution
Added to dataset: api_mlrun.feature_store
Added to dataset: api_mlrun.frameworks_index
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.auto_mlrun
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.lgbm
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.pytorch
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.sklearn
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.tf_keras
Added to dataset: api_mlrun.frameworks_mlrun.frameworks.xgboost
Added to dataset: api_mlrun
Added to dataset: api_mlr

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...0ad5a2fa,0,May 07 06:21:36,completed,data-prep-prepare-dataset,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=data-prep-prepare-dataset-l7w72,docs_source,"target_dir=/User/learn-docs/data/mlrun_datasetignored_files=['contents', 'genindex']",,train_datasettest_dataset





> 2023-05-07 06:21:55,708 [info] run executed, status=completed: {'name': 'data-prep-prepare-dataset'}


In [46]:
# Take the dataset URIs from the preparation run itself. The previous literals
# pointed at a different project ("deepspeed-admin"), not the project this
# notebook creates, so they could resolve to unrelated or missing artifacts.
train_dataset = prepare_dataset_run.outputs["train_dataset"]
test_dataset = prepare_dataset_run.outputs["test_dataset"]

## Test the pretrained LLM from the Hugging Face Hub without fine-tuning

In [47]:
# Register the model test script on the project as a job whose entry point is
# `load_my_model`. The same function is run again later against the
# fine-tuned model.
test_pretrained_llm = project.set_function(
    "src/test_model.py",
    kind="job",
    handler="load_my_model",
    name="test-llm",
)

# Apply MLRun's auto-mount modifier; the returned runtime object is this
# cell's displayed value.
test_pretrained_llm.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc676afd3a0>

In [48]:
# Baseline: run the test function against the stock "gpt2" model, before any
# fine-tuning. The returned run object is this cell's displayed value.
mlrun.run_function(
    function="test-llm",
    params={"model_name": "gpt2"},
)

> 2023-05-07 06:25:02,457 [info] Storing function: {'name': 'test-llm-load-my-model', 'uid': '4966edac55d34337aa7e345c8c2fe00b', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 06:25:02,706 [info] Job is running in the background, pod: test-llm-load-my-model-flhbm
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 27.9MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 11.9MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 407kB/s]
Downloading pytorch_model.bin: 100%|██████████| 548M/548M [00:01<00:00, 331MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 17.2kB/s]
Generated response: What is mlrun?

mlrun is a Python library for running Python code in a Python interpreter. It is a Python wrapper around the standard library.

It is a Python wrapper around the standard library. It is a Python wrapper around the standard library. It is a Python wrapper around the standa

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...8c2fe00b,0,May 07 06:25:18,completed,test-llm-load-my-model,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=test-llm-load-my-model-flhbm,,model_name=gpt2,,





> 2023-05-07 06:26:19,147 [info] run executed, status=completed: {'name': 'test-llm-load-my-model'}


<mlrun.model.RunObject at 0x7fc676adfc40>

## Fine-tune this LLM with MLRun dataset

In [67]:
# Register the training script as an MPIJob function. No default handler is
# set here; the handler is passed when the function is run.
trainer = project.set_function(
    "src/trainer.py",
    kind="mpijob",
    name="hugging_face_classifier_trainer",
    with_repo=True,
)

# Four replicas, each limited to one GPU.
trainer.spec.replicas = 4
trainer.with_requests(mem="5Gi", cpu=5)
trainer.with_limits(mem="50Gi", cpu=8, gpus=1)

trainer.apply(mlrun.auto_mount())

# Save the function; the returned "db://..." URI is this cell's displayed value.
trainer.save()

'db://learn-docs-dev-admin/hugging-face-classifier-trainer'

In [68]:
# Parameters for fine-tuning GPT-2 on the MLRun docs dataset.
# Keys prefixed with "TRAIN_" are presumably forwarded by src/trainer.py to the
# Hugging Face training arguments - confirm against the trainer script.
params = {
    # Model and tokenizer selection.
    "model_name": "gpt2-ft-mlrun",
    "pretrained_model": "gpt2",
    "pretrained_tokenizer": "gpt2",
    "model_class": "transformers.GPT2LMHeadModel",
    "tokenizer_class": "transformers.GPT2Tokenizer",
    "random_state": 42,
    # Output location; the same directory is loaded later to test the
    # fine-tuned model.
    "TRAIN_output_dir": os.path.join(base_path, "finetuning-mlrun-data"),
    "TRAIN_overwrite_output_dir": True,
    # Schedule and precision.
    "TRAIN_num_train_epochs": 2,
    "TRAIN_fp16": True,
    "TRAIN_bf16": False,
    "TRAIN_per_device_train_batch_size": 4,
    "TRAIN_per_device_eval_batch_size": 4,
    # Checkpointing.
    "TRAIN_save_steps": 10000,
    "TRAIN_save_total_limit": 2,
    # DeepSpeed configuration file, located under the project context.
    "TRAIN_deepspeed": os.path.join(base_path, "deepspeed_config/stage3.json"),
}

In [69]:
# Launch the fine-tuning run on the MLRun docs datasets, calling the `train`
# handler of the trainer function.
training_run = mlrun.run_function(
    function="hugging_face_classifier_trainer",
    name="trainer",
    handler="train",
    params=params,
    inputs={
        "dataset": train_dataset,
        "test_set": test_dataset,
    },
)

> 2023-05-07 07:04:12,909 [info] Storing function: {'name': 'trainer', 'uid': '12371a463d5d480381d9caab99e5e081', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 07:04:40,635 [info] MpiJob trainer-1805ace0 launcher pod trainer-1805ace0-launcher state active
+ POD_NAME=trainer-1805ace0-worker-3
+ shift
+ /opt/kube/kubectl exec trainer-1805ace0-worker-3 -- /bin/sh -c  orted -mca ess "env" -mca ess_base_jobid "3095986176" -mca ess_base_vpid 4 -mca ess_base_num_procs "5" -mca orte_node_regex "trainer-[4:1805]ace0-launcher,trainer-[4:1805]ace0-worker-0,trainer-[4:1805]ace0-worker-1,trainer-[4:1805]ace0-worker-2,trainer-[4:1805]ace0-worker-3@0(5)" -mca orte_hnp_uri "3095986176.0;tcp://192.168.139.230:39101" -mca plm "rsh" --tree-spawn -mca routed "radix" -mca orte_parent_uri "3095986176.0;tcp://192.168.139.230:39101" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca pmix "^s1,s2,cray,isolated"
+ POD_NAME=trainer-1805ace0-worker-2
+ shift
+ /opt/kube/kub

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...99e5e081,0,May 07 07:04:44,completed,trainer,v3io_user=adminkind=mpijobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16mlrun/job=trainer-1805ace0host=trainer-1805ace0-worker-0,datasettest_set,model_name=gpt2-ft-mlrunpretrained_model=gpt2pretrained_tokenizer=gpt2model_class=transformers.GPT2LMHeadModeltokenizer_class=transformers.GPT2Tokenizerrandom_state=42TRAIN_output_dir=/User/learn-docs/finetuning-mlrun-dataTRAIN_overwrite_output_dir=TrueTRAIN_num_train_epochs=2TRAIN_fp16=TrueTRAIN_bf16=FalseTRAIN_per_device_train_batch_size=4TRAIN_per_device_eval_batch_size=4TRAIN_save_steps=10000TRAIN_save_total_limit=2TRAIN_deepspeed=/User/learn-docs/deepspeed_config/stage3.json,train_runtime=16.5327train_samples_per_second=16.331train_steps_per_second=1.089total_flos=190381457408.0train_loss=2.9049479166666665,tokenizermodel





> 2023-05-07 07:07:30,829 [info] run executed, status=completed: {'name': 'trainer'}


### Test our fine-tuned model

In [46]:
# Take the model and tokenizer URIs from the training run instead of literals.
# The previous literals pointed at a different project ("deepspeed-admin").
# NOTE(review): the keys "model" and "tokenizer" are taken from the artifacts
# column of the training run summary - confirm they match what
# src/trainer.py logs.
model_name = training_run.outputs["model"]
tokenizer_name = training_run.outputs["tokenizer"]

In [72]:
# Local directory of the fine-tuned model. This is the same path that the
# training parameters pass as "TRAIN_output_dir".
temp_model = os.path.join(base_path, "finetuning-mlrun-data")

In [70]:
# Run the same test function again, this time pointing it at the fine-tuned
# model directory. The returned run object is this cell's displayed value.
mlrun.run_function(
    function="test-llm",
    params={"model_name": temp_model},
)

> 2023-05-07 07:08:44,250 [info] Storing function: {'name': 'test-llm-load-my-model', 'uid': '99ee9b47b5344138a2767c5607ccb90d', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 07:08:44,501 [info] Job is running in the background, pod: test-llm-load-my-model-jff67
Generated response: What is mlrun?

mlrun is a Python library for running Python code in a container. It is a container for running code in a container.

It is a container for running code in a container. It is a container for running code in a container.

It is a container for running code in a container. It is a container for running code in a container.

It is a container for running code in a container. It is a container for running code
Generated response: What is an MLRun function?

An MLRun function is a function that is executed when the function is called.

It is executed when the function is called with the following parameters:

The function name

The function type

The function parameters

The function parameters

The

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...07ccb90d,0,May 07 07:09:00,completed,test-llm-load-my-model,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=test-llm-load-my-model-jff67,,model_name=/User/learn-docs/finetuning-mlrun-data,,





> 2023-05-07 07:10:01,874 [info] run executed, status=completed: {'name': 'test-llm-load-my-model'}


<mlrun.model.RunObject at 0x7fc63cc7c0a0>

## Train on the Databricks Dolly dataset

In [71]:
# Register the Dolly preprocessing script on the project as a job whose entry
# point is `preprocess_dolly`.
prepare_dolly_dataset = project.set_function(
    "src/data_prep_dolly.py",
    kind="job",
    handler="preprocess_dolly",
    name="data-prep-dolly",
)

# Apply MLRun's auto-mount modifier; the returned runtime object is this
# cell's displayed value.
prepare_dolly_dataset.apply(mlrun.auto_mount())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fc676b0df10>

In [73]:
# Preprocess the Dolly dataset, passing the fine-tuned model directory as the
# tokenizer location. The handler repeats the one set at registration.
prepare_dolly_run = mlrun.run_function(
    function="data-prep-dolly",
    handler="preprocess_dolly",
    params={
        "pretrained_tokenizer_name_or_path": temp_model,
    },
)

> 2023-05-07 07:11:39,415 [info] Storing function: {'name': 'data-prep-dolly-preprocess-dolly', 'uid': '004b987f67d44632b37b3d69ddf91085', 'db': 'http://mlrun-api:8080'}
> 2023-05-07 07:11:39,672 [info] Job is running in the background, pod: data-prep-dolly-preprocess-dolly-xj7fx
> 2023-05-07 07:11:56,640 [info] Loading tokenizer for /User/learn-docs/finetuning-mlrun-data
> 2023-05-07 07:11:56,832 [info] Loading dataset from databricks/databricks-dolly-15k
Downloading readme: 100%|██████████| 7.70k/7.70k [00:00<00:00, 8.73MB/s]
Downloading and preparing dataset json/databricks--databricks-dolly-15k to /root/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...
Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s][A
Downloading data:  39%|███▊      | 5.07M/13.1M [00:00<00:00, 50.7MB/s][A
Downloading da

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
learn-docs-dev-admin,...ddf91085,0,May 07 07:11:55,completed,data-prep-dolly-preprocess-dolly,v3io_user=adminkind=jobowner=adminmlrun/client_version=1.3.0mlrun/client_python_version=3.9.16host=data-prep-dolly-preprocess-dolly-xj7fx,,pretrained_tokenizer_name_or_path=/User/learn-docs/finetuning-mlrun-data,,train_dolly_datasettest_dolly_dataset





> 2023-05-07 07:12:58,236 [info] run executed, status=completed: {'name': 'data-prep-dolly-preprocess-dolly'}


In [8]:
# Take the Dolly dataset URIs from the preprocessing run. The previous literals
# were pinned to a specific uid in a different project ("deepspeed-admin"), so
# they did not follow the run executed in this notebook.
train_dolly_dataset = prepare_dolly_run.outputs["train_dolly_dataset"]
test_dolly_dataset = prepare_dolly_run.outputs["test_dolly_dataset"]

In [30]:
# Parameters for the Dolly training run.
# NOTE: this rebinds `params`, replacing the MLRun-docs fine-tuning parameters
# defined earlier in the notebook.
params = {
    # Starting model and tokenizer, both loaded from the same local directory.
    "pretrained_model": os.path.join(base_path, "model-for-dolly"),
    "pretrained_tokenizer": os.path.join(base_path, "model-for-dolly"),
    "model_class": "transformers.GPT2LMHeadModel",
    "tokenizer_class": "transformers.GPT2Tokenizer",
    "require_tokenization": False,
    # Training settings.
    # NOTE(review): the DeepSpeed config file name mentions bf16, yet
    # "TRAIN_bf16" below is False - confirm the two agree.
    "TRAIN_deepspeed": os.path.join(base_path, "deepspeed/ds_z3_bf16_config.json"),
    "TRAIN_num_train_epochs": 2,
    "TRAIN_output_dir": "dolly_output",
    "TRAIN_per_device_train_batch_size": 6,
    "TRAIN_per_device_eval_batch_size": 6,
    "TRAIN_logging_steps": 10,
    "TRAIN_save_steps": 200,
    "TRAIN_save_total_limit": 20,
    "TRAIN_eval_steps": 50,
    "TRAIN_warmup_steps": 50,
    "TRAIN_learning_rate": 5e-6,
    "TRAIN_gradient_checkpointing": True,
    # NOTE(review): a boolean is passed for local_rank - confirm this is the
    # intended value.
    "TRAIN_local_rank": True,
    "TRAIN_bf16": False,
    # Model class and data collator settings.
    "CLASS_ignore_mismatched_sizes": True,
    "data_collator": "data_collator.DataCollatorForCompletionOnlyLM",
    "DC_mlm": False,
    "DC_return_tensors": "pt",
    # NOTE(review): unlike the other collator options this key has no "DC_"
    # prefix - confirm the trainer script reads it under this name.
    "pad_to_multiple_of": 8,
}

In [None]:
 mlrun.run_function(
    function="hugging_face_classifier_trainer",
    name="trainer",
    inputs={
            "dataset": train_dolly_dataset,  # prepare_dolly_run.outputs["train_dolly_dataset"],
            "test_set": test_dataset  # prepare_dolly_run.outputs["test_dolly_dataset"],
        },
    params=params,
    handler="train",
)

### Test complete model