diff --git a/llm-complete-guide/.assets/huggingface-space-rag-deployment.png b/llm-complete-guide/.assets/huggingface-space-rag-deployment.png
new file mode 100644
index 00000000..2fecf64b
Binary files /dev/null and b/llm-complete-guide/.assets/huggingface-space-rag-deployment.png differ
diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md
index 033de4d6..5e5844c4 100644
--- a/llm-complete-guide/README.md
+++ b/llm-complete-guide/README.md
@@ -57,9 +57,9 @@ export ZENML_PROJECT_SECRET_NAME=llm-complete
 
 ### Setting up Supabase
 
-[Supabase](https://supabase.com/) is a cloud provider that provides a PostgreSQL
+[Supabase](https://supabase.com/) is a cloud provider that offers a PostgreSQL
 database. It's simple to use and has a free tier that should be sufficient for
-this project. Once you've created a Supabase account and organisation, you'll
+this project. Once you've created a Supabase account and organization, you'll
 need to create a new project.
 
 ![](.assets/supabase-create-project.png)
@@ -76,7 +76,7 @@ string from the Supabase dashboard.
 
 ![](.assets/supabase-connection-string.png)
 
-In case supabase is not an option for you, you can use a different database as the backend.
+If Supabase is not an option for you, you can use a different PostgreSQL database as the backend.
 
 ### Running the RAG pipeline
 
@@ -114,6 +114,51 @@ Note that Claude will require a different API key from Anthropic. See [the
 `litellm` docs](https://docs.litellm.ai/docs/providers/anthropic) on how to set
 this up.
 
+### Deploying the RAG pipeline
+
+![](.assets/huggingface-space-rag-deployment.png)
+
+You'll need to update and add some secrets to make this work with your Hugging
+Face account. To get your ZenML service account API token and store URL, you can
+first create a new service account:
+
+```bash
+zenml service-account create <SERVICE_ACCOUNT_NAME>
+```
+
+For more information on this part of the process, please refer to the [ZenML
+documentation](https://docs.zenml.io/how-to/project-setup-and-management/connecting-to-zenml/connect-with-a-service-account).
+
+Once you have your service account API token and store URL (the URL of your
+deployed ZenML tenant), you can update the secrets with the following command:
+
+```bash
+zenml secret update llm-complete --zenml_api_token=<YOUR_API_TOKEN> --zenml_store_url=<YOUR_STORE_URL>
+```
+
+To set the Hugging Face user and Space that get used for the Gradio app
+deployment, set the following environment variables:
+
+```bash
+export ZENML_HF_USERNAME=<YOUR_HF_USERNAME>
+export ZENML_HF_SPACE_NAME=<YOUR_HF_SPACE_NAME> # optional, defaults to "llm-complete-guide-rag"
+```
+
+To deploy the RAG pipeline, you can use the following command:
+
+```shell
+python run.py --deploy
+```
+
+Alternatively, you can run the basic RAG pipeline *and* deploy it in one go:
+
+```shell
+python run.py --rag --deploy
+```
+
+This will deploy the Gradio app to a private Hugging Face Space and open it in
+your browser, where you can interact with the RAG pipeline.
+
 ### Run the LLM RAG evaluation pipeline
 
 To run the evaluation pipeline, you can use the following command:
@@ -157,7 +202,6 @@ will need to change the hf repo urls to a space you have permissions to.
 zenml secret update llm-complete -v '{"argilla_api_key": "YOUR_ARGILLA_API_KEY", "argilla_api_url": "YOUR_ARGILLA_API_URL", "hf_token": "YOUR_HF_TOKEN"}'
 ```
-
 ### Finetune the embeddings
 
 As with the previous pipeline, you will need to have set up and connected to an Argilla instance for this
diff --git a/llm-complete-guide/deployment_hf.py b/llm-complete-guide/deployment_hf.py
new file mode 100644
index 00000000..6724fc0f
--- /dev/null
+++ b/llm-complete-guide/deployment_hf.py
@@ -0,0 +1,13 @@
+import gradio as gr
+from utils.llm_utils import process_input_with_retrieval
+
+
+def predict(message, history):
+    return process_input_with_retrieval(
+        input=message,
+        n_items_retrieved=20,
+        use_reranking=True,
+    )
+
+
+gr.ChatInterface(predict, type="messages").launch()
diff --git a/llm-complete-guide/gh_action_rag.py b/llm-complete-guide/gh_action_rag.py
index 49c8c0f3..4828b57d 100644
--- a/llm-complete-guide/gh_action_rag.py
+++ b/llm-complete-guide/gh_action_rag.py
@@ -21,11 +21,10 @@
 
 import click
 import yaml
+from pipelines.llm_basic_rag import llm_basic_rag
 from zenml.client import Client
 from zenml.exceptions import ZenKeyError
 
-from pipelines.llm_basic_rag import llm_basic_rag
-
 
 @click.command(
     help="""
@@ -39,7 +38,6 @@
     default=False,
     help="Disable cache.",
 )
-
 @click.option(
     "--create-template",
     "create_template",
@@ -51,26 +49,26 @@
     "--config",
     "config",
     default="rag_local_dev.yaml",
-    help="Specify a configuration file"
+    help="Specify a configuration file",
 )
 @click.option(
     "--service-account-id",
     "service_account_id",
     default=None,
-    help="Specify a service account ID"
+    help="Specify a service account ID",
 )
 @click.option(
     "--event-source-id",
     "event_source_id",
     default=None,
-    help="Specify an event source ID"
+    help="Specify an event source ID",
 )
 def main(
     no_cache: bool = False,
-    config: Optional[str]= "rag_local_dev.yaml",
+    config: Optional[str] = "rag_local_dev.yaml",
     create_template: bool = False,
     service_account_id: Optional[str] = None,
-    event_source_id: Optional[str] = None
+    event_source_id: Optional[str] = None,
 ):
     """
-    Executes the pipeline to train a basic RAG model.
+    Executes the basic RAG pipeline.
@@ -86,43 +84,43 @@ def main(
     client = Client()
 
     config_path = Path(__file__).parent / "configs" / config
-    with (open(config_path,"r") as file):
+    with open(config_path, "r") as file:
         config = yaml.safe_load(file)
 
     if create_template:
-        # run pipeline
         run = llm_basic_rag.with_options(
-            config_path=str(config_path),
-            enable_cache=not no_cache
+            config_path=str(config_path), enable_cache=not no_cache
         )()
         # create new run template
         rt = client.create_run_template(
             name=f"production-llm-complete-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}",
-            deployment_id=run.deployment_id
+            deployment_id=run.deployment_id,
         )
 
         try:
-            # Check if an action ahs already be configured for this pipeline
+            # Check if an action has already been configured for this pipeline
             action = client.get_action(
                 name_id_or_prefix="LLM Complete (production)",
-                allow_name_prefix_match=True
+                allow_name_prefix_match=True,
             )
         except ZenKeyError:
             if not event_source_id:
-                raise RuntimeError("An event source is required for this workflow.")
+                raise RuntimeError(
+                    "An event source is required for this workflow."
+                )
 
             if not service_account_id:
                 service_account_id = client.create_service_account(
                     name="github-action-sa",
-                    description="To allow triggered pipelines to run with M2M authentication."
+                    description="To allow triggered pipelines to run with M2M authentication.",
                 ).id
 
             action_id = client.create_action(
                 name="LLM Complete (production)",
                 configuration={
                     "template_id": str(rt.id),
-                    "run_config": pop_restricted_configs(config)
+                    "run_config": pop_restricted_configs(config),
                 },
                 service_account_id=service_account_id,
                 auth_window=0,
@@ -132,7 +130,7 @@
                 event_source_id=UUID(event_source_id),
                 event_filter={"event_type": "tag_event"},
                 action_id=action_id,
-                description="Trigger pipeline to reindex everytime the docs are updated through git."
+                description="Trigger pipeline to reindex every time the docs are updated through git.",
             )
         else:
             # update the action with the new template
@@ -141,14 +139,13 @@
                 name_id_or_prefix=action.id,
                 configuration={
                     "template_id": str(rt.id),
-                    "run_config": pop_restricted_configs(config)
-                }
+                    "run_config": pop_restricted_configs(config),
+                },
             )
 
     else:
         llm_basic_rag.with_options(
-            config_path=str(config_path),
-            enable_cache=not no_cache
+            config_path=str(config_path), enable_cache=not no_cache
         )()
 
 
@@ -162,22 +159,22 @@ def pop_restricted_configs(run_configuration: dict) -> dict:
         Modified dictionary with restricted items removed
     """
     # Pop top-level restricted items
-    run_configuration.pop('parameters', None)
-    run_configuration.pop('build', None)
-    run_configuration.pop('schedule', None)
+    run_configuration.pop("parameters", None)
+    run_configuration.pop("build", None)
+    run_configuration.pop("schedule", None)
 
     # Pop docker settings if they exist
-    if 'settings' in run_configuration:
-        run_configuration['settings'].pop('docker', None)
+    if "settings" in run_configuration:
+        run_configuration["settings"].pop("docker", None)
 
     # Pop docker settings from steps if they exist
-    if 'steps' in run_configuration:
-        for step in run_configuration['steps'].values():
-            if 'settings' in step:
-                step['settings'].pop('docker', None)
+    if "steps" in run_configuration:
+        for step in run_configuration["steps"].values():
+            if "settings" in step:
+                step["settings"].pop("docker", None)
 
     return run_configuration
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/llm-complete-guide/pipelines/llm_basic_rag.py b/llm-complete-guide/pipelines/llm_basic_rag.py
index 3cfb4051..6cf99f08 100644
--- a/llm-complete-guide/pipelines/llm_basic_rag.py
+++ b/llm-complete-guide/pipelines/llm_basic_rag.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 #
 
-from zenml import pipeline
-
 from steps.populate_index import (
     generate_embeddings,
     index_generator,
@@ -24,6 +22,7 @@
 )
 from steps.url_scraper import url_scraper
 from steps.web_url_loader import web_url_loader
+from zenml import pipeline
 
 
 @pipeline
diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py
index d310fd18..8f604dac 100644
--- a/llm-complete-guide/pipelines/llm_eval.py
+++ b/llm-complete-guide/pipelines/llm_eval.py
@@ -13,12 +13,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 from pathlib import Path
 from typing import Optional
 
 import click
-
 from steps.eval_e2e import e2e_evaluation, e2e_evaluation_llm_judged
 from steps.eval_retrieval import (
     retrieval_evaluation_full,
@@ -82,12 +80,9 @@ def llm_eval() -> None:
     "--config",
     "config",
     default="rag_local_dev.yaml",
-    help="Specify a configuration file"
+    help="Specify a configuration file",
 )
-def main(
-    no_cache: bool = False,
-    config: Optional[str] = "rag_eval.yaml"
-):
+def main(no_cache: bool = False, config: Optional[str] = "rag_eval.yaml"):
     """
-    Executes the pipeline to train a basic RAG model.
+    Executes the LLM RAG evaluation pipeline.
 
@@ -98,10 +93,9 @@ def main(
     config_path = Path(__file__).parent.parent / "configs" / config
 
     llm_eval.with_options(
-        config_path=str(config_path),
-        enable_cache=not no_cache
+        config_path=str(config_path), enable_cache=not no_cache
     )()
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/llm-complete-guide/requirements.txt b/llm-complete-guide/requirements.txt
index 13563b92..2c107e4b 100644
--- a/llm-complete-guide/requirements.txt
+++ b/llm-complete-guide/requirements.txt
@@ -1,13 +1,11 @@
 zenml[server]>=0.68.1
-langchain-community
 ratelimit
-langchain>=0.0.325
-langchain-openai
 pgvector
 psycopg2-binary
 beautifulsoup4
 unstructured
 pandas
+openai
 numpy
 sentence-transformers>=3
 transformers
diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py
index 50bbf8fe..2152fda4 100644
--- a/llm-complete-guide/run.py
+++ b/llm-complete-guide/run.py
@@ -151,7 +151,7 @@
     "--config",
     "config",
     default=None,
-    help="Generate chunks for Hugging Face dataset",
+    help="Path to config",
 )
 def main(
     rag: bool = False,
@@ -181,7 +181,7 @@ def main(
         argilla (bool): If `True`, the Argilla annotations will be used.
         chunks (bool): If `True`, the chunks pipeline will be run.
         reranked (bool): If `True`, rerankers will be used
-        config (str: Path to config
+        config (str): Path to config
     """
     pipeline_args = {"enable_cache": not no_cache}
     embeddings_finetune_args = {
@@ -264,4 +264,4 @@ def main(
     materializer_registry.register_materializer_type(
         Document, DocumentMaterializer
     )
-    main()
\ No newline at end of file
+    main()
diff --git a/llm-complete-guide/steps/finetune_embeddings.py b/llm-complete-guide/steps/finetune_embeddings.py
index ad9d9469..3117c473 100644
--- a/llm-complete-guide/steps/finetune_embeddings.py
+++ b/llm-complete-guide/steps/finetune_embeddings.py
@@ -23,7 +23,8 @@
     DATASET_NAME_DISTILABEL,
     EMBEDDINGS_MODEL_ID_BASELINE,
     EMBEDDINGS_MODEL_ID_FINE_TUNED,
-    EMBEDDINGS_MODEL_MATRYOSHKA_DIMS, SECRET_NAME,
+    EMBEDDINGS_MODEL_MATRYOSHKA_DIMS,
+    SECRET_NAME,
 )
 from datasets import DatasetDict, concatenate_datasets, load_dataset
 from datasets.arrow_dataset import Dataset
@@ -294,7 +295,7 @@
     trainer.model.push_to_hub(
         f"zenml/{EMBEDDINGS_MODEL_ID_FINE_TUNED}",
         exist_ok=True,
-        token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"]
+        token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"],
     )
 
     log_model_metadata(
diff --git a/llm-complete-guide/steps/push_to_argilla.py b/llm-complete-guide/steps/push_to_argilla.py
index 90c3d2d9..e67bf621 100644
--- a/llm-complete-guide/steps/push_to_argilla.py
+++ b/llm-complete-guide/steps/push_to_argilla.py
@@ -16,7 +16,6 @@
 import argilla as rg
 import torch
 from argilla._exceptions import ConflictError
-
 from constants import (
     DATASET_NAME_ARGILLA,
     EMBEDDINGS_MODEL_ID_BASELINE,
@@ -115,7 +114,7 @@ def push_to_argilla(train_dataset: Dataset, test_dataset: Dataset) -> None:
     try:
         ds.create()
     except ConflictError:
-        ds = client.datasets(DATASET_NAME_ARGILLA)
+        ds = client.datasets(DATASET_NAME_ARGILLA)
 
     # process original HF dataset
     try:
diff --git a/llm-complete-guide/steps/rag_deployment.py b/llm-complete-guide/steps/rag_deployment.py
index 7779339c..a750dde6 100644
--- a/llm-complete-guide/steps/rag_deployment.py
+++ b/llm-complete-guide/steps/rag_deployment.py
@@ -1,8 +1,44 @@
-import time
+import os
+import webbrowser
 
-import gradio as gr
+from huggingface_hub import HfApi
 from utils.llm_utils import process_input_with_retrieval
 from zenml import step
+from zenml.client import Client
+from zenml.integrations.registry import integration_registry
+
+secret = Client().get_secret("llm-complete")
+
+ZENML_API_TOKEN = secret.secret_values["zenml_api_token"]
+ZENML_STORE_URL = secret.secret_values["zenml_store_url"]
+HF_TOKEN = os.getenv("HF_TOKEN")
+SPACE_USERNAME = os.environ.get("ZENML_HF_USERNAME", "zenml")
+SPACE_NAME = os.environ.get("ZENML_HF_SPACE_NAME", "llm-complete-guide-rag")
+
+hf_repo_id = f"{SPACE_USERNAME}/{SPACE_NAME}"
+gcp_reqs = integration_registry.select_integration_requirements("gcp")
+
+hf_repo_requirements = f"""
+zenml>=0.68.1
+ratelimit
+pgvector
+psycopg2-binary
+beautifulsoup4
+pandas
+openai
+numpy
+sentence-transformers>=3
+transformers
+litellm
+tiktoken
+matplotlib
+pyarrow
+rerankers[flashrank]
+datasets
+torch
+huggingface-hub
+{chr(10).join(gcp_reqs)}
+"""
 
 
 def predict(message, history):
@@ -13,15 +49,69 @@
     )
 
 
-@step
+def upload_files_to_repo(
+    api, repo_id: str, files_mapping: dict, token: str = HF_TOKEN
+):
+    """Upload multiple files to a Hugging Face repository
+
+    Args:
+        api: Hugging Face API client
+        repo_id: Target repository ID
+        files_mapping: Dict mapping local files to repo destinations
+        token: HF API token
+
""" + for local_path, repo_path in files_mapping.items(): + content = ( + local_path.encode() + if isinstance(local_path, str) and not os.path.exists(local_path) + else local_path + ) + api.upload_file( + path_or_fileobj=content, + path_in_repo=repo_path, + repo_id=repo_id, + repo_type="space", + token=token, + ) + + +@step(enable_cache=False) def gradio_rag_deployment() -> None: """Launches a Gradio chat interface with the slow echo demo. Starts a web server with a chat interface that echoes back user messages. The server runs indefinitely until manually stopped. """ - demo = gr.ChatInterface(predict, type="messages") - demo.launch(share=True, inbrowser=True) - # Keep the step running - while True: - time.sleep(1) + api = HfApi() + api.create_repo( + repo_id=hf_repo_id, + repo_type="space", + space_sdk="gradio", + private=True, + exist_ok=True, + token=HF_TOKEN, + ) + api.add_space_secret( + repo_id=hf_repo_id, + key="ZENML_STORE_API_KEY", + value=ZENML_API_TOKEN, + ) + api.add_space_secret( + repo_id=hf_repo_id, + key="ZENML_STORE_URL", + value=ZENML_STORE_URL, + ) + + files_to_upload = { + "deployment_hf.py": "app.py", + "utils/llm_utils.py": "utils/llm_utils.py", + "utils/openai_utils.py": "utils/openai_utils.py", + "utils/__init__.py": "utils/__init__.py", + "constants.py": "constants.py", + "structures.py": "structures.py", + hf_repo_requirements: "requirements.txt", + } + + upload_files_to_repo(api, hf_repo_id, files_to_upload, HF_TOKEN) + + webbrowser.open(f"https://huggingface.co/spaces/{hf_repo_id}")