From 85dad5bf0b2c8a4b05408ecd24f4a493849d6ca8 Mon Sep 17 00:00:00 2001
From: Praveen Venkateswaran
Date: Mon, 6 Nov 2023 10:51:31 -0500
Subject: [PATCH] docs: update hf pipeline docs (#12908)

- **Description:** Noticed that the Hugging Face Pipeline documentation was a
  bit out of date. Updated with information about passing in a pipeline
  directly (consistent with the docstring) and a recent contribution of mine
  adding support for multi-GPU specification with Accelerate in
  21eeba075c05714f185e5541f25228f7b555f606
---
 .../llms/huggingface_pipelines.ipynb | 75 +++++++++++++++++--
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
index 3fd8e0a0a6ca4..95423649e9342 100644
--- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb
+++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb
@@ -41,7 +41,9 @@
    "id": "91ad075f-71d5-4bc8-ab91-cc0ad5ef16bb",
    "metadata": {},
    "source": [
-    "### Load the model"
+    "### Model Loading\n",
+    "\n",
+    "Models can be loaded by specifying the model parameters using the `from_model_id` method."
    ]
   },
   {
@@ -53,12 +55,12 @@
    },
    "outputs": [],
    "source": [
-    "from langchain.llms import HuggingFacePipeline\n",
+    "from langchain.llms.huggingface_pipeline import HuggingFacePipeline\n",
     "\n",
-    "llm = HuggingFacePipeline.from_model_id(\n",
-    "    model_id=\"bigscience/bloom-1b7\",\n",
+    "hf = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"gpt2\",\n",
     "    task=\"text-generation\",\n",
-    "    model_kwargs={\"temperature\": 0, \"max_length\": 64},\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
     ")"
    ]
   },
@@ -66,6 +68,31 @@
    "cell_type": "markdown",
    "id": "00104b27-0c15-4a97-b198-4512337ee211",
    "metadata": {},
+   "source": [
+    "They can also be loaded by passing in an existing `transformers` pipeline directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.llms.huggingface_pipeline import HuggingFacePipeline\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
+    "\n",
+    "model_id = \"gpt2\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_id)\n",
+    "pipe = pipeline(\n",
+    "    \"text-generation\", model=model, tokenizer=tokenizer, max_new_tokens=10\n",
+    ")\n",
+    "hf = HuggingFacePipeline(pipeline=pipe)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "### Create Chain\n",
     "\n",
@@ -87,7 +114,7 @@
     "Answer: Let's think step by step.\"\"\"\n",
     "prompt = PromptTemplate.from_template(template)\n",
     "\n",
-    "chain = prompt | llm\n",
+    "chain = prompt | hf\n",
     "\n",
     "question = \"What is electroencephalography?\"\n",
     "\n",
@@ -98,6 +125,40 @@
    "cell_type": "markdown",
    "id": "dbbc3a37",
    "metadata": {},
+   "source": [
+    "### GPU Inference\n",
+    "\n",
+    "When running on a machine with a GPU, you can specify the `device=n` parameter to put the model on the specified device.\n",
+    "This defaults to `-1` for CPU inference.\n",
+    "\n",
+    "If you have multiple GPUs and/or the model is too large for a single GPU, you can specify `device_map=\"auto\"`, which requires the [Accelerate](https://huggingface.co/docs/accelerate/index) library to automatically determine how to load the model weights.\n",
+    "\n",
+    "*Note*: `device` and `device_map` should not be specified together, as this can lead to unexpected behavior."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpu_llm = HuggingFacePipeline.from_model_id(\n",
+    "    model_id=\"gpt2\",\n",
+    "    task=\"text-generation\",\n",
+    "    device=0,  # replace with device_map=\"auto\" to use the accelerate library.\n",
+    "    pipeline_kwargs={\"max_new_tokens\": 10},\n",
+    ")\n",
+    "\n",
+    "gpu_chain = prompt | gpu_llm\n",
+    "\n",
+    "question = \"What is electroencephalography?\"\n",
+    "\n",
+    "print(gpu_chain.invoke({\"question\": question}))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
     "### Batch GPU Inference\n",
     "\n",
@@ -147,7 +208,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.5"
   }
  },
  "nbformat": 4,
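
The last hunk above only renumbers the existing "Batch GPU Inference" heading; the code cell for that section lies outside the diff context. Purely as an illustrative sketch (not part of the patch), batching with the updated `gpu_llm` chain might look like the following; the second question and the use of the generic LCEL `.batch()` call are assumptions added here for illustration.

```python
# Illustrative sketch only -- not part of the patch above.
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# Same model/chain setup as the GPU Inference cell added by the patch.
gpu_llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    device=0,  # or replace with device_map="auto" to shard across GPUs via Accelerate
    pipeline_kwargs={"max_new_tokens": 10},
)

template = """Question: {question}

Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)
gpu_chain = prompt | gpu_llm

# LCEL runnables expose .batch() for running the chain over several inputs at once.
questions = [
    {"question": "What is electroencephalography?"},
    {"question": "What is photosynthesis?"},  # assumed extra example input
]

for answer in gpu_chain.batch(questions):
    print(answer)
```

This only shows the general batching pattern with the chain defined earlier in the notebook; the notebook's own batch example may configure the pipeline differently.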