In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from tqdm.notebook import tqdm

In [2]:
device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

Database

In [3]:
data=load_dataset("m-ric/huggingface_doc", split="train")
data

Dataset({
    features: ['text', 'source'],
    num_rows: 2647
})

In [21]:
data[0]['text']

' Create an Endpoint\n\nAfter your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deploy [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. \n\n## 1. Enter the Hugging Face Repository ID and your desired endpoint name:\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_repository.png" alt="select repository" />\n\n## 2. Select your Cloud Provider and region. Initially, only AWS will be available as a Cloud Provider with the `us-east-1` and `eu-west-1` regions. We will add Azure soon, and if you need to test Endpoints with other Cloud Providers or regions, please let us know.\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_region.png" alt="select region" />\n\n## 3. Define the [S

In [22]:
data[0]['source']

'huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/create_endpoint.mdx'

In [4]:
database=[Document(page_content=doc['text'], metadata={'source':doc['source']}) for doc in tqdm(data)]

  0%|          | 0/2647 [00:00<?, ?it/s]

In [24]:
print(database[0])
print(len(database))

page_content=' Create an Endpoint

After your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deploy [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. 

## 1. Enter the Hugging Face Repository ID and your desired endpoint name:

<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_repository.png" alt="select repository" />

## 2. Select your Cloud Provider and region. Initially, only AWS will be available as a Cloud Provider with the `us-east-1` and `eu-west-1` regions. We will add Azure soon, and if you need to test Endpoints with other Cloud Providers or regions, please let us know.

<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_region.png" alt="select region" />

## 3. Define the [

Chunking database

In [None]:
emb_id='thenlper/gte-small'
tokenizer=AutoTokenizer.from_pretrained(emb_id, device='cuda')

In [60]:
#Using the markdown character separated
md_sep= [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

In [6]:
#Chunling database
def doc_split(data, chunk_size=512, tokenizer=tokenizer):
    text_split=RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size/10),
        add_start_index=True,
        strip_whitespace=True,
        separators=md_sep,
    )
    docs_process=[]
    for doc in data:
        docs_process +=text_split.split_documents([doc])
    docs_unique=[]
    unique_texts={}
    for doc in docs_process:
        if doc.page_content not in unique_texts:
            docs_unique.append(doc)
            unique_texts[doc.page_content]=True
    return docs_unique, unique_texts

In [8]:
docs_process, unique_texts=doc_split(database)
docs_process[0]

Document(metadata={'source': 'huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/create_endpoint.mdx', 'start_index': 1}, page_content='Create an Endpoint\n\nAfter your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deploy [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. \n\n## 1. Enter the Hugging Face Repository ID and your desired endpoint name:\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_repository.png" alt="select repository" />\n\n## 2. Select your Cloud Provider and region. Initially, only AWS will be available as a Cloud Provider with the `us-east-1` and `eu-west-1` regions. We will add Azure soon, and if you need to test Endpoints with other Cloud Providers or regions, please let us know.\n\n<im

In [14]:
len(docs_process)

17995

Embedding

In [7]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

In [8]:
emb_model=HuggingFaceEmbeddings(
    model_name=emb_id,
    multi_process=True,
    model_kwargs={'device':'cuda'},
    encode_kwargs={'normalize_embeddings':True}
    )

  emb_model=HuggingFaceEmbeddings(


Vectors store

In [20]:
database_vector=FAISS.from_documents(
    docs_process,
    emb_model,
    distance_strategy=DistanceStrategy.COSINE
)

In [26]:
#Save the database_FAISS
database_vector.save_local('dtb_vector')

In [9]:
#Loading database_FAISS
dtb_vector=FAISS.load_local('dtb_vector', emb_model, allow_dangerous_deserialization=True)

In [32]:
dtb_vector

<langchain_community.vectorstores.faiss.FAISS at 0x7f22dff34200>

LLM model

In [11]:
model_id='meta-llama/Llama-3.2-1B-Instruct'
model=AutoModelForCausalLM.from_pretrained(model_id, token=token_acces).to(device)
tokenizer=AutoTokenizer.from_pretrained(model_id, token=token_acces)

In [12]:
generate=pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    do_sample=True,
    temperature=0.01,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
    device=device
    )

Prompt template

In [46]:
prompt_chat = [
    {
        "role": "system",

        "content": """Using the information contained in the context,

give a comprehensive answer to the question.

Respond only to the question asked, response should be concise and relevant to the question.

Provide the number of the source document when relevant.

If the answer cannot be deduced from the context, do not give an answer.""",

    },

    {

        "role": "user",

        "content": """Context:

{context}

---

Now here is the question you need to answer.

Question: {question}""",

    },

]

In [14]:
prompt_template=tokenizer.apply_chat_template(
    prompt_chat,
    tokenize=False,
    add_generation_prompt=True
)

In [53]:
print(prompt_template)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 18 Jan 2025

Using the information contained in the context,

give a comprehensive answer to the question.

Respond only to the question asked, response should be concise and relevant to the question.

Provide the number of the source document when relevant.

If the answer cannot be deduced from the context, do not give an answer.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context:

{context}

---

Now here is the question you need to answer.

Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>




Example

In [15]:
# Find the most relevant text to the query from the database
user_query = "How to create a pipeline object?"
query_vector = emb_model.embed_query(user_query)
retriever_docs=dtb_vector.similarity_search(query=user_query, k=5)

In [16]:
print(retriever_docs[1].page_content)
print('\n')
print(retriever_docs[1].metadata)

```
</tf>
</frameworkcontent>

## Pipeline

<Youtube id="tiZFewofSLM"/>

The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below:

<Tip>

For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines).

</Tip>


{'source': 'huggingface/transformers/blob/main/docs/source/en/quicktour.md', 'start_index': 1585}


In [18]:
#Create the context
retriever_docs_text=[doc.page_content for doc in retriever_docs] 
context='\nExtracted documents: \n'
context+=''.join([f'Document {str(i)}:::\n' +doc for i, doc in enumerate(retriever_docs_text)])
print(context)


Extracted documents: 
Document 0:::
```

## Available Pipelines:Document 1:::
```
</tf>
</frameworkcontent>

## Pipeline

<Youtube id="tiZFewofSLM"/>

The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below:

<Tip>

For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines).

</Tip>Document 2:::
```

2. Pass a prompt to the pipeline to generate an image:

```py
image = pipeline(
	"stained glass of darth vader, backlight, centered composition, masterpiece, photorealistic, 8k"
).images[0]
imageDocument 3:::
!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Un

In [48]:
#Final prompt
final_prompt=prompt_template.format(question='How to create a pipeline object', context=context)
print(final_prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 18 Jan 2025

Using the information contained in the context,

give a comprehensive answer to the question.

Respond only to the question asked, response should be concise and relevant to the question.

Provide the number of the source document when relevant.

If the answer cannot be deduced from the context, do not give an answer.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context:


Extracted documents: 
Document 0:::
```

## Available Pipelines:Document 1:::
```
</tf>
</frameworkcontent>

## Pipeline

<Youtube id="tiZFewofSLM"/>

The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below:

<Tip>

For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines).

</Tip>D

In [70]:
#Generate the answer by using the LLM model
answer=generate(final_prompt)[0]
answer

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'generated_text': 'To create a pipeline object, you need to inherit from the `Pipeline` class provided by the `transformers` library. Here\'s a simple example:\n\n```python\nfrom transformers import Pipeline\n\nclass CustomPipeline(Pipeline):\n    def __init__(self, preprocess, postprocess, sanitize_parameters=None):\n        super().__init__(preprocess, postprocess)\n        self.sanitize_parameters = sanitize_parameters\n\n    # Define the preprocess function\n    def preprocess(self, input_data):\n        # Your preprocessing logic goes here\n        return input_data\n\n    # Define the postprocess function\n    def postprocess(self, output_data):\n        # Your postprocessing logic goes here\n        return output_data\n\n# Create a new pipeline object\ncustom_pipeline = CustomPipeline(preprocess=lambda x: x, postprocess=lambda y: y)\n\n# Use the custom pipeline\ninput_data = {"text": "Hello World"}\noutput_data = custom_pipeline(input_data)\nprint(output_data)\n```\n\nThis code

In [56]:
print(answer['generated_text'])

To create a pipeline object, you need to inherit from the `Pipeline` class provided by the `transformers` library. Here's a simple example:

```python
from transformers import Pipeline

class CustomPipeline(Pipeline):
    def __init__(self, preprocess, postprocess, sanitize_parameters=None):
        super().__init__(preprocess, postprocess)
        self.sanitize_parameters = sanitize_parameters

    # Define the preprocess function
    def preprocess(self, input_data):
        # Your preprocessing logic goes here
        return input_data

    # Define the postprocess function
    def postprocess(self, output_data):
        # Your postprocessing logic goes here
        return output_data

# Create a new pipeline object
custom_pipeline = CustomPipeline(preprocess=lambda x: x, postprocess=lambda y: y)

# Use the custom pipeline
input_data = {"text": "Hello World"}
output_data = custom_pipeline(input_data)
print(output_data)
```

This code defines a custom pipeline class `CustomPipeline` 

**Combining**

We can retrieve more documents than we want, then rerank the results with a more powerful retrieval model before.

In [54]:
def answer_rag(question, generate=generate, data=dtb_vector, num_retrieved_docs=10, num_reranker=5, reranker=None):
    #question_emb=emb_model.embed_query(question)
    relevant_docs=data.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs=[doc.page_content for doc in relevant_docs]
    if reranker:
        relevant_docs=reranker.rerank(question, relevant_docs, k=num_reranker)
        relevant_docs=[doc['content'] for doc in relevant_docs]
    relevant_docs=relevant_docs[:num_reranker]
    #Prompt
    context='\nExtracted documents:\n'
    context+=''.join([f'Document {str(i)}:::\n'+doc for i, doc in enumerate(relevant_docs)])
    prompt_final=prompt_template.format(question=question, context=context)
    answer=generate(prompt_final)[0]['generated_text']
    return answer

In [55]:
question='How to create a pipeline object'
answer=answer_rag(question)
print(answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You can create a pipeline object using the following code snippet:

```python
from transformers import Pipeline

# Define the pipeline parameters
inputs = {"prompt": "stained glass of darth vader, backlight, centered composition, masterpiece, photorealistic, 8k"}

# Create a new pipeline instance
pipe = Pipeline(
    # Specify the model name
    model_name="stable-diffusion-v1-5",
    
    # Specify the custom pipeline function
    custom_pipeline=inputs["prompt"],
    
    # Specify the output directory
    output_dir="/path/to/output/directory"
)

print(pipe)
```

This will create a pipeline object with the specified inputs and output directory.


Chat demo

In [56]:
import gradio as gr

In [57]:
def func_chat(text, sys_prompt='Welcome'):
    answer=answer_rag(text)
    return answer

In [58]:
demo=gr.ChatInterface(
    func_chat,
    #type='messages',
    textbox=gr.Textbox(placeholder='Enter question here', container=False, scale=7),
    chatbot=gr.Chatbot(height=400),
)

In [59]:
demo.launch()

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
