In [1]:
import os

import cohere

# Never commit an API key to a notebook: read it from the environment instead.
# Set CO_API_KEY before starting the kernel (a missing variable raises KeyError here).
# Get your API key here: https://dashboard.cohere.com/api-keys
co = cohere.Client(os.environ["CO_API_KEY"])

In [2]:
# Documents the model may ground its answer in; each entry has a "title" and a "text".
documents = [
    {"title": title, "text": text}
    for title, text in (
        ("Tall penguins", "Emperor penguins are the tallest."),
        ("Penguin habitats", "Emperor penguins only live in Antarctica."),
        ("What are animals?", "Animals are different from plants."),
    )
]

In [3]:
# Generate a response with citations

# The user message
message = "What are the tallest living penguins?"

# Stream the response, passing the documents so the answer can cite them
response = co.chat_stream(
    message=message,
    model="command-a-03-2025",
    documents=documents,
)

# Print the text as it streams in, collecting citations and cited documents on the way
citations = []
cited_documents = []
for event in response:
    event_type = event.event_type
    if event_type == "text-generation":
        print(event.text, end="")
    elif event_type == "citation-generation":
        citations += event.citations
    elif event_type == "stream-end":
        cited_documents = event.response.documents

# Display the citations and their source documents (only when something was cited)
if citations:
    print("\n\nCITATIONS:")
    for citation in citations:
        print(citation)

    print("\nDOCUMENTS:")
    for document in cited_documents:
        print(document)

Emperor penguins are the tallest living penguins.

CITATIONS:
start=0 end=16 text='Emperor penguins' document_ids=['doc_0'] type='TEXT_CONTENT'

DOCUMENTS:
{'id': 'doc_0', 'text': 'Emperor penguins are the tallest.', 'title': 'Tall penguins'}


## **RAG-Powered Chatbot with Chat, Embed, and Rerank**

#### Summary of steps involved
* Step 0: Ingest the documents – get documents, chunk, embed, and index.
* Step 1: Get the user message
* Step 2: Call the Chat endpoint in query-generation mode
* If at least one query is generated
    * Step 3: Retrieve and rerank relevant documents
    * Step 4: Call the Chat endpoint in document mode to generate a grounded response with citations
* If no query is generated
    * Step 4: Call the Chat endpoint in normal mode to generate a response

In [4]:
import os
import uuid
from typing import List, Dict

import cohere
import hnswlib
from unstructured.partition.html import partition_html
from unstructured.chunking.title import chunk_by_title

# Never commit an API key to a notebook: read it from the environment instead.
# Set CO_API_KEY before starting the kernel (a missing variable raises KeyError here).
# Get your API key here: https://dashboard.cohere.com/api-keys
co = cohere.Client(os.environ["CO_API_KEY"])

In [5]:
# Source pages to ingest into the vector store; each entry has a "title" and a "url".
raw_documents = [
    {"title": title, "url": url}
    for title, url in (
        (
            "Crafting Effective Prompts",
            "https://docs.cohere.com/docs/crafting-effective-prompts",
        ),
        (
            "Advanced Prompt Engineering Techniques",
            "https://docs.cohere.com/docs/advanced-prompt-engineering-techniques",
        ),
        (
            "Prompt Truncation",
            "https://docs.cohere.com/docs/prompt-truncation",
        ),
        (
            "Preambles",
            "https://docs.cohere.com/docs/preambles",
        ),
    )
]

In [13]:
class Vectorstore:
    """
    A class representing a collection of documents indexed into a vectorstore.

    Parameters:
    raw_documents (list): A list of dictionaries representing the sources of the raw documents. Each dictionary should have 'title' and 'url' keys.
    retrieve_top_k (int): The number of document chunks fetched by dense retrieval before reranking. Defaults to 10.
    rerank_top_k (int): The number of document chunks kept after reranking. Defaults to 3.

    Attributes:
    raw_documents (list): A list of dictionaries representing the raw documents.
    docs (list): A list of dictionaries representing the chunked documents, with 'title', 'text', and 'url' keys.
    docs_embs (list): A list of the associated embeddings for the document chunks.
    docs_len (int): The number of document chunks in the collection.
    idx (hnswlib.Index): The index used for document retrieval.
    retrieve_top_k (int): The number of chunks requested from the index per query.
    rerank_top_k (int): The number of chunks returned after reranking.

    Methods:
    load_and_chunk(): Loads the data from the sources and partitions the HTML content into chunks.
    embed(): Embeds the document chunks using the Cohere API.
    index(): Indexes the document chunks for efficient retrieval.
    retrieve(): Retrieves document chunks based on the given query.

    Note:
    embed() and retrieve() use the module-level Cohere client `co`.
    """

    # Models and batch size used for the Cohere calls.
    EMBED_MODEL = "embed-v4.0"
    RERANK_MODEL = "rerank-english-v3.0"
    EMBED_BATCH_SIZE = 90  # number of texts sent per embed request

    def __init__(
        self,
        raw_documents: List[Dict[str, str]],
        retrieve_top_k: int = 10,
        rerank_top_k: int = 3,
    ):
        self.raw_documents = raw_documents
        self.docs = []
        self.docs_embs = []
        self.docs_len = 0  # defined up front so the attribute exists before embed() runs
        self.retrieve_top_k = retrieve_top_k
        self.rerank_top_k = rerank_top_k
        self.load_and_chunk()
        self.embed()
        self.index()

    def load_and_chunk(self) -> None:
        """
        Loads the text from the sources and chunks the HTML content.

        The chunk list is rebuilt from scratch, so calling this method again
        does not duplicate chunks.
        """
        print("Loading documents...")

        docs = []
        for raw_document in self.raw_documents:
            elements = partition_html(url=raw_document["url"])
            for chunk in chunk_by_title(elements):
                docs.append(
                    {
                        "title": raw_document["title"],
                        "text": str(chunk),
                        "url": raw_document["url"],
                    }
                )
        self.docs = docs

    def embed(self) -> None:
        """
        Embeds the document chunks using the Cohere API.

        The embedding list is rebuilt from scratch so it always lines up
        one-to-one with self.docs, even if this method is called again.
        """
        print("Embedding document chunks...")

        self.docs_len = len(self.docs)
        docs_embs = []
        for start in range(0, self.docs_len, self.EMBED_BATCH_SIZE):
            # Slicing past the end of the list is safe, so no explicit upper bound is needed.
            batch = self.docs[start : start + self.EMBED_BATCH_SIZE]
            texts = [item["text"] for item in batch]
            docs_embs_batch = co.embed(
                texts=texts, model=self.EMBED_MODEL, input_type="search_document"
            ).embeddings
            docs_embs.extend(docs_embs_batch)
        self.docs_embs = docs_embs

    def index(self) -> None:
        """
        Indexes the document chunks for efficient retrieval.

        Raises:
        ValueError: If there are no embedded document chunks to index.
        """
        print("Indexing document chunks...")

        if not self.docs_embs:
            raise ValueError(
                "No document chunks to index; check that the sources in "
                "raw_documents are reachable and contain text."
            )

        # Determine embedding dimensionality from first vector
        dim = len(self.docs_embs[0])

        self.idx = hnswlib.Index(space="ip", dim=dim)
        self.idx.init_index(max_elements=self.docs_len, ef_construction=512, M=64)
        # Labels are the positions of the chunks in self.docs.
        self.idx.add_items(self.docs_embs, list(range(len(self.docs_embs))))

        print(f"Indexing complete with {self.idx.get_current_count()} document chunks (dim={dim}).")

    def retrieve(self, query: str) -> List[Dict[str, str]]:
        """
        Retrieves document chunks based on the given query.

        Parameters:
        query (str): The query to retrieve document chunks for.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved document chunks, with 'title', 'text', and 'url' keys.
        """

        # Never ask the index for more neighbours than it holds: knn_query
        # raises when it cannot return k results.
        top_k = min(self.retrieve_top_k, self.idx.get_current_count())
        if top_k <= 0:
            return []

        # Dense retrieval
        query_emb = co.embed(
            texts=[query], model=self.EMBED_MODEL, input_type="search_query"
        ).embeddings

        labels, _distances = self.idx.knn_query(query_emb, k=top_k)
        # A single query was sent, so the first row holds its neighbours.
        doc_ids = [int(label) for label in labels[0]]

        # Reranking
        rank_fields = ["title", "text"]  # We'll use the title and text fields for reranking

        docs_to_rerank = [self.docs[doc_id] for doc_id in doc_ids]
        rerank_results = co.rerank(
            query=query,
            documents=docs_to_rerank,
            top_n=min(self.rerank_top_k, len(docs_to_rerank)),
            model=self.RERANK_MODEL,
            rank_fields=rank_fields,
        )

        # result.index is a position within docs_to_rerank; map it back to a chunk id.
        doc_ids_reranked = [doc_ids[result.index] for result in rerank_results.results]

        # Return copies so callers cannot mutate the stored chunks.
        return [
            {
                "title": self.docs[doc_id]["title"],
                "text": self.docs[doc_id]["text"],
                "url": self.docs[doc_id]["url"],
            }
            for doc_id in doc_ids_reranked
        ]

In [14]:
# Build the vector store: this loads, chunks, embeds and indexes the source pages
vectorstore = Vectorstore(raw_documents=raw_documents)

Loading documents...
Embedding document chunks...
Indexing document chunks...
Indexing complete with 131 document chunks (dim=1536).


## Dense retrieval
* First, we embed the query using the same `embed-v4.0` model we used to embed the document chunks, but this time we set `input_type="search_query"`.

* Search is performed by the `knn_query()` method from the `hnswlib` library. Given a query, it returns the document chunks most similar to the query. We can define the number of document chunks to return using the attribute `self.retrieve_top_k`, which is set to 10.

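## Reranking
* The chunks returned by dense retrieval are then reranked against the query with the `rerank-english-v3.0` model, using each chunk's `title` and `text` fields. Only the top `self.rerank_top_k` chunks (3 by default) are returned.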

In [15]:
# Smoke-test retrieval: show the reranked chunks returned for a sample query
vectorstore.retrieve(query="Prompting by giving examples")

[{'title': 'Advanced Prompt Engineering Techniques',
  'text': 'Few-shot Prompting\n\nUnlike the zero-shot examples above, few-shot prompting is a technique that provides a model with examples of the task being performed before asking the specific question to be answered. We can steer the LLM toward a high-quality solution by providing a few relevant and diverse examples in the prompt. Good examples condition the model to the expected response type and style.',
  'url': 'https://docs.cohere.com/docs/advanced-prompt-engineering-techniques'},
 {'title': 'Crafting Effective Prompts',
  'text': 'Incorporating Example Outputs\n\nLLMs respond well when they have specific examples to work from. For example, instead of asking for the salient points of the text and using bullet points “where appropriate”, give an example of what the output should look like.',
  'url': 'https://docs.cohere.com/docs/crafting-effective-prompts'},
 {'title': 'Advanced Prompt Engineering Techniques',
  'text': 'In a

### Run chatbot

In [16]:
def run_chatbot(message, chat_history=None):
    """
    Runs one chatbot turn and prints the streamed reply.

    The Chat endpoint is first called in query-generation mode. If it returns
    search queries, document chunks are retrieved from the module-level
    `vectorstore` for each query and passed to the Chat endpoint so the reply
    is grounded in them; otherwise the Chat endpoint is called without
    documents. Both calls use the module-level Cohere client `co`.

    Parameters:
    message (str): The user's message for this turn.
    chat_history (list, optional): The chat history from the previous turns. Defaults to an empty history.

    Returns:
    list: The chat history reported by the stream-end event, or the incoming
    history unchanged if no stream-end event is received.
    """
    if chat_history is None:
        chat_history = []

    # Generate search queries, if any
    response = co.chat(
        message=message,
        model="command-a-03-2025",
        search_queries_only=True,
        chat_history=chat_history,
    )

    # Guard against a missing list as well as an empty one.
    search_queries = [query.text for query in (response.search_queries or [])]

    # If there are search queries, retrieve the documents
    if search_queries:
        print("Retrieving information...", end="")

        # Retrieve document chunks for each query. Different queries can return
        # the same chunk, so keep only the first occurrence (order preserved).
        documents = []
        seen_chunks = set()
        for query in search_queries:
            for document in vectorstore.retrieve(query):
                chunk_key = (document.get("url"), document.get("text"))
                if chunk_key in seen_chunks:
                    continue
                seen_chunks.add(chunk_key)
                documents.append(document)

        # Use document chunks to respond
        response = co.chat_stream(
            message=message,
            model="command-a-03-2025",
            documents=documents,
            chat_history=chat_history,
        )

    else:
        response = co.chat_stream(
            message=message,
            model="command-a-03-2025",
            chat_history=chat_history,
        )

    # Print the chatbot response, citations, and documents
    chatbot_response = ""
    print("\nChatbot:")

    for event in response:
        if event.event_type == "text-generation":
            print(event.text, end="")
            chatbot_response += event.text
        elif event.event_type == "stream-end":
            if event.response.citations:
                print("\n\nCITATIONS:")
                for citation in event.response.citations:
                    print(citation)
            if event.response.documents:
                print("\nCITED DOCUMENTS:")
                for document in event.response.documents:
                    print(document)
            # Update the chat history for the next turn
            chat_history = event.response.chat_history

    return chat_history

In [17]:
# Turn 1: open the conversation (no prior history, so a fresh one is started)
chat_history = run_chatbot(message="Hello, I have a question")


Chatbot:
Hello! I'm here to help. Please go ahead and ask your question, and I'll do my best to provide a helpful and informative answer.

In [18]:
# Turn 2: ask a question, carrying over the history from turn 1
chat_history = run_chatbot(
    message="What's the difference between zero-shot and few-shot prompting",
    chat_history=chat_history,
)


Chatbot:
Great question! Zero-shot and few-shot prompting are techniques used in natural language processing (NLP) to guide language models in generating or classifying text without explicit fine-tuning. Here’s the difference between the two:

### **Zero-Shot Prompting**
- **Definition**: Zero-shot prompting involves providing the model with a task or question without any examples. The model relies solely on its pre-trained knowledge to generate a response.
- **Example**: Asking a model, *"What is the capital of France?"* without providing any context or examples of similar questions.
- **Use Case**: Useful when the model is expected to generalize its knowledge to new tasks or questions it hasn't explicitly seen before.
- **Advantage**: Requires no additional training data or examples, making it efficient for quick queries.
- **Limitation**: May not perform well on highly specific or complex tasks where examples could improve understanding.

### **Few-Shot Prompting**
- **Definition**

In [19]:
# Turn 3: a follow-up that only makes sense given the previous turns
chat_history = run_chatbot(
    message="How would the latter help?",
    chat_history=chat_history,
)


Chatbot:
Few-shot prompting helps by providing the language model with a small set of examples that illustrate the task or question format. These examples act as a **contextual guide**, allowing the model to better understand what is expected of it. Here’s how few-shot prompting improves performance:

### **1. Clarifies Task Expectations**
   - The examples in few-shot prompting explicitly show the model the input-output pairs or the reasoning process required for the task.
   - For instance, if you’re asking the model to classify sentiment, providing examples like *"Text: 'I love this movie!' Sentiment: Positive"* helps the model understand the format and criteria for classification.

### **2. Improves Generalization**
   - While the model is pre-trained on a vast corpus of text, few-shot examples help it **adapt** to the specific nuances of the task at hand.
   - This is particularly useful for tasks that are not well-represented in the pre-training data or require domain-specific k

In [20]:
# Turn 4: a question on a topic the ingested pages do not cover
chat_history = run_chatbot(
    message="What do you know about 5G networks?",
    chat_history=chat_history,
)


Chatbot:
Great question! 5G networks represent the fifth generation of wireless communication technology, designed to significantly enhance speed, capacity, and responsiveness compared to previous generations (like 4G LTE). Here’s a breakdown of what you need to know about 5G:

---

### **Key Features of 5G Networks**
1. **Higher Speeds**:
   - 5G offers **peak speeds** of up to 20 Gbps (gigabits per second), though real-world speeds are typically lower but still much faster than 4G.
   - This enables seamless streaming of 4K/8K video, faster downloads, and improved performance for data-intensive applications.

2. **Lower Latency**:
   - 5G reduces **latency** (delay in data transmission) to as low as **1 millisecond** (compared to 20–30 ms for 4G).
   - This is crucial for real-time applications like autonomous vehicles, remote surgery, and online gaming.

3. **Increased Capacity**:
   - 5G supports a **higher density of connected devices** per square kilometer, making it ideal for t

In [21]:

# Show the accumulated conversation, one entry per message
print("Chat history:")
for turn in chat_history:
    print(turn, "\n")
print("=" * 50)

Chat history:
role='USER' message='Hello, I have a question' tool_calls=None 

role='CHATBOT' message="Hello! I'm here to help. Please go ahead and ask your question, and I'll do my best to provide a helpful and informative answer." tool_calls=None 

role='USER' message="What's the difference between zero-shot and few-shot prompting" tool_calls=None 

role='CHATBOT' message='Great question! Zero-shot and few-shot prompting are techniques used in natural language processing (NLP) to guide language models in generating or classifying text without explicit fine-tuning. Here’s the difference between the two:\n\n### **Zero-Shot Prompting**\n- **Definition**: Zero-shot prompting involves providing the model with a task or question without any examples. The model relies solely on its pre-trained knowledge to generate a response.\n- **Example**: Asking a model, *"What is the capital of France?"* without providing any context or examples of similar questions.\n- **Use Case**: Useful when the mo