In [1]:
# Install/upgrade LangChain, its community, Chroma and Google GenAI integrations,
# and the Vertex AI SDK into the kernel's environment.
# NOTE(review): no versions are pinned and --upgrade is used, so a rerun may pull
# different releases than the ones that produced the outputs below — consider pinning.
%pip install --quiet --upgrade langchain langchain-community langchain-chroma langchain-google-genai google-cloud-aiplatform

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.5/615.5 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0

In [2]:
from langchain_google_genai import GoogleGenerativeAI
from vertexai.language_models import TextGenerationModel, \
                                     TextEmbeddingModel, \
                                     ChatModel, \
                                     InputOutputTextPair, \
                                     CodeGenerationModel, \
                                     CodeChatModel

In [3]:
from google.colab import userdata
import google.generativeai as genai

# Read the Gemini API key from Colab's user-data store (entry name: GEMINI_API_KEY)
# instead of hardcoding it, then register it with the genai client used below.
GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Generation settings handed to genai.GenerativeModel by the CustomLLM wrappers below.
generation_config = dict(
    temperature=0.2,
    max_output_tokens=1024,
    top_k=40,
    top_p=0.8,
)

In [5]:
class CustomLLM:
    """Minimal wrapper around ``genai.GenerativeModel`` exposing ``predict``.

    Args:
        model_name: Model identifier passed to ``genai.GenerativeModel``.
        generation_config: Generation settings passed to ``genai.GenerativeModel``
            when the model object is created.
    """

    def __init__(self, model_name, generation_config):
        self.llm = genai.GenerativeModel(model_name=model_name, generation_config=generation_config)

    def predict(self, prompt):
        """Send ``prompt`` to the model and return the text of its response.

        No per-call configuration is passed: the settings given to ``__init__``
        are already attached to ``self.llm``. The earlier version passed the
        notebook-level ``generation_config`` global here, which ignored whatever
        configuration this particular instance had been constructed with.

        Args:
            prompt: Content forwarded unchanged to ``generate_content``.

        Returns:
            The ``text`` attribute of the model response.
        """
        response = self.llm.generate_content(prompt)
        return response.text

# Create the wrapper for the "gemini-1.5-flash" model using the generation settings defined above
llm = CustomLLM(model_name="gemini-1.5-flash", generation_config=generation_config)

In [6]:
# Smoke test: send a plain-string prompt straight to the model (no retrieval involved yet).
llm.predict("Explain RAG ")

'RAG stands for **Retrieval Augmented Generation**.  It\'s a technique used in large language models (LLMs) to improve their accuracy, factual consistency, and overall performance by augmenting their generation capabilities with external knowledge retrieval.\n\nHere\'s a breakdown:\n\n1. **Retrieval:**  Before generating text, the LLM first retrieves relevant information from an external knowledge base. This knowledge base could be a database, a set of documents, a vector database, or any other structured or unstructured data source.  The retrieval process is crucial; it needs to efficiently find the most pertinent information related to the user\'s prompt or query.  Techniques like keyword search, semantic search (using embeddings), or hybrid approaches are commonly employed.\n\n2. **Augmentation:** The retrieved information is then "augmented" into the LLM\'s input.  This means the retrieved context is added to the user\'s prompt before the LLM generates its response.  This allows th

## Let's get our embedding model

In [7]:
from vertexai.language_models import TextGenerationModel

def get_embedding(text, task_type="classification", model='models/text-embedding-004'):
    """Return the embedding vector for ``text`` from the Gemini embedding API.

    Args:
        text: Content to embed; forwarded as ``content`` to ``genai.embed_content``.
        task_type: Task type forwarded to ``genai.embed_content``. Defaults to
            ``"classification"``, the value that used to be hard-coded here, so
            existing callers are unaffected. The Gemini API also documents
            ``"retrieval_document"`` and ``"retrieval_query"`` task types for
            search/RAG workloads; pass one of those when embedding for retrieval.
        model: Embedding model name. Defaults to the previously hard-coded
            ``'models/text-embedding-004'``.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.
    """
    response = genai.embed_content(
        model=model,
        content=text,
        task_type=task_type,
    )
    return response["embedding"]

# Example usage: embed a short string and show the result.
# NOTE(review): this prints the entire embedding vector; printing len(embedding)
# and a short slice (e.g. embedding[:5]) would keep the cell output readable.
text_to_embed = "Explain RAG"
embedding = get_embedding(text_to_embed)
print(f"Embedding for '{text_to_embed}': {embedding}")

Embedding for 'Explain RAG': [-0.031265818, -0.01200324, -0.018277168, -0.057719175, -0.05360208, 0.037008822, 0.014218218, -0.0071408856, 0.01883804, -0.02317546, -0.010959513, -0.0020360718, 0.054083217, -0.02349909, 0.0149142025, -0.026392942, 0.02560917, -0.01579812, -0.04219144, -0.054731935, 0.040816985, -0.015227761, -0.01477886, -0.074283645, 0.015418062, 0.010576395, 0.027904585, 0.04222901, -0.039662503, -0.047601696, 0.08199424, -0.00094831927, 0.04987184, 0.0019320913, -0.026516354, 0.06062386, 0.037170526, -0.018388355, 0.04653768, -0.038431577, -0.031458832, 0.053330466, 0.012552603, 0.0065822, -0.014078477, -0.06999556, 0.0153857935, 0.015722433, -0.033375785, 0.025558593, -0.07875834, 0.01948818, -0.05803678, 0.022905083, -0.027393917, -0.041031953, -0.006873871, -0.02668544, 0.03120448, 0.010211601, 0.061373048, 0.020327562, 0.042873304, -0.03155664, 0.024771523, 0.0053938786, -0.026198937, 0.040809836, -0.07226313, 0.022172326, -0.04383284, 0.07055953, 0.034766685, 0.

In [10]:
from langchain_core.embeddings import Embeddings
from typing import List

## Let's create a custom Embeddings class for Gemini, since this setup was originally made for OpenAI
class CustomGeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``genai.embed_content``.

    Stored chunks and search queries are embedded with the retrieval-specific
    task types (``retrieval_document`` / ``retrieval_query``) rather than
    ``classification``, because this class is what the vector store uses for
    similarity search.
    """

    # Embedding model used for both documents and queries.
    model_name = 'models/text-embedding-004'

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of document chunks.

        Args:
            texts: Chunk texts to embed.

        Returns:
            One embedding vector per input text, in input order. An empty
            input returns an empty list without calling the API.
        """
        if not texts:
            return []
        # Passing a list makes the SDK embed all chunks in batched requests
        # instead of issuing one HTTP call per chunk.
        response = genai.embed_content(
            model=self.model_name,
            content=list(texts),
            task_type="retrieval_document",
        )
        return [list(vector) for vector in response["embedding"]]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query.

        Args:
            text: The query string.

        Returns:
            The embedding vector for the query.
        """
        response = genai.embed_content(
            model=self.model_name,
            content=text,
            task_type="retrieval_query",
        )
        return response["embedding"]

class CustomLLM:
    """Wrapper around ``genai.GenerativeModel`` that accepts several prompt shapes.

    This redefines the simpler ``CustomLLM`` from the earlier cell. Supported
    prompt inputs are a plain string, a ``{'context': ..., 'question': ...}``
    dict, a ``('messages', [message, ...])`` tuple, and a LangChain
    ``ChatPromptValue``.

    Args:
        model_name: Model identifier passed to ``genai.GenerativeModel``.
        generation_config: Generation settings passed to ``genai.GenerativeModel``.
    """

    def __init__(self, model_name, generation_config):
        self.llm = genai.GenerativeModel(model_name=model_name, generation_config=generation_config)

    @staticmethod
    def _prompt_to_text(prompt):
        """Convert a supported prompt object into the text sent to the model.

        Raises:
            ValueError: If ``prompt`` is not one of the supported formats.
        """
        # String format: used as-is.
        if isinstance(prompt, str):
            return prompt

        # Dictionary format: missing keys are rendered as empty strings.
        if isinstance(prompt, dict):
            return (
                f"Context: {prompt.get('context', '')}\n\n"
                f"Question: {prompt.get('question', '')}\n\n"
                "Please answer the question based on the context provided."
            )

        # Tuple format ('messages', [...]): only the first message's content is used.
        if isinstance(prompt, tuple):
            if len(prompt) == 2 and prompt[0] == 'messages':
                return prompt[1][0].content
        else:
            # Imported here because the notebook-level import of ChatPromptValue
            # lives in a later cell; relying on it made every call (even with a
            # plain string) fail with NameError until that cell had been run.
            from langchain_core.prompt_values import ChatPromptValue

            if isinstance(prompt, ChatPromptValue):
                # Combine the content of all messages, one per line.
                return "\n".join(msg.content for msg in prompt.messages)

        raise ValueError(f"Unsupported prompt format: {type(prompt)}, content: {prompt}")

    def predict(self, prompt):
        """Generate a response for ``prompt`` and return its text.

        Args:
            prompt: A string, dict, ``('messages', [...])`` tuple, or
                ``ChatPromptValue`` (see the class docstring).

        Returns:
            The ``text`` attribute of the model response.

        Raises:
            ValueError: If the prompt format is not supported.
        """
        response = self.llm.generate_content(self._prompt_to_text(prompt))
        return response.text

    def __call__(self, prompt):
        """Alias for ``predict`` so the instance can be used as a callable."""
        return self.predict(prompt)


## Let's start building!

## Let's first build our vector storage

In [11]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompt_values import ChatPromptValue

# Initialize the chat model wrapper and the embedding adapter used by the vector store
llm = CustomLLM(model_name="gemini-1.5-flash-8b", generation_config=generation_config)
embeddings = CustomGeminiEmbeddings()

# Load the blog post, keeping only the elements whose CSS class marks the
# post title, header, or body (everything else on the page is skipped)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split the page into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Create the vector store with deterministic ids. Without explicit ids every
# chunk gets a fresh random id, so re-running this cell in the same kernel
# adds the same chunks again and retrieval starts returning duplicates.
chunk_ids = [f"chunk-{index}" for index in range(len(splits))]
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, ids=chunk_ids)
retriever = vectorstore.as_retriever()

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)




## What is stored in our vector database?

In [12]:
def _preview(text, limit=500):
    """Return ``text`` cut to ``limit`` characters, with "..." appended if it was longer."""
    return text[:limit] + "..." if len(text) > limit else text


# Fetch the stored records once and reuse them for the count, the contents, and
# the metadata (avoids a second full fetch and the private `_collection` attribute)
documents = vectorstore.get()

# Get total number of documents
print(f"Total documents in vector store: {len(documents['ids'])}")

# Show all the documents
print("\nDocument Contents:")
for i, doc in enumerate(documents['documents'], 1):
    print(f"\nDocument {i}:")
    print(_preview(doc))  # Print first 500 chars of each doc

# You can also search for similar documents for a specific query
query = "task decomposition"
similar_docs = vectorstore.similarity_search(query, k=2)  # k is the number of documents to return
print("\nMost similar documents to query:", query)
for i, doc in enumerate(similar_docs, 1):
    print(f"\nSimilar Document {i}:")
    print(_preview(doc.page_content))

# Show all the metadata
metadatas = documents['metadatas']
print("\nMetadata for documents:")
for i, metadata in enumerate(metadatas, 1):
    print(f"\nMetadata {i}:", metadata)

Total documents in vector store: 66

Document Contents:

Document 1:
LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-p...

Document 2:
Memory

Short-term memory: I would consider all the in-context learning (See Prompt Engineering) as utilizing short-term memory of the model to learn.
Long-term memory: This provides the agent with the capability to retain and recall (infinite) information over extended periods, often by leveraging an external vector store and fast retrieval.


Tool use

The agent learns to call external APIs for extra informat

In [15]:
# Create a simple RAG chain
class SimpleRAG:
    """Retrieve-then-generate helper: looks up context for a question and asks the LLM.

    Args:
        llm: Object exposing ``predict(prompt)``.
        retriever: Object exposing ``invoke(question)`` that returns documents
            with a ``page_content`` attribute.
    """

    def __init__(self, llm, retriever):
        self.retriever = retriever
        self.llm = llm

    def get_response(self, question: str) -> str:
        """Answer ``question`` using the retrieved documents as context.

        Returns:
            Whatever ``llm.predict`` returns for the assembled prompt.
        """
        # Retrieve the relevant chunks and merge them into one context string
        retrieved = self.retriever.invoke(question)
        context = "\n\n".join([chunk.page_content for chunk in retrieved])

        # Assemble the instruction, context, and question into a single prompt
        prompt = (
            "Use the following context to answer the question. "
            "If you don't know the answer, just say that you don't know. "
            "Use three sentences maximum and keep the answer concise.\n\n"
            f"Context: {context}\n\n"
            f"Question: {question}\n\n"
            "Answer:"
        )

        return self.llm.predict(prompt)

# Build the RAG helper from the model and retriever created above, then ask a
# question whose answer should come from the indexed blog post
simple_rag = SimpleRAG(llm, retriever)
response = simple_rag.get_response("What is Task Decomposition?")
print(response)

Task decomposition is a technique, like Chain of Thought (CoT), used to break down complex tasks into smaller, simpler steps.  It helps models understand and solve problems more effectively.  This method improves model performance on complex tasks.



In [16]:
# Ask a second question through the same retrieve-then-generate path.
# NOTE(review): simple_rag is rebuilt with the same llm/retriever as in the
# previous cell; the existing instance could be reused.
simple_rag = SimpleRAG(llm, retriever)
response = simple_rag.get_response("What are agents?")
print(response)

Agents are systems with LLMs as their brains.  They break down complex tasks into smaller steps and can learn from past actions.  They use commands like "google search" and "browse website" to complete tasks.



In [17]:
# Ask about the article's author through the RAG helper; the same question is
# sent to the bare model (no retrieval) in a later cell for comparison.
# NOTE(review): simple_rag is rebuilt again here; the existing instance could be reused.
simple_rag = SimpleRAG(llm, retriever)
response = simple_rag.get_response("Who is the author of LLM Powered Autonomous Agents?")
print(response)

Lilian Weng wrote the article.  It was published on June 23, 2023.  The article is about LLM-powered autonomous agents.



## Are we sure the model is using the retrieved context (RAG) and not its training data?

In [18]:
# Baseline without retrieval: send the same author question directly to the model,
# so its answer can be compared with the RAG answer above.
llm.predict("Who is the author of LLM Powered Autonomous Agents?")

'Unfortunately, there isn\'t a single definitive author for a work titled "LLM Powered Autonomous Agents."  The concept is a field of study and research, not a single authored book.  Many researchers and authors have contributed to the various aspects of this field.  To find relevant information, you\'d need to search for papers and articles on the topic of large language models and autonomous agents.\n'