In [None]:
!pip -q install langchain huggingface_hub openai tiktoken pypdf
!pip -q install google-generativeai faiss-cpu chromadb unstructured
!pip -q install sentence_transformers
!pip -q install -U FlagEmbedding
!pip install langchain-openai

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse a key that is already
# present in the environment; otherwise ask for it interactively so it is not
# saved with the notebook source. (The previous code assigned an empty string,
# which overwrote any real key and made every OpenAI call fail.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

# Directory under which the Drive contents are mounted.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install langchain_community -q

In [None]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader

# from langchain.embeddings.openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings()

In [None]:

!pip install -U langchain-huggingface
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Hugging Face model id of the embedding model.
model_name = "BAAI/bge-small-en"
# Run the embedding model on CPU.
model_kwargs = dict(device="cpu")
# Normalised embeddings, intended for cosine-similarity comparison.
encode_kwargs = dict(normalize_embeddings=True)

# Embedding function shared by the vector stores created later in the notebook.
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the source PDF inside the runtime's filesystem.
PDF_PATH = "/content/comments_new.docx_20241223_174156_0000.pdf"

# Loader only; the file is actually read when `loader.load()` is called.
loader = PyPDFLoader(PDF_PATH)

In [None]:
# Read the PDF into a list of documents and display the first one as a sanity check.
docs = loader.load()
docs[0]

In [None]:
# Number of documents produced by the loader.
len(docs)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

In [None]:
!pip install -U langchain-chroma
from langchain_chroma import Chroma

Retrieving complete (full) documents rather than chunks

In [None]:
# Splitter that produces the small child chunks (100 characters, 20 overlap).
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=100)

# Storage layer that keeps the parent documents in memory.
store = InMemoryStore()

# Vector store in which the child chunks are indexed.
vectorstore = Chroma(embedding_function=hf, collection_name="full_documents")

# No parent splitter is supplied here, only a child splitter.
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [None]:
# Feed the loaded documents to the retriever so they can be searched.
retriever.add_documents(docs)

In [None]:
# Number of keys currently held in the parent-document store.
len(list(store.yield_keys()))

In [None]:
# Query the vector store directly (bypassing the retriever) for the 2 closest entries.
sub_docs= vectorstore.similarity_search("what is the common complaint",k=2)

In [None]:
# Number of hits returned by the direct vector-store search.
len(sub_docs)

In [None]:
# Show the text of the second hit (index 1) from the direct vector-store search.
print(sub_docs[1].page_content)

In [None]:
# Same question, this time through the retriever instead of the raw vector store.
rd=retriever.invoke("what is the common complaint")

In [None]:
# Character count of the second retrieved document (index 1).
# NOTE(review): raises IndexError if fewer than two documents were returned;
# the next cell prints index 0 — confirm index 1 is intended here.
len(rd[1].page_content)

In [None]:
# Show the text of the first document returned by the retriever.
print(rd[0].page_content)

Retrieving larger chunks instead of full documents

In [None]:
# Two-level splitting: parents are 500-character chunks (50 overlap) and the
# children, which must be smaller than the parents, are 100-character chunks
# (20 overlap).
parent_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=100)

# Fresh in-memory storage layer for the parent documents.
store = InMemoryStore()

# Separate collection in which the child chunks are indexed.
vectorstore = Chroma(embedding_function=hf, collection_name="split_parents")

In [None]:
# Retriever configured with both splitters: a parent splitter in addition to
# the child splitter.
chunk_retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [None]:
# Feed the loaded documents to the chunk-based retriever.
chunk_retriever.add_documents(docs)

In [None]:
# Number of keys currently held in the parent-document store.
len(list(store.yield_keys()))

In [None]:
# Query the vector store directly; no `k` is passed, so the library default applies.
sub_docs= vectorstore.similarity_search("what is the common complaint")

In [None]:
# Show the text of the first hit from the direct vector-store search.
print(sub_docs[0].page_content)

In [None]:
# Number of hits returned by the direct vector-store search.
len(sub_docs)

In [None]:
# Query the chunk-based retriever built in this section. The earlier code
# queried `retriever` (the full-document retriever from the previous section),
# so the inspection cells below were not looking at this section's results.
rd = chunk_retriever.invoke("what is the common complaint")

In [None]:
# Number of documents held in `rd`.
len(rd)

In [None]:
# Character count of the first document in `rd`.
len(rd[0].page_content)

In [None]:
# Show the text of the first document in `rd`.
print(rd[0].page_content)

In [None]:
from langchain_openai import ChatOpenAI

# Settings for the OpenAI chat model used by the QA chain.
chat_model_options = {
    "model": "gpt-3.5-turbo",  # OpenAI model to use
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 3,
}
llm = ChatOpenAI(**chat_model_options)

In [None]:
from langchain.chains import RetrievalQA


# Question-answering chain: documents come from the chunk-based retriever and
# are handed to the chat model using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    retriever=chunk_retriever,
    chain_type="stuff",
    llm=llm,
)

In [None]:
# Ask the QA chain the same question used in the retrieval experiments above.
query="what is the common complaint"
qa.invoke(query)

DSPy

In [None]:
!pip install -U dspy

In [None]:
import dspy
from dsp.utils import deduplicate
# SECURITY: API keys used to be hardcoded in this cell. Anything that was
# committed here must be treated as leaked and revoked/rotated. Keys are now
# read from the environment only.
#turbo=dspy.Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1",api_key=os.environ["TOGETHER_API_KEY"])
import os
import dspy

# Fail early with a clear message instead of sending an empty or missing key.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it (or run the key-setup cell) "
        "before creating the DSPy language model."
    )
lm = dspy.LM('openai/gpt-4o-mini', api_key=openai_api_key)


In [None]:
# Rebind the notebook-level name `retriever` (earlier: the full-document
# retriever) to the chunk-based retriever; the cells below use this name.
retriever=chunk_retriever
class ChromaRetriever:
    """Callable adapter that returns retriever hits as plain dictionaries.

    Each hit becomes ``{"long_text": <page_content>, "metadata": <metadata>}``.
    """

    def __init__(self, retriever):
        """Store the wrapped retriever.

        Args:
            retriever: Object exposing ``invoke(query)`` (or, for older
                retrievers, ``get_relevant_documents(query)``) that returns a
                sequence of documents with ``page_content`` and ``metadata``.
        """
        self.retriever = retriever

    def __call__(self, query, k=5):
        """Run ``query`` and return at most ``k`` formatted hits.

        Args:
            query: Search string passed to the wrapped retriever.
            k: Maximum number of hits to return; ``None`` returns all of them.

        Returns:
            A list of dicts with the keys ``long_text`` and ``metadata``.
        """
        # Prefer `invoke`, which the rest of the notebook already uses; fall
        # back to the legacy method for retrievers that lack it.
        fetch = getattr(self.retriever, "invoke", None)
        if fetch is None:
            fetch = self.retriever.get_relevant_documents
        results = fetch(query)

        # `k` was previously accepted but silently ignored.
        if k is not None:
            results = results[:k]

        return [
            {"long_text": result.page_content, "metadata": result.metadata}
            for result in results
        ]

# Wrap the current `retriever` in the dict-returning adapter
chroma_retriever = ChromaRetriever(retriever)

# Configure DSPy's default language model. Only `lm` is set here;
# `chroma_retriever` is NOT registered with DSPy as a retrieval model.
dspy.configure(lm=lm)


In [None]:
import dspy
from dspy import Signature, InputField, OutputField, Module, ChainOfThought

# DSPy signature: inputs `context` and `question`, output `query`.
# Documented with comments rather than a docstring on purpose: DSPy uses a
# Signature's docstring as the task instructions, so adding one would change
# the prompt sent to the model.
class GenerateQuery(Signature):
    context = InputField(desc="may contain relevant facts")
    question = InputField()
    query = OutputField()

# DSPy signature: inputs `context` and `question`, output `answer`.
# Documented with comments rather than a docstring on purpose: DSPy uses a
# Signature's docstring as the task instructions, so adding one would change
# the prompt sent to the model.
class GenerateAnswer(Signature):
    context = InputField(desc="contains parent documents")
    question = InputField()
    answer = OutputField(desc="a concise and relevant response")

class ParentDocumentRetriever(Module):
    """DSPy module: rewrite the question into a query, retrieve, then answer.

    NOTE: this class has the same name as LangChain's
    ``ParentDocumentRetriever`` imported near the top of the notebook, so it
    shadows that import once this cell has run. Cells that construct the
    LangChain retrievers must be executed before this one.
    """

    def __init__(self, retriever, max_docs=5):
        """Set up the two chain-of-thought steps around ``retriever``.

        Args:
            retriever: Object exposing ``invoke(query)`` that returns documents
                with ``page_content`` and ``metadata`` attributes.
            max_docs: Maximum number of retrieved documents added to the context.
        """
        super().__init__()
        self.generate_query = ChainOfThought(GenerateQuery)
        self.retriever = retriever
        self.generate_answer = ChainOfThought(GenerateAnswer)
        self.max_docs = max_docs

    def retrieve_parent_documents(self, query):
        """Return the retriever's hits for ``query`` as plain dictionaries.

        Returns:
            A list of ``{"long_text": page_content, "metadata": metadata}`` dicts.
        """
        # `invoke` matches how retrievers are called elsewhere in the notebook
        # and replaces the deprecated `get_relevant_documents`.
        results = self.retriever.invoke(query)
        return [
            {"long_text": result.page_content, "metadata": result.metadata}
            for result in results
        ]

    def forward(self, question, context=None):
        """Answer ``question`` using retrieved documents plus optional ``context``.

        Args:
            question: The user question.
            context: Optional list of passages to start from. It is copied, so
                the caller's list is never modified.

        Returns:
            dict with ``"context"`` (the starting passages followed by the
            retrieved document texts) and ``"answer"`` (the generated answer).
        """
        # Work on a copy: extending the argument in place would leak retrieved
        # passages into the caller's list across calls.
        context = list(context) if context else []

        # Step 1: turn the question (and any starting context) into a search query
        query_response = self.generate_query(context=context, question=question)
        query = query_response.query

        # Step 2: retrieve documents for that query, capped at `max_docs`
        parent_documents = self.retrieve_parent_documents(query)[:self.max_docs]
        context.extend(doc["long_text"] for doc in parent_documents)

        # Step 3: answer the original question from the enlarged context
        answer_response = self.generate_answer(context=context, question=question)

        return {
            "context": context,
            "answer": answer_response.answer,
        }

# Build the DSPy pipeline around the notebook's current `retriever`
parent_doc_retriever = ParentDocumentRetriever(retriever=retriever)

# Example question and an (empty) starting context
original_query = "Why few customers gave one star rating to the app?"
rag_fusion_context = []

# Call the module itself rather than `.forward()` directly: DSPy modules are
# meant to be invoked through `__call__`, which then dispatches to `forward`.
# The former `or []` was redundant because `forward` already handles an empty
# or missing context.
output = parent_doc_retriever(question=original_query, context=rag_fusion_context)

# Display results
print("Retrieved Parent Documents Context:\n", output["context"])
print("\nGenerated Answer:\n", output["answer"])
