In [1]:
from langchain_chroma import Chroma # vector database
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate,ChatMessagePromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.pydantic_v1 import BaseModel, Field
import PyPDF2
from langchain_core.documents import Document

In [2]:
# ! pip install -q langchain_community tiktoken langchain-openai langchainhub chromadb langchain langgraph tavily-python

In [3]:
reader = PyPDF2.PdfReader("AWS CLOUD VIRTUAL INTERNSHIP.pdf")
docs=[]
no=1
for page in reader.pages:
    docs.append(Document(page_content=page.extract_text(),metadata={"page_number":str(no)}))
    no += 1

    

In [4]:
print(docs[0])

page_content='                  1 \n \n        AWS CLOUD VIRTUAL INTERNSHIP  \n               Internship -I report submitted in partial fulfillment of   \n                       requirements for the award of degree of  \n \n      BACHELOR OF TECHNOLOGY  \n                  IN \n     COMPUTER SCIENCE AND ENGINEERING    with CSD and AI & ML  \n                           By \nMOHAN KRISHNA                            (21131A4452)  \nSAI GANESH THYADI                         (22135A4407 ) \nSRI SADHIK VARMA                         (21131A4260)  \n \n \n                                         Under the esteemed guidance of  \n \n \n \n \n \n \nDepartment of Computer Science and Engineering  \nGAYATRI VIDYA PARISHAD COLLEGE OF ENGINEERING (AUTONOMOUS)  \n(Affiliated to JNTU -K, Kakinada)  \nVISAKHAPATNAM  \n2022  – 2023  \nName of Course Coordinator                                 Name of Course Mentors  \n Dr. CH. SITA KUMARI                             Mr. G. DURGA  RAO  \n (Associate Prof

In [5]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=256,chunk_overlap=0)
doc_splits = text_splitter.split_documents(docs)

In [6]:
from langchain_community.embeddings import GPT4AllEmbeddings
embedding = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

In [7]:
vectorstore = Chroma.from_documents(documents = doc_splits, collection_name = "rag-data",embedding=embedding)

In [8]:
retriever = vectorstore.as_retriever()

In [9]:
class GradeDocuments(BaseModel):
    binary_score: str = Field(description="Documents are relevant to the question , 'yes' or 'no'")



In [10]:
from langchain_fireworks import ChatFireworks
llm = ChatFireworks(model="accounts/fireworks/models/firefunction-v1")
structured_llm_grader = llm.with_structured_output(GradeDocuments)



In [11]:
prompt = PromptTemplate(
    template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    If the document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question or not.Just give yes or no as answer \n
    """,
    input_variables=["question", "document"],
)
retrival_grader = prompt | structured_llm_grader # It tells yes or no,whether the question and documentsa re same or not
ans=retrival_grader.invoke({"question":"What is AWS?", "document":doc_splits[0].page_content})

In [12]:
from langchain import hub


prompt = hub.pull("rlm/rag-prompt")
llm = ChatFireworks(model="accounts/fireworks/models/firefunction-v1")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)    

rag_chain =  prompt | llm | StrOutputParser()

In [13]:
### Question Re-writer



# Prompt
re_write_prompt = PromptTemplate(
    template="""You a question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Look at the initial and formulate an improved question. \n
     Here is the initial question: \n\n {question}. Improved question with no preamble: \n """,
    input_variables=["generation", "question"],
)

question_rewriter = re_write_prompt | llm | StrOutputParser()


In [14]:
### Search

from langchain_community.tools.tavily_search import TavilySearchResults

web_search_tool = TavilySearchResults(k=3)

In [15]:
from typing import List

from typing_extensions import TypedDict


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        question: question
        generation: LLM generation
        web_search: whether to add search
        documents: list of documents
    """

    question: str
    generation: str
    web_search: str
    documents: List[str]

In [16]:
from langchain.schema import Document


def retrieve(state):
    """
    Retrieve documents

    Args:
        state (dict): The current graph state

    Returns:ß
        state (dict): New key added to state, documents, that contains retrieved documents
    """
    print("---RETRIEVE---")
    question = state["question"]

    # Retrieval
    documents = retriever.get_relevant_documents(question)
    return {"documents": documents, "question": question}


def generate(state):
    """
    Generate answer

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # RAG generation
    generation = rag_chain.invoke({"context": documents, "question": question})
    return {"documents": documents, "question": question, "generation": generation}


def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with only filtered relevant documents
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    web_search = "No"
    for d in documents:
        score = retrival_grader.invoke(
            {"question": question, "document": d.page_content}
        )
        grade = score.binary_score
        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            web_search = "Yes"
            continue
    return {"documents": filtered_docs, "question": question, "web_search": web_search}


def transform_query(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates question key with a re-phrased question
    """

    print("---TRANSFORM QUERY---")
    question = state["question"]
    documents = state["documents"]

    # Re-write question
    better_question = question_rewriter.invoke({"question": question})
    return {"documents": documents, "question": better_question}


def web_search(state):
    """
    Web search based on the re-phrased question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with appended web results
    """

    print("---WEB SEARCH---")
    question = state["question"]
    documents = state["documents"]

    # Web search
    docs = web_search_tool.invoke({"query": question})
    web_results = "\n".join([d["content"] for d in docs])
    web_results = Document(page_content=web_results)
    documents.append(web_results)

    return {"documents": documents, "question": question}


### Edges


def decide_to_generate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current graph state

    Returns:
        str: Binary decision for next node to call
    """

    print("---ASSESS GRADED DOCUMENTS---")
    web_search = state["web_search"]
    

    if web_search == "Yes":
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
        )
        return "transform_query"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"

In [17]:
from langgraph.graph import END, StateGraph

workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("retrieve", retrieve)  # retrieve
workflow.add_node("grade_documents", grade_documents)  # grade documents
workflow.add_node("generate", generate)  # generatae
workflow.add_node("transform_query", transform_query)  # transform_query
workflow.add_node("web_search_node", web_search)  # web search

# Build graph
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",
        "generate": "generate",
    },
)
workflow.add_edge("transform_query", "web_search_node")
workflow.add_edge("web_search_node", "generate")
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()

In [18]:
from pprint import pprint

# Run
inputs = {"question": "What are the types of AWS services?"}
for output in app.stream(inputs):
    for key, value in output.items():
        # Node
        pprint(f"Node '{key}':")
        # Optional: print full state at each node
        # pprint.pprint(value["keys"], indent=2, width=80, depth=None)
    pprint("\n---\n")

# Final generation
pprint(value["generation"])

---RETRIEVE---
"Node 'retrieve':"
'\n---\n'
---CHECK DOCUMENT RELEVANCE TO QUESTION---


  warn_deprecated(


---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
"Node 'grade_documents':"
'\n---\n'
---GENERATE---
"Node 'generate':"
'\n---\n'
('AWS services can be categorized into Compute, Containers, Storage, and '
 'Database. The AWS Global Infrastructure consists of Regions and Availability '
 'Zones, with Edge locations and regional edge caches for improved '
 'performance.')
