### LLM

In [6]:
from langchain_ollama import ChatOllama
# Ollama model tag shared by both chat clients below.
local_llm = "llama3.2:3b-instruct-fp16"

# Plain chat client (temperature fixed at 0).
llm = ChatOllama(model=local_llm, temperature=0)

# Same model and temperature, additionally passing format="json" to ChatOllama;
# the cells below parse this client's replies with json.loads.
llm_json_mode = ChatOllama(model=local_llm, temperature=0, format="json")

### Search

In [7]:
import os
import getpass

def _set_env(var: str):
    """Make sure the environment variable ``var`` has a non-empty value.

    When ``os.environ`` already holds a truthy value for ``var`` nothing happens.
    Otherwise the value is read with ``getpass.getpass`` (prompt ``"<var>:"``)
    and stored in ``os.environ``.
    """
    if os.environ.get(var):
        # Already configured: do not prompt again.
        return
    os.environ[var] = getpass.getpass(f"{var}:")

# Prompt for the Tavily key only when it is not already in the environment.
_set_env("TAVILY_API_KEY")
# NOTE(review): presumably read by the tokenizers library used for local
# embeddings — confirm which component consumes this flag.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


### Vectorstore

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

# Source pages that make up the knowledge base.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Load docs: `docs` keeps one `.load()` result per URL, in `urls` order ...
docs = [WebBaseLoader(page_url).load() for page_url in urls]
# ... and `docs_list` flattens those per-URL results into a single list.
docs_list = [page_doc for loaded in docs for page_doc in loaded]

# Split docs into overlapping chunks.
# NOTE(review): sizes are presumably measured in tokens because the
# tiktoken-based constructor is used — confirm against the splitter docs.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=200,
)
doc_splits = text_splitter.split_documents(docs_list)

# Add the chunks to the vector DB, embedding them with the Nomic model
# (inference_mode="local" is passed to NomicEmbeddings).
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# The number of results must be passed through `search_kwargs`: a bare `k=3`
# keyword is not a retriever field and is silently dropped, so the retriever
# would fall back to its default result count instead of returning 3 documents.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


## Components

### Router

In [10]:
import json
from langchain_core.messages import HumanMessage, SystemMessage

# Router system prompt: asks the model to pick between the vectorstore and web
# search and to reply with JSON holding a single `datasource` key
# ('websearch' or 'vectorstore').
# NOTE(review): the prompt text contains typos ("topicas", "iwth"); it is left
# untouched because it is runtime text sent to the model.

router_instructions = '''You are an expert at routing a user question to a vectorstore or web search
The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
use the vectorstore for questions on these topicas. For all else, and specially for current events, use web-search.

return Json iwth single key, datasource, that is 'websearch' or 'vectorstore' depending on the question
'''

# Test router
def _ask_router(user_question):
    """Send ``user_question`` to the JSON-mode LLM under the router system prompt.

    Returns the chat message produced by ``llm_json_mode.invoke``; its
    ``content`` is parsed with ``json.loads`` below.
    """
    return llm_json_mode.invoke(
        [
            SystemMessage(content=router_instructions),
            HumanMessage(content=user_question),
        ]
    )


# Two questions outside the indexed topics, one inside them.
test_web_search = _ask_router(
    "Who is favored to win the NFC Championship game in the 2024 season?"
)
test_web_search_2 = _ask_router("What are the models released today for llama3.2?")
test_vector_store = _ask_router("What are the types of agent memory?")

# Print the three parsed routing decisions on a single line.
print(
    json.loads(test_web_search.content),
    json.loads(test_web_search_2.content),
    json.loads(test_vector_store.content),
)

{'datasource': 'websearch'} {'datasource': 'websearch'} {'datasource': 'vectorstore'}


### Retrieval grader

In [14]:
# Document-grader system prompt: describes the grading criterion (keyword or
# semantic relation between a retrieved document and the user question).
# It is used as the SystemMessage content in the grader test below.

doc_grader_intructions = '''
you are a grader assessing relevance of a retrieved document to a user question.
if the document contains keyword(s) or semantic meaning related to the question, 
grade it as relevant
'''

# Grader prompt template: `{document}` and `{question}` are filled in with
# str.format before the text is sent as the human message. It asks for JSON
# with a single `binary_score` key whose value is 'yes' or 'no'.
doc_grader_prompt = """
here is the retrieved document: \n\n {document}
here is the user question: \n\n {question}.

then carefully and objectively assess whether the document contains at least 
some information that is relevant to the question.

return json with single key, binary_score, that is 'yes' or 'no' score to indicate 
whether the document contains at least some information that is relevant to the question
"""

# Test: retrieve for a sample question and grade one of the returned chunks.
question = "What is chain of thought prompting?"
docs = retriever.invoke(question)

# Grade the second retrieved document (index 1).
doc_text = docs[1].page_content
doc_grader_prompt_formatted = doc_grader_prompt.format(
    document=doc_text,
    question=question,
)

# System instructions first, then the filled-in grading prompt.
result = llm_json_mode.invoke(
    [
        SystemMessage(content=doc_grader_intructions),
        HumanMessage(content=doc_grader_prompt_formatted),
    ]
)

# Last expression of the cell: the parsed grade is displayed as the output.
json.loads(result.content)

{'binary_score': 'yes'}

### Generate answer

In [15]:
# Answer-generation prompt template: `{context}` and `{question}` are filled in
# with str.format before the text is sent to the model as a human message.
# NOTE(review): the prompt text contains typos ("quertion", "takss"); it is left
# untouched because it is runtime text sent to the model.
rag_prompt = """
You are an assistant for quertion-answering takss.
here is the context to use to answer the question:
{context}
think carefully about the above context.

Now, review the user question:
{question}

provide an answer to this questions using only the above context.

use three sentences maximum and keep the answer concise

Answer: """

def format_docs(docs):
    """Join the ``page_content`` of every item in ``docs`` into one string.

    Items are separated by a blank line (two newline characters); an empty
    ``docs`` yields an empty string.
    """
    page_texts = [entry.page_content for entry in docs]
    return "\n\n".join(page_texts)

# Test generation: retrieve for `question` (set in the grader test cell above),
# build the context string, fill the RAG prompt and ask the plain-text LLM.
docs = retriever.invoke(question)
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(
    context=docs_txt,
    question=question,
)
generation = llm.invoke(
    [HumanMessage(content=rag_prompt_formatted)]
)
print(generation.content)


Chain-of-thought prompting is a technique used in natural language processing (NLP) where a model generates a sequence of reasoning steps, or "thoughts", to arrive at a desired output. This approach involves using external search queries, such as Wikipedia APIs, to retrieve relevant information and then incorporating it into the context. Chain-of-thought prompting can be further extended by exploring multiple reasoning possibilities at each step, generating multiple thoughts per step, and evaluating each state with a classifier or majority vote.
