In [1]:
import os
import json
from pydantic import BaseModel
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm


#### Embedding Model

In [2]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The OpenAI integration reads OPENAI_API_KEY from the environment.
# The previous fallback copied os.getenv("OPENAI_API_KEY") back into os.environ,
# but that branch only runs when the key is absent, so it assigned None and
# raised an opaque TypeError. Fail with an actionable message instead.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to the .env file "
        "before running this notebook."
    )

# Passed explicitly to the Pinecone client in the vector-store cell.
pc_api_key = os.getenv("PINECONE_API_KEY")

In [3]:
# Embedding model shared by indexing and querying; the Pinecone index in the
# vector-store cell is created with dimension=3072 to hold its vectors.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

#### Vector Store

In [4]:
# Connect to Pinecone and make sure the target index exists before using it.
index_name = "marketo"
pc = Pinecone(api_key=pc_api_key)

if pc.has_index(index_name):
    print("Index Already Exists")
else:
    print("Creating new index")
    # Serverless cosine index sized for the embedding vectors (3072 dims).
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, used directly for filtered queries and deletes.
index = pc.Index(index_name)

# LangChain wrapper around the same index, used for adding and searching documents.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

Index Already Exists


#### Load, Embed and Store Docs (Web & PDFs)

In [5]:
# Splitter settings; both sizes are measured in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # keep each chunk's start index in the original document
)

In [6]:
# Collect the PDF files to index. os.listdir returns every directory entry in
# arbitrary order, so keep only *.pdf names (other entries would be handed to
# PyPDFLoader later) and sort them so repeated runs process files in the same order.
file_list = sorted(
    name for name in os.listdir('pdfs') if name.lower().endswith('.pdf')
)

In [7]:
# Web pages to index. Loading the raw pages works, but pulling the data and
# cleaning it before embedding would be better.
urls = [
    "https://uncyclopedia.com/wiki/HowTo:Become_a_Pope",
    "https://uncyclopedia.com/wiki/HowTo:Beat_a_Polar_Bear_in_a_Fight",
]

In [8]:
def already_indexed(source_list, source_type="file"):
    """Return the sources from ``source_list`` that are not in the index yet.

    Each source is looked up in the Pinecone index with a metadata filter on
    the ``source`` field; it counts as indexed when at least one vector
    matches. Sources that are already present are reported on stdout and
    left out of the result.

    Args:
        source_list: File names (looked up as ``'./pdfs/' + name``) or URLs.
        source_type: ``"file"`` prefixes each source with ``./pdfs/``; any
            other value looks the source up exactly as given (used for URLs).

    Returns:
        list: The sources with no matching vectors, in their original order.
    """
    # The query vector is only a placeholder sized to the index dimension;
    # the metadata filter does the actual selection.
    placeholder_vector = [0] * 3072
    pending = []
    for source in source_list:
        stored_source = './pdfs/' + source if source_type == "file" else source
        # A single match proves existence, so ask for one result without
        # metadata instead of pulling up to 10000 full matches per source.
        results = index.query(
            vector=placeholder_vector,
            top_k=1,
            filter={"source": {"$eq": stored_source}},
            include_metadata=False,
        )
        if len(results['matches']) == 0:
            pending.append(source)
        else:
            print(f"{source} exists in index already")

    return pending

In [9]:
def process_and_store_documents(source_list, source_type="file"):
    """Load each source, split it into chunks and add them to the vector store.

    Args:
        source_list: PDF file names under ``./pdfs/`` when ``source_type`` is
            ``"file"``, or page URLs when it is ``"web"``.
        source_type: ``"file"`` (loaded with PyPDFLoader) or ``"web"``
            (loaded with WebBaseLoader).

    Raises:
        ValueError: If ``source_type`` is neither ``"file"`` nor ``"web"``.
    """
    for source in source_list:
        if source_type == "file":
            loader = PyPDFLoader('./pdfs/' + source)
        elif source_type == "web":
            loader = WebBaseLoader(source)
        else:
            raise ValueError("Invalid source type. Use 'file' or 'web'.")

        documents = loader.load()
        docs = text_splitter.split_documents(documents)
        # f-string so the source name is shown; the message previously
        # printed the literal text "{source}".
        print(f"Adding {source} to vector store")
        vector_store.add_documents(docs)

In [10]:
# Keep only the PDFs that have no vectors in the index yet, then ingest them.
file_list = already_indexed(file_list, source_type="file")
if file_list:
    process_and_store_documents(file_list, source_type="file")

bigfoot_sasquatch_legend.pdf exists in index already


In [11]:
# Keep only the URLs that have no vectors in the index yet, then ingest them.
urls = already_indexed(urls, source_type="web")
if urls:
    process_and_store_documents(urls, source_type="web")

https://uncyclopedia.com/wiki/HowTo:Become_a_Pope exists in index already
https://uncyclopedia.com/wiki/HowTo:Beat_a_Polar_Bear_in_a_Fight exists in index already


In [29]:
# Sanity check: embed a query by hand, search by vector, and show the text of
# the first returned chunk.
query_embedding = embeddings.embed_query("Polar Bear Fight")
results = vector_store.similarity_search_by_vector_with_score(
    query_embedding,
    k=10,
)
doc, score = results[0]  # each result is a (document, score) pair
doc.page_content

"HowTo:Beat a Polar Bear in a Fight - Uncyclopedia, the content-free encyclopedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHowTo:Beat a Polar Bear in a Fight\n\nFrom Uncyclopedia, the content-free encyclopedia\n\n\n\nJump to navigation\nJump to search\n“What punk-ass muthafucka can't beat a honky like dat in a fight?”~ Malcolm X on Polar Bears\n Figure One: Polar Bear.\n\n\n\n\nHowToThis article is part of Uncyclopedia's HowTo series.See more HowTos"

#### Generate Summary Based on Document Context

In [11]:
# Chat model used by the generate step.
llm = init_chat_model(model="gpt-4o-mini", model_provider="openai")

In [12]:
# Prompt that restricts the model to the retrieved context. The template text
# is kept verbatim, including its mid-sentence line break and indentation.
_PROMPT_TEMPLATE = (
    "Answer the question based only on \n"
    "    the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE)

In [13]:
class State(TypedDict):
    """Graph state shared by the ``retrieve`` and ``generate`` steps."""
    # The user's question; read by both retrieve and generate.
    question: str
    # Documents returned by retrieve; generate joins their page_content.
    context: List[Document]
    # Content of the LLM response, written by generate.
    answer: str


def retrieve(state: State):
    """Run a similarity search for the question in ``state``.

    Returns a partial state update holding the hits under ``"context"``.
    """
    question = state["question"]
    matches = vector_store.similarity_search(question)
    return {"context": matches}


def generate(state: State):
    """Ask the LLM to answer the question from the retrieved documents.

    The page contents of ``state["context"]`` are joined with blank lines,
    filled into the prompt together with the question, and the content of
    the model's response is returned under ``"answer"``.
    """
    context_text = "\n\n".join(
        [document.page_content for document in state["context"]]
    )
    prompt_value = prompt.invoke(
        {"question": state["question"], "context": context_text}
    )
    llm_reply = llm.invoke(prompt_value)
    return {"answer": llm_reply.content}

In [14]:
# Wire the two steps into a linear graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile()

In [15]:
# Request body model: carries the question to answer.
# NOTE(review): not referenced by any other cell visible in this notebook —
# presumably meant for an API layer; confirm before removing.
class QueryRequest(BaseModel):
    question: str

# Response model: carries the generated answer.
class QueryResponse(BaseModel):
    # Returning the retrieved context is currently disabled:
    # context: List[str]
    answer: str

In [16]:
# Run the retrieve -> generate graph for a sample question.
result = graph.invoke(
    {"question": "how to fight a polar bear"}
)

In [18]:
# The question is carried through unchanged in the final state.
result["question"]

'how to fight a polar bear'

In [None]:
# Distinct source documents the retrieved chunks came from
{chunk.metadata['source'] for chunk in result['context']}

{'https://uncyclopedia.com/wiki/HowTo:Beat_a_Polar_Bear_in_a_Fight'}

In [44]:
# print() so the line breaks in the answer are rendered rather than escaped.
print(result["answer"])

To fight a polar bear, you must first convince it to engage with you. Here are the steps based on the provided context:

1. **Approach the bear**: Walk up to the polar bear and tap it on the shoulder.
2. **Create a scenario**: Explain to the bear that it is eating your stinky dead seal. Be cautious not to be too aggressive, as polar bears are not naturally aggressive and may leave.
3. **Seek a reaction**: If the bear swipes at your face with its claws, this is a potential opening for a fight. Ensure that the swipe makes a deep cut on your face (at least two inches deep).
4. **Avoid protecting yourself**: If you instinctively bring your hand to your face, the bear might slash your wrist instead, leading to a loss of respect and diminishing your chances of engaging further.
5. **Show annoyance**: After receiving several cuts, demonstrate annoyance (but not anger) towards the bear.
6. **Wrestle for the seal**: Attempt to wrestle the dead seal from the bear’s jaws when it does not give it 

#### Delete a Source

In [31]:
# Index statistics (including vector counts) before deleting a source.
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 60}},
 'total_vector_count': 60}

In [None]:
# Source whose vectors should be removed; it is compared against the "source"
# metadata field in the delete cell.
delete_source = (
    "https://uncyclopedia.com/wiki/"
    "UnNews:I%27m_a_doctor_%E2%80%94_You%27re_peeing_all_wrong"
)

In [None]:
# Find the vectors whose "source" metadata equals delete_source. The query
# vector is a placeholder sized to the index dimension; only the IDs of the
# matches are needed, so metadata is not requested.
results = index.query(
    vector=[0] * 3072,
    top_k=10000,
    filter={"source": {"$eq": delete_source}},
    include_metadata=False,
)

ids_to_delete = [match["id"] for match in results["matches"]]

if ids_to_delete:
    # Pinecone caps the number of IDs per delete request, so delete in
    # batches rather than sending every ID in one call.
    delete_batch_size = 1000
    for batch_start in range(0, len(ids_to_delete), delete_batch_size):
        index.delete(ids=ids_to_delete[batch_start:batch_start + delete_batch_size])
    print(f"Deleted {len(ids_to_delete)} vectors from source: {delete_source}")
else:
    print("No vectors found for the specified source.")

Deleted 15 vectors from source: https://uncyclopedia.com/wiki/UnNews:I%27m_a_doctor_%E2%80%94_You%27re_peeing_all_wrong


In [38]:
# Index statistics again, to compare the vector count after the deletion.
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 45}},
 'total_vector_count': 45}