In [None]:
import langchain

In [None]:
from langchain_community.llms.llamafile import Llamafile

# Build the Llamafile LLM wrapper with temperature=0.
# NOTE(review): presumably temperature=0 is chosen for near-deterministic
# answers, and the wrapper presumably expects a llamafile server to be running
# already — confirm the endpoint before a fresh Run All.
llm = Llamafile(temperature=0)

# Smoke test: send a single prompt so the cell output shows the model responds.
llm.invoke("What is Generative AI?")

In [None]:
# Import the loader from langchain_community, matching the other
# langchain_community imports in this notebook; the `langchain.document_loaders`
# path is only a deprecated re-export of the same class.
from langchain_community.document_loaders import PyPDFLoader

# Fetch the arXiv PDF and load it; `pages` is the list of Document objects
# returned by the loader (each exposes `page_content` and `metadata`).
loader = PyPDFLoader('https://arxiv.org/pdf/2402.07927')
pages = loader.load()


In [None]:
# Number of Document objects the loader returned.
len(pages)


In [None]:
# Preview the first 500 characters of the Document at index 5.
pages[5].page_content[0:500]

In [None]:
# Metadata attached to the Document at index 5.
pages[5].metadata


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded pages into chunks with chunk_size=1500 and
# chunk_overlap=150 (overlap keeps some shared text between neighbouring
# chunks so content cut at a boundary appears in both).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_documents(pages)

In [None]:
# Number of chunks produced by the splitter.
len(splits)

In [None]:
# Inspect one chunk (index 2) to see what the splitter produced.
splits[2]


In [None]:
from langchain_community.embeddings import LlamafileEmbeddings
# Embedding client built with default settings.
# NOTE(review): presumably this talks to the same running llamafile server as
# the LLM, and that server must have embeddings enabled — confirm.
embedding = LlamafileEmbeddings()

In [None]:
# Two near-synonymous sentences and two weather sentences, used to
# sanity-check the embedding model.
sentence1, sentence2, sentence3, sentence4 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
    "it is humid and hot",
)

# Embed each sentence, in order, via `embed_query`.
embedding1, embedding2, embedding3, embedding4 = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3, sentence4),
)

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Compute, print, and return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any 1-D sequence accepted by ``np.dot``).
        vec2: Second vector, the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2)`` divided by the product of the two Euclidean
        norms. The value is also printed, exactly as before, so existing
        call sites keep their output.
    """
    dot_product = np.dot(vec1, vec2)
    magnitude_vec1 = np.linalg.norm(vec1)
    magnitude_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (magnitude_vec1 * magnitude_vec2)
    print(f"Cosine Similarity: {similarity}")
    # Return the value as well as printing it, so callers can compare or
    # assert on similarities instead of only reading them off the output.
    # (When the call is the last expression of a cell, the notebook will
    # additionally echo the returned value.)
    return similarity

In [None]:
# Near-synonymous pair ("i like dogs" / "i like canines").
cosine_similarity(embedding1, embedding2)
# Same pair with the arguments swapped.
cosine_similarity(embedding2, embedding1)
# Dogs sentence vs. a weather sentence.
cosine_similarity(embedding1, embedding3)
# The two weather-related sentences.
cosine_similarity(embedding3, embedding4)

In [None]:
# Import Chroma from langchain_community, matching the other
# langchain_community imports in this notebook; the `langchain.vectorstores`
# path is only a deprecated re-export of the same class.
from langchain_community.vectorstores import Chroma

# Directory (relative to the notebook's working directory) where the
# Chroma collection is persisted.
persist_directory = 'db/chroma'

In [None]:
# Build (or update) the persisted Chroma collection from the chunks.
# Each chunk gets a deterministic id based on its position, so re-running this
# cell overwrites the same records instead of appending a fresh set of
# duplicates to the on-disk collection (without explicit ids, new random ids
# are generated on every run and the collection keeps growing).
# NOTE: records written by earlier runs under other ids, or left over after
# the chunk count shrinks, stay on disk — delete `persist_directory` to
# rebuild from scratch.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    ids=[f"chunk-{index}" for index in range(len(splits))],
    persist_directory=persist_directory
)

In [None]:
# Print how many records the underlying collection holds.
# Note this reaches into the private `_collection` attribute of the store.
print(vectordb._collection.count())

In [None]:
# Query the vector store directly: ask for the 3 closest chunks (k=3)
# to the question.
question = "what is chain of thoughts?"
docs = vectordb.similarity_search(question,k=3)

In [None]:
# Display the retrieved documents.
docs

In [None]:
# Report the `page` metadata entry of every retrieved document.
# NOTE(review): the loop variable is named `page`, but each item is one of
# the retrieved documents in `docs`, not a page object.
for page in docs:
    print(f"found in page {page.metadata['page']}")

In [None]:
# Text of the first retrieved document.
docs[0].page_content

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Retriever view over the Chroma store, using its default search settings.
retriever = vectordb.as_retriever()

# System instructions for the model; the `{context}` placeholder is left for
# the chain to fill in.
system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Context: {context}"
)

# Two-message chat prompt: the system instructions, then the user's input.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the LLM and prompt into a documents chain, then put the retriever
# in front of it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

# Run the chain on the current `question` and display the raw result.
results = chain.invoke({"input": question})
results


In [None]:
# List the keys present in the chain's result.
results.keys()

In [None]:
# The `input` entry of the result (the chain was invoked with the question
# under this key).
results['input']

In [None]:
# The `context` entry of the result — presumably the documents the retriever
# supplied for this question.
results['context']

In [None]:
# Print the `answer` entry of the result as plain text.
print(results['answer'])

In [None]:
# Ask a second question through the same chain and print only the answer.
# Note: this cell stores its output in `result` (singular), separate from the
# earlier `results`, and rebinds the notebook-level `question`.
question = "What are major topics for this paper?"
result = chain.invoke({"input": question})
print(result['answer'])

In [None]:
# Probe whether the chain remembers earlier turns.
# NOTE(review): each call passes only `input`, and no chat-history component
# is visible where the chain is built, so presumably the model cannot actually
# recall the previous question — confirm, and add history handling if
# conversational behaviour is wanted.
question = "What was my last question?"
result = chain.invoke({"input": question})
print(result['answer'])