In [None]:
# Install the libmagic system library via Homebrew (requires `brew` on PATH).
# NOTE(review): no cell visible in this notebook imports a libmagic binding — confirm this is still needed.
! brew install libmagic

In [None]:
! pip install langchain faiss-cpu tiktoken openai python-dotenv pypdf langchain-community

In [None]:
import os
import openai

from dotenv import load_dotenv

# Read variables from a local .env file (if any) into os.environ.
load_dotenv()

# Fail fast with an actionable message. A plain os.environ[...] lookup raises
# KeyError when the variable is absent, but silently accepts an empty value
# (e.g. a bare `OPENAI_API_KEY=` line in .env). KeyError is kept so the
# exception type is unchanged.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise KeyError("OPENAI_API_KEY is missing or empty; set it in the environment or in .env")
openai.api_key = _api_key

In [None]:

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
import tiktoken


In [None]:
# Show the current working directory (bare `pwd` relies on IPython automagic);
# the relative "data/..." path used when loading the PDF resolves against it.
pwd

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, relative to the notebook's working directory.
file_path = "data/All You Need to Know About Florence-2! _ by Sunidhi Ashtekar _ Jun, 2024 _ Medium.pdf"
loader = PyPDFLoader(file_path)

# load() returns the documents exactly as the loader produces them.
# load_and_split() additionally runs a default text splitter with overlap, so
# `pages` could hold overlapping chunks: the overlap would be counted twice in
# the token/cost estimate and the text would then be split a second time by
# the explicit RecursiveCharacterTextSplitter later in the notebook.
pages = loader.load()

# Inspect the first loaded document as a sanity check.
pages[0]

In [None]:
import tiktoken


# Join the text of every loaded document into one space-separated string.
full_text = " ".join(page.page_content for page in pages)

# Encode with tiktoken's cl100k_base encoding; swap in whichever encoding
# matches the model you intend to use.
tokenizer = tiktoken.get_encoding('cl100k_base')
tokens = tokenizer.encode(full_text)

# The length of the encoded sequence is the token count.
num_tokens = len(tokens)
print(f"Total number of tokens in the PDF file: {num_tokens}")


In [None]:

# Rough price estimate for embedding the whole document.
# The rate is this notebook's assumed price in USD per 1,000 tokens for
# text-embedding-ada-002.
# NOTE(review): prices change — confirm against the current OpenAI pricing page.
cost_per_1k_tokens = 0.0004
thousands_of_tokens = num_tokens / 1000
cost = thousands_of_tokens * cost_per_1k_tokens
print(f"Estimated cost for generating embeddings: ${cost:.4f}")

In [None]:

# Break the loaded documents into smaller, overlapping chunks for embedding.
# Sizes are as configured here: chunk_size=1000 with chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split_documents = text_splitter.split_documents(pages)

In [None]:
# Show the chunk count and a short preview; echoing the entire list floods
# the cell output and bloats the saved notebook.
print(f"{len(split_documents)} chunks")
split_documents[:3]

In [None]:

# Compute one embedding vector per chunk using the OpenAI embeddings client.
embeddings = OpenAIEmbeddings()
chunk_texts = [chunk.page_content for chunk in split_documents]
doc_embeddings = embeddings.embed_documents(chunk_texts)


In [None]:
# Preview the first embedding: its dimensionality and leading values, rather
# than echoing the whole vector into the cell output.
print(f"Embedding dimension: {len(doc_embeddings[0])}")
doc_embeddings[0][:8]

In [None]:
# Number of embedding vectors returned; embed_documents was given one text per chunk.
len(doc_embeddings)

In [None]:

# Store the embeddings in a local FAISS vector store.
# Reuse the vectors already held in `doc_embeddings` instead of calling
# FAISS.from_documents, which would send every chunk to the embeddings API a
# second time. `embeddings` is still passed so queries can be embedded later.
assert len(doc_embeddings) == len(split_documents), "doc_embeddings is out of sync with split_documents"
vectorstore = FAISS.from_embeddings(
    text_embeddings=list(zip((doc.page_content for doc in split_documents), doc_embeddings)),
    embedding=embeddings,
    metadatas=[doc.metadata for doc in split_documents],
)


In [18]:
# Persist the FAISS vector store to the local "faiss_index" folder.
vectorstore.save_local("faiss_index")


In [None]:
# Sanity-check retrieval: fetch the 5 most similar chunks for a sample question.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

retrieved_docs = retriever.invoke("Tell me about architecture of Florence 2")

# Only the last expression of a cell is displayed, so the count must be
# printed explicitly; as a bare expression it was silently discarded.
print(f"Retrieved {len(retrieved_docs)} documents")
retrieved_docs

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI

# Uses the `vectorstore` built in the earlier cells; the retriever is
# re-created here so the chain is fed the 4 most similar chunks.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Conversational retrieval chain without memory: the chat history is passed
# explicitly on every call instead.
llm = OpenAI()
qa_chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=None)

# Ask a question with an empty chat history.
query = "Tell me about architecture of Florence 2"
# invoke() replaces the deprecated Chain.run() and matches the
# retriever.invoke() call used earlier. It returns a dict of outputs, with the
# generated text under the "answer" key.
result = qa_chain.invoke({"question": query, "chat_history": []})
response = result["answer"]

print(response)