# LangChain RAG Local

In [None]:
import os

from dotenv import find_dotenv, load_dotenv

# Look up a .env file and load the variables it defines into the process
# environment. The boolean returned by load_dotenv is bound to `_` so the
# notebook cell does not echo it.
# NOTE(review): presumably this is where the OpenAI API key used by the
# clients below comes from — confirm against the local .env file.
_dotenv_path = find_dotenv()
_ = load_dotenv(_dotenv_path)

In [None]:
# OpenAI model identifiers used by the cells below.
chat_name: str = "gpt-3.5-turbo"  # chat model handed to ChatOpenAI
embedding_name: str = "text-embedding-ada-002"  # model handed to OpenAIEmbeddings

## 1/5 - Load Documents

In [None]:
# PyPDFLoader relies on the `pypdf` package (pip install pypdf).
# NOTE(review): newer LangChain releases expose this loader from
# `langchain_community.document_loaders` — confirm against the installed version.
from langchain.document_loaders import PyPDFLoader

# Read the PDF from the working directory. The result is indexed below as a
# sequence of documents exposing `.page_content` and `.metadata`.
pdf_loader = PyPDFLoader('bitcoin.pdf')
pdf_pages = pdf_loader.load()

# Quick sanity check of what was loaded: the first 100 characters of the
# second entry, then the metadata of the last entry.
preview_text = pdf_pages[1].page_content[:100]
print(preview_text)
print(pdf_pages[-1].metadata)

## 2/5 - Split Documents

In [None]:
from langchain.text_splitter import (
    CharacterTextSplitter,
    MarkdownTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Splitter settings.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) rather than tokens — confirm in the LangChain docs.
separators = ["\n\n", "\n", " ", ""]  # coarsest separator first, finest last
chunk_overlap = 20
chunk_size = 512

r_splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Break every loaded page into chunks; `splits` feeds the vector store below.
splits = r_splitter.split_documents(pdf_pages)

In [None]:
# r_splitter.split_text(pdf_pages[2].page_content)

# splits = []
# 
# for page in pdf_pages:
#     splits.extend(r_splitter.split_text(page.page_content))
#     
#     
# splits[:5]

## 3/5 - Vector Store & Embedding

In [None]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Directory (relative to the notebook) handed to Chroma as its persist location.
chroma_persist_directory = "../Cache/chroma"

# Embedding client for the model named in `embedding_name`.
embedding = OpenAIEmbeddings(model=embedding_name)

# Embed every chunk in `splits` and index it in a Chroma collection.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same chunks again — verify before relying on result counts.
vector_db = Chroma.from_documents(
    splits,
    embedding=embedding,
    persist_directory=chroma_persist_directory,
)

# NOTE(review): the explicit `vector_db.persist()` call is left disabled;
# recent Chroma versions are understood to persist automatically when
# `persist_directory` is set — confirm for the installed version.

## 4/5 - Retrieval Strategy
Reference (LangChain retrievers documentation): https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import OpenAI

# Completion model that drives the compressor, with temperature set to 0.0.
extractor_llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0)
compressor = LLMChainExtractor.from_llm(llm=extractor_llm)

# Base retriever over the vector store, using the "mmr" search type.
mmr_retriever = vector_db.as_retriever(search_type="mmr")

# Final retriever: documents come from the base retriever and are then passed
# through the LLM-based compressor.
retriever = ContextualCompressionRetriever(
    base_retriever=mmr_retriever,
    base_compressor=compressor,
)

In [None]:
# question = "What is Bitcoin?"
# 
# retrieved_docs = vector_db.similarity_search(
#     question,
#     k=3
# )

# retrieved_docs = vector_db.max_marginal_relevance_search(
#     question,
#     k=3,
#     fetch_k=3
# )

# retrieved_docs = vector_db.max_marginal_relevance_search(
#     question,
#     k=3,
#     filter={"source": "bitcoin.pdf"}
# )

# retrieved_docs = retriever.get_relevant_documents(question)
# retrieved_docs[1].page_content

## 5/5 - Chain & Query

In [None]:
from langchain_openai import ChatOpenAI

# Chat model for the final answering step; temperature 0.0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    model_name=chat_name,
    temperature=0.0,
)
# Smoke test (disabled):
# llm.invoke("What is the capital of France?")