In [None]:
pip install -q --upgrade langchain langchain-openai langchain-core langchain_community langchain_chroma sentence_transformers docx2txt pypdf

In [None]:
# Show which LangChain release is installed in this runtime.
import langchain
print(langchain.__version__)

In [None]:
import os

# google.colab only exists inside a Colab runtime. Importing it unconditionally
# made the environment-variable path unreachable everywhere else, because the
# import failed before os.getenv() was ever consulted.
try:
  from google.colab import userdata
except ImportError:
  userdata = None


def _read_secret(name: str) -> str:
  """Return the secret called ``name``.

  The process environment is checked first; an unset or empty variable falls
  back to the Colab user-secrets store when it is available.

  Args:
    name: Name of the environment variable / Colab secret.

  Returns:
    The secret value.

  Raises:
    RuntimeError: If the variable is unset or empty and the notebook is not
      running inside Google Colab.
  """
  value = os.getenv(name)
  if value:
    return value
  if userdata is None:
    raise RuntimeError(
        f'{name} is not set: export it as an environment variable, '
        'or run inside Google Colab and add it under Secrets.')
  return userdata.get(name)


# Do not print these values: cell outputs are saved together with the notebook.
open_ai_key = _read_secret("OPENAI_API_KEY")
langchain_api_key = _read_secret("LANGCHAIN_API_KEY")


In [None]:
# Export the OpenAI key plus the tracing settings (tracing flag, project name,
# API key) as environment variables in a single call. The entries are applied in
# the order written.
os.environ.update({
    "OPENAI_API_KEY": open_ai_key,
    "LANGCHAIN_TRACING_V2": 'true',
    "LANGCHAIN_PROJECT": 'langchain-rag',
    "LANGCHAIN_API_KEY": langchain_api_key,
})

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.tracers.context import tracing_v2_enabled

# with tracing_v2_enabled():
#   llm_resonse = llm.invoke(query_to_llm)

In [None]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

from typing import List

In [None]:
# Chunk size and overlap are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)


# pdf_loader = PyPDFLoader("/content/documents/Generative AI.pdf")
# documents = pdf_loader.load()

# splits = text_splitter.split_documents(documents)

# print(f'Document splitted into {len(splits)} chunks')

In [None]:
# print(splits[0])

In [None]:
# print(splits[0].metadata)

In [None]:
# print(splits[0].page_content)

In [None]:
# Split / Chunking

# function to fetch all files from documents folder
def fetch_files(path: str) -> List[Document]:
  """Load every supported document found directly inside ``path``.

  Files ending in ``.pdf`` are read with ``PyPDFLoader`` and files ending in
  ``.docx`` with ``Docx2txtLoader``. The extension check ignores case, so
  ``Report.PDF`` is loaded as well. Any other entry is reported on stdout and
  skipped. Only the direct children of ``path`` are examined.

  Args:
    path: Directory to scan.

  Returns:
    The documents produced by the loaders, concatenated in file-name order.

  Raises:
    OSError: If ``path`` cannot be listed (for example, it does not exist).
  """
  documents = []
  # os.listdir() returns entries in arbitrary order; sorting makes the order of
  # the loaded documents (and of the chunks derived from them) reproducible.
  for file_name in sorted(os.listdir(path)):
    file_path = os.path.join(path, file_name)
    # Lower-case copy is used only for the extension test; the real name is
    # still what gets opened and reported.
    lowered_name = file_name.lower()
    if lowered_name.endswith(".pdf"):
      loader = PyPDFLoader(file_path)
    elif lowered_name.endswith(".docx"):
      loader = Docx2txtLoader(file_path)
    else:
      print(f'Unsupported file type: {file_name}')
      continue
    documents.extend(loader.load())
  return documents


# Folder that holds the source documents.
path = '/content/documents'
documents = fetch_files(path)

# NOTE(review): this counts the Document objects returned by the loaders, which
# may differ from the number of files in the folder.
print(f'Loaded {len(documents)} documents from folder.')

# Break the loaded documents into chunks with the splitter configured earlier.
splits = text_splitter.split_documents(documents)

print(f'Document splitted into {len(splits)} chunks')

In [None]:
# Embedding

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed the text of every chunk, then display the first resulting vector.
chunk_texts = [split.page_content for split in splits]
document_embeddings = embedding_function.embed_documents(chunk_texts)

document_embeddings[0]

In [None]:
# # Store in Vector DB

# from langchain_chroma import Chroma

# openai_embedding = OpenAIEmbeddings()
# collection_name = 'apple_hig_documents'
# vector_store = Chroma.from_documents(documents=splits, embedding=openai_embedding, collection_name=collection_name)

# print('Vector store created and persisted to "./chroma_db"')

In [None]:
pip install sentence-transformers


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

persist_dir = 'chroma_db'
collection_name = 'apple_hig_documents'

# Stable ids, one per chunk position. Without explicit ids every run of this cell
# stores the chunks under fresh random ids, so re-running it against the same
# persist directory piles up duplicates that then crowd the search results.
# NOTE: if the corpus shrinks, chunks written by an earlier run at higher
# positions stay in the collection; delete the directory to rebuild from scratch.
chunk_ids = [f'{collection_name}-{position}' for position in range(len(splits))]

vector_store = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    ids=chunk_ids,
    collection_name=collection_name,
    persist_directory=persist_dir
)
print('Vector store created and persisted to "chroma_db"')

In [None]:
# Perform similarity search

# NOTE(review): `quey` and `seatch_results` are misspelled; the names are kept
# because they are module-level and other cells may refer to them.
quey = 'You can position toolbar items in which three locations?'
top_k = 4  # used for both the search and the heading, so the two cannot drift apart

seatch_results = vector_store.similarity_search(quey, k=top_k)

print(f'\nTop {top_k} most relevant chunks for the query: "{quey}"\n')
for index, result in enumerate(seatch_results):
  print(f'Result {index}')
  # Double quotes inside the single-quoted f-string: reusing the outer quote
  # character is a SyntaxError on Python versions before 3.12.
  print(f'Source: {result.metadata.get("source", "Unknown")}')
  print(f'Content: {result.page_content}')

  print()

In [None]:
from re import search
# Expose the vector store as a retriever, passing k=4 as its search argument,
# and try it on the sample question.
retriever_options = {'k': 4}
retriever = vector_store.as_retriever(search_kwargs=retriever_options)
retriever.invoke('You can position toolbar items in which three locations?')

In [None]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt text with two placeholders, {context} and {question}. It instructs the
# model to answer using only the supplied context.
template = """
Answer the question based only on the following context:
{context}

Question: {question}
Answer:
"""
# Build the chat prompt template from the text above.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_core.runnables import RunnablePassthrough

# First pass at the chain: the retriever's output goes into {context} as-is and
# the incoming question is handed to {question}. There is no model step yet, so
# the chain ends at the prompt.
prompt_inputs = {'context': retriever, 'question': RunnablePassthrough()}
rag_chain = prompt_inputs | prompt
rag_chain.invoke('You can position toolbar items in which three locations?')

In [None]:
def doctToStr(docs):
  """Join the ``page_content`` of each item in ``docs`` with a blank line between them.

  Args:
    docs: Iterable of objects exposing a ``page_content`` string attribute.

  Returns:
    One string holding every ``page_content`` in order, separated by two
    newlines; an empty string when ``docs`` is empty.
  """
  contents = [doc.page_content for doc in docs]
  separator = '\n\n'
  return separator.join(contents)

In [None]:
# Same chain as before, except the retrieved documents are first flattened to a
# single string by doctToStr before being placed into {context}.
context_and_question = {'context': retriever | doctToStr, 'question': RunnablePassthrough()}
rag_chain = context_and_question | prompt
rag_chain.invoke('You can position toolbar items in which three locations?')

In [None]:
# parsing output
from langchain_core.output_parsers import StrOutputParser

# Stand-alone string output parser instance.
# NOTE(review): the final chain cell builds its own StrOutputParser() and does
# not reference this variable.
output_parser = StrOutputParser()
#output_parser.invoke(llm_resonse)


In [None]:
# temperature=0 keeps answers as repeatable as the API allows, which suits a
# chain that is told to answer only from the retrieved context. `model` is the
# documented keyword for selecting the model.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [None]:
question = 'You can position toolbar items in which three locations?'

# Full pipeline: retrieve and flatten the context, fill the prompt, call the
# model, then parse the model output with StrOutputParser.
context_and_question = {'context': retriever | doctToStr, 'question': RunnablePassthrough()}
rag_chain = context_and_question | prompt | llm | StrOutputParser()

response = rag_chain.invoke(question)
print(f'Question: {question}\n')
print(f'Answer: {response}')
