In [None]:
from decouple import AutoConfig
# AutoConfig's search_path is the *directory* where the lookup for a settings
# file (settings.ini / .env) starts, so pass the parent folder rather than the
# path of the .env file itself.
config = AutoConfig(search_path='./..')

In [None]:
import os
import openai

# Read the OpenAI key once and publish it both to the process environment and
# to the openai module attribute.
os.environ["OPENAI_API_KEY"] = openai.api_key = config("OPENAI_API_KEY")
# The Cohere key is stored under COHERE_TOKEN in the settings file and is
# exported here as COHERE_API_KEY.
os.environ["COHERE_API_KEY"] = config("COHERE_TOKEN")

### Loading Embedding Model

##### OpenAI Embedding Model

In [None]:
from langchain_openai import OpenAIEmbeddings
# Pin the embedding model explicitly: the vector store persisted later in this
# notebook must always be queried with the same model that built it, so do not
# rely on the library default staying unchanged.
ada2 = OpenAIEmbeddings(model="text-embedding-ada-002")

##### Open-Source Embedding Models from Hugging Face

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

In [None]:
# Settings for the open-source embedding model: which checkpoint to load,
# which device to run it on, and whether to normalize the embeddings.
model_id = "intfloat/e5-base-v2"
model_kwargs = dict(device='cuda' if torch.cuda.is_available() else 'cpu')
encode_kwargs = dict(normalize_embeddings=True)

In [None]:
# Instantiate the Hugging Face embedding wrapper from the settings defined in
# the previous cell (model_id, model_kwargs, encode_kwargs).
e5_base = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_id,
)

## Generation Models

##### OpenAI Model

In [None]:
from langchain_openai import ChatOpenAI
# Chat model used for answer generation; temperature is set to 0.
llm_name = "gpt-4"
gpt4 = ChatOpenAI(
    openai_api_key=openai.api_key,
    model_name=llm_name,
    temperature=0,
)

##### Open-Source models via the Hugging Face text-generation pipeline

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Use a dedicated name for the LLM checkpoint so it does not overwrite the
# embedding `model_id` defined earlier (re-running the embedding cell would
# otherwise load this chat model as the embedder).
llm_model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
model = AutoModelForCausalLM.from_pretrained(llm_model_id)

# Generation settings.
temp = 0.3
top_k = 10
max_tokens = 256

# Sampling parameters must be passed as generation kwargs of the pipeline:
# `model_kwargs` is only consulted when the pipeline loads the model from a
# name, so with an already-instantiated model they were silently dropped.
# `do_sample=True` is required for temperature / top_k to have any effect.
# For the same reason `device_map` / `trust_remote_code` did nothing here; the
# device is selected explicitly instead, mirroring the embedding-model cell.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=temp,
    top_k=top_k,
    max_new_tokens=max_tokens,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
hf = HuggingFacePipeline(pipeline=pipe)

### Data Loading and Ingestion

In [None]:
from langchain_community.document_loaders import PyPDFLoader

file_dir = './../../data/nifty10_reports/'
file_name = 'TCS.pdf'

# Load the PDF as one Document per page. Chunking is done later with the
# custom text splitter, so use load() rather than load_and_split(): the latter
# pre-chunks with a default splitter, which would split the text twice and
# make `pages` no longer correspond to PDF pages.
loader = PyPDFLoader(os.path.join(file_dir, file_name))
pages = loader.load()

In [None]:
# Display a single loaded document as a spot check of the parsed text.
# NOTE(review): index 145 is hard-coded and assumes the report yields at least
# 146 documents — confirm for other input files.
pages[145]

#### Embedding and Storing documents into vectorstore

**Chunking/Splitting** the data into relevant chunks is very important, as it determines what context will be passed for answer generation. It is advisable to go through the data carefully and choose the `separators` and `chunk_size` accordingly.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split on the paragraph-like and line-like breaks found in the parsed PDF
# text, targeting chunks of 1500 characters with a 300-character overlap.
separators = [" \n\n ", " \n "]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    length_function=len,  # chunk size is measured in characters
    separators=separators,
)

In [None]:
# Chunk the loaded documents with the splitter configured above, then show
# how many chunks were produced.
docs = text_splitter.split_documents(documents=pages)
len(docs)

In [None]:
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document

persist_dir = './doc/chroma_db'
os.makedirs('./doc', exist_ok=True)

# Clear the collection that is actually persisted at `persist_dir` before
# re-indexing. A bare `Chroma()` only touches a throwaway in-memory collection,
# so every re-run of this cell would append duplicate chunks to the store on
# disk.
Chroma(persist_directory=persist_dir, embedding_function=ada2).delete_collection()
vectorstore = Chroma.from_documents(docs, ada2, persist_directory=persist_dir)

### Loading the VectorDB and Retriever

In [None]:
from langchain.vectorstores.base import VectorStoreRetriever
from langchain_community.vectorstores import Chroma

In [None]:

# Re-open the persisted Chroma store with the same embedding function and wrap
# it in a retriever whose search is configured with k = top_k_docs.
vectorstore_path = './doc/chroma_db'
top_k_docs = 5
vectorstore = Chroma(embedding_function=ada2, persist_directory=vectorstore_path)
retriever = VectorStoreRetriever(
    vectorstore=vectorstore,
    search_kwargs=dict(k=top_k_docs),
)

##### With Cohere Reranker

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

In [None]:
# `top_n` (how many reranked documents to keep) is a setting of the Cohere
# reranker, not of ContextualCompressionRetriever, so it has to be passed to
# the compressor for the intended value of 5 to take effect.
compressor = CohereRerank(top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

#### Semantic Search

In [None]:
question = "Who is the CEO of TCS?"
# Store the hits under their own name: `top_k` is already the generation
# hyperparameter used by the text-generation pipeline cell and must not be
# overwritten with a list of documents.
top_docs = vectorstore.similarity_search(question)
top_docs

In [None]:
# Retrievers are invoked through the same `invoke` interface as the chains
# below; `get_relevant_documents` is the deprecated spelling of this call.
retriever.invoke(question)

### Answer Generation

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate

# Prompt text for answer generation. It contains two placeholders,
# {summaries} and {question}; the instructions ask for a short answer (three
# sentences maximum) that ends with "thanks for asking!".
template = """You are an AI assistant for answering questions about the provided text.
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer. 
{summaries}
Question: {question}
Helpful Answer:"""
# Build the prompt object from the template string above.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Question-answering chain over the plain vector-store retriever, using GPT-4
# and the custom prompt; the retrieved source documents are returned alongside
# the answer.
qa_src_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=gpt4,
    chain_type_kwargs=dict(prompt=QA_CHAIN_PROMPT),
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# The question text is embedded for retrieval, so spelling matters:
# "managemnet" is corrected to "management".
question = "Who are the members in top management of TCS?"
result = qa_src_chain.invoke({"question": question})
result

In [None]:
# Print only the answer text from the chain result.
print(result["answer"])

In [None]:
# Ask about the letter to shareholders and display the full chain result.
question = "What are key highlights from the letter to shareholders?"
result = qa_src_chain.invoke(dict(question=question))
result

In [None]:
# Ask about the consolidated financial statement and display the full chain
# result.
question = "What are the key highlights from consolidated financial statement?"
result = qa_src_chain.invoke(dict(question=question))
result

P.S.: Running this notebook will help you realise the importance of efficient data parsing.