---------------
**Retrieval Augmented Generation**
- Retrieval Augmented Generation (RAG) is a technique for enhancing the accuracy and reliability of generative AI models with facts fetched from external sources
- It is also a way to feed new/updated data into LLMs
- For example, the knowledge cutoff date for gpt-3.5-turbo is Jan 2022, so it won't know about anything after that
- https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/
---------------
**Project Setup**
- We'll feed Netflix's engagement score to LLM and ask questions based on that
- The dataset refers to engagement scores from Jan-Jun 2023
- Our ChatGPT model (gpt-3.5-turbo) has no idea about this because its knowledge cutoff date was Jan 2022
---------------
Part 1: Prepare the documents (one time setup)
- Load the data into LangChain Documents
- Split the documents into chunks
- Create embeddings for the chunks (i.e create vectors)
- Save the chunks & embeddings to a vector database
---------------
Part 2: Search (once per query)
- Create embeddings for user's question
- Using the input embedding & stored embeddings:
  - Rank the vectors based on similarity search
  - The nearest vectors represent chunks similar to the user's question
---------------
Part 3: Integrate with LLM
- Get the original data chunks corresponding to the nearest vectors found in Part 2 (last step) above
- Feed that as an input to LLM
- Get the answer to user's question from LLM in a natural language
---------------

#### 1. Install Dependencies

In [None]:
# install the packages listed in ../requirements.txt (-q asks pip for quiet output)
pip install -r ../requirements.txt -q

#### 2. Verify python-dotenv

In [None]:
import os
import pinecone
from dotenv import load_dotenv, find_dotenv
# locate a .env file and load its variables into the process environment;
# override=True lets values from .env replace variables that are already set
load_dotenv(find_dotenv(), override=True)

#### 3. Helper functions

In [None]:
# 
# langchain supports gazillion data sources
# showing just a couple of examples here
# we'll use csv loader for our actual RAG use case
#


# load from local docs
def load_document_local(file_path):
  """Load a local file through the LangChain loader matching its extension.

  The extension is compared case-insensitively, so ``report.PDF`` is handled
  the same way as ``report.pdf``.

  Args:
    file_path: path to a ``.csv`` or ``.pdf`` file.

  Returns:
    The result of the selected loader's ``load()`` call, or ``None`` (after
    printing a message) when the extension is not supported.
  """
  import os

  _, ext = os.path.splitext(file_path)
  kind = ext.lower()

  # each loader is imported lazily, only in the branch that needs it
  if kind == '.csv':
    from langchain.document_loaders.csv_loader import CSVLoader
    loader = CSVLoader(file_path)
  elif kind == '.pdf':
    from langchain.document_loaders import PyPDFLoader
    loader = PyPDFLoader(file_path)
  else:
    # report the extension exactly as it was given
    print(f'unsupported file type: {ext}')
    return None

  return loader.load()


# load from wikipedia
def load_document_wikipedia(query, lang='en', load_max_docs=2):
  """Load documents for ``query`` through LangChain's ``WikipediaLoader``.

  Args:
    query: search text handed to the loader.
    lang: language code handed to the loader (defaults to ``'en'``).
    load_max_docs: forwarded to the loader as ``load_max_docs``.

  Returns:
    Whatever the loader's ``load()`` call returns.
  """
  from langchain.document_loaders import WikipediaLoader

  wiki_loader = WikipediaLoader(
    query=query,
    lang=lang,
    load_max_docs=load_max_docs,
  )
  documents = wiki_loader.load()
  return documents


# create chunks
def chunk_data(data, chunk_size=256, chunk_overlap=0):
  """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

  Args:
    data: documents to split, passed to ``split_documents``.
    chunk_size: forwarded to the splitter as ``chunk_size``.
    chunk_overlap: forwarded to the splitter as ``chunk_overlap``. The
      default of 0 keeps the previous behaviour (no overlap between chunks).

  Returns:
    The chunks produced by ``split_documents``.
  """
  from langchain.text_splitter import RecursiveCharacterTextSplitter

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return text_splitter.split_documents(data)


# compute embedding cost for a given set of chunks
def get_embedding_cost(chunks, price_per_1k_tokens=0.0004):
  """Estimate the cost of embedding ``chunks`` and print the token count.

  Tokens are counted with the tiktoken encoding for
  ``text-embedding-ada-002``.

  Args:
    chunks: iterable of objects exposing a ``page_content`` string.
    price_per_1k_tokens: price charged per 1000 tokens. Defaults to the
      0.0004 rate this function previously hard-coded; pass the current
      rate if pricing has changed.

  Returns:
    ``total_tokens / 1000 * price_per_1k_tokens``, in the same currency
    unit as ``price_per_1k_tokens``.
  """
  import tiktoken

  enc = tiktoken.encoding_for_model('text-embedding-ada-002')
  # a generator avoids building a throwaway list of per-chunk counts
  total_tokens = sum(len(enc.encode(page.page_content)) for page in chunks)
  print(f'total tokens: {total_tokens}')
  return total_tokens / 1000 * price_per_1k_tokens


# delete a given pinecone index or all indexes
def delete_pinecone_index(idx_name='all'):
  """Delete one pinecone index, or every index when ``idx_name`` is ``'all'``.

  The pinecone client is initialised from the ``PINECONE_API_KEY`` and
  ``PINECONE_ENV`` environment variables before anything is deleted.

  Args:
    idx_name: name of the index to delete; the default ``'all'`` deletes
      every index reported by ``pinecone.list_indexes()``.
  """
  import os
  import pinecone

  api_key = os.environ.get("PINECONE_API_KEY")
  environment = os.environ.get("PINECONE_ENV")
  pinecone.init(api_key=api_key, environment=environment)

  # a single named index is handled on its own
  if idx_name != 'all':
    pinecone.delete_index(idx_name)
    return

  # otherwise delete every index that exists
  for existing in pinecone.list_indexes():
    pinecone.delete_index(existing)


# create a pinecone index and insert embeddings or 
# fetch existing embeddings if index already exists
def insert_or_fetch_embeddings(idx_name, chunks):
  """Return a Pinecone vector store for ``idx_name``, creating it if needed.

  When the index already exists it is opened as-is and ``chunks`` is not
  used. Otherwise a 1536-dimension cosine index is created and filled from
  ``chunks`` using ``OpenAIEmbeddings``.

  Args:
    idx_name: name of the pinecone index.
    chunks: documents to embed and insert when the index is created.

  Returns:
    The LangChain ``Pinecone`` vector store bound to the index.
  """
  import os
  import pinecone
  from langchain.vectorstores import Pinecone
  from langchain.embeddings import OpenAIEmbeddings

  embedder = OpenAIEmbeddings()
  pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
  )

  # reuse the stored embeddings when the index is already there
  if idx_name in pinecone.list_indexes():
    print(f'index: {idx_name} already exist, loading embeddings')
    return Pinecone.from_existing_index(idx_name, embedder)

  # first run: create the index, then embed and upload the chunks
  print(f'creating index {idx_name} and embeddings')
  pinecone.create_index(name=idx_name, dimension=1536, metric='cosine')
  return Pinecone.from_documents(chunks, embedder, index_name=idx_name)


# get chunks from vector store based on similarity index
# pass those chunks to LLM to get answers in natural language
def get_answers(vector_store, query):
  """Answer ``query`` with gpt-3.5-turbo using chunks from ``vector_store``.

  A similarity retriever over ``vector_store`` (``k`` = 3) is combined with
  the chat model in a ``RetrievalQA`` chain of type ``'stuff'``.

  Args:
    vector_store: vector store exposing ``as_retriever``.
    query: the user's question.

  Returns:
    The value returned by the chain's ``run`` call.
  """
  from langchain.chains import RetrievalQA
  from langchain.chat_models import ChatOpenAI

  chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1.0)

  # the retriever fetches the 3 most similar chunks for the question
  top_chunks_retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
  )

  # 'stuff' is a predefined chain type that uses all text from the documents
  qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=top_chunks_retriever,
    chain_type='stuff',
  )
  return qa_chain.run(query)


# same as above but with memory i.e context awareness
def get_answers_with_memory(vector_store, query, chat_history=None):
  """Answer ``query`` with retrieval plus the previous conversation turns.

  Args:
    vector_store: vector store exposing ``as_retriever``.
    query: the user's question.
    chat_history: list of ``(question, answer)`` tuples from earlier turns.
      When omitted, a new empty history is started for this call. A list
      passed in by the caller is appended to in place.

  Returns:
    A ``(resp, chat_history)`` tuple: the chain's response (the answer is
    under ``resp['answer']``) and the history including this turn.
  """
  from langchain.chat_models import ChatOpenAI
  from langchain.chains import ConversationalRetrievalChain

  # a `[]` default would be created once and shared by every call that
  # omits chat_history, leaking turns between unrelated conversations
  if chat_history is None:
    chat_history = []

  llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1.0)
  retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

  crc = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
  resp = crc({'question': query, 'chat_history': chat_history})
  chat_history.append((query, resp['answer']))
  return resp, chat_history


#### 5. Main Business Logic

In [None]:
# load csv
# read the engagement csv into documents
csv_path = './netflix-engagement-jan-jun-2023.csv'
data = load_document_local(csv_path)
print(f'total records: {len(data)}')

# split the documents into chunks
chunks = chunk_data(data)
print(f'total chunks: {len(chunks)}')

# estimate what embedding those chunks will cost
cost = get_embedding_cost(chunks)
print(f'embedding cost: {cost:.6f}')

# build the pinecone index on first run, reuse it afterwards
# (uncomment the delete call below to force a rebuild)
idx_name = 'netflix-engagement-jan-jun-2023'
# delete_pinecone_index(idx_name)
vector_store = insert_or_fetch_embeddings(idx_name, chunks)

#### 6. Start Asking Questions

In [None]:
# words that end the session (matched case-insensitively)
quit_words = {'quit', 'exit', 'bye', 'q'}

while True:

  # trim stray whitespace so ' quit ' and blank lines are recognised
  query = input('your prompt: ').strip()

  # nothing typed: ask again instead of sending an empty prompt to the LLM
  if not query:
    continue

  # break if user wants to quit
  if query.lower() in quit_words:
    print('Goodbye!')
    break

  # else ask the model for a response
  resp = get_answers(vector_store, query)
  print(f'prompt: {query}')
  print(f'resp: {resp}')
  print('_' * 50)

#### 7. Start Asking Questions With Memory

In [None]:
# conversation so far, as (question, answer) tuples
chat_history = []

# words that end the session (matched case-insensitively)
quit_words = {'quit', 'exit', 'bye', 'q'}

while True:

  # trim stray whitespace so ' quit ' and blank lines are recognised
  query = input('your prompt: ').strip()

  # nothing typed: ask again instead of sending an empty prompt to the LLM
  if not query:
    continue

  # break if user wants to quit
  if query.lower() in quit_words:
    print('Goodbye!')
    break

  # else ask the model for a response, carrying the history forward
  resp, chat_history = get_answers_with_memory(vector_store, query, chat_history)
  print(f'prompt: {query}')
  print(f'resp: {resp["answer"]}')
  print(f'chat_history: {chat_history}')
  print('_' * 50)