In [6]:
import os
import re
import nltk
import bs4
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_community.vectorstores.utils import filter_complex_metadata
import google.generativeai as genai

from langchain_google_genai import ChatGoogleGenerativeAI
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import langchain.vectorstores as vectorstores
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.llms import Cohere
from langchain.retrievers import  ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.prompts import PromptTemplate

In [7]:
def load_dot_env():
    """Read the Gemini API key from the project's env file.

    Loads ``../keys.env`` with ``load_dotenv`` and then looks up the
    ``Gemini_key`` environment variable.

    Returns:
        The value of ``Gemini_key``, or ``None`` when the variable is not set.
    """
    load_dotenv("../keys.env")
    return os.getenv("Gemini_key")
    
def load_model(llm_api_key):
    """Create the Gemini chat model and the embedding client.

    Args:
        llm_api_key: Google API key passed to both clients.

    Returns:
        A ``(model, embeddings)`` tuple: the ``ChatGoogleGenerativeAI``
        instance followed by the ``GoogleGenerativeAIEmbeddings`` instance.
    """
    chat_model = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro-latest",
        google_api_key=llm_api_key,
        temperature=0.2,
        convert_system_message_to_human=True,
    )
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=llm_api_key,
    )
    return chat_model, embedder


In [8]:
# Read the API key once, then build the chat model and embedding client from it.
llm_api_key = load_dot_env()
llm, embeddings = load_model(llm_api_key)

In [9]:
import re
def load_document(document_path):
    """Load a PDF through ``PyPDFLoader``.

    Args:
        document_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load_and_split()`` produces for the file.
    """
    return PyPDFLoader(document_path).load_and_split()
def split_text(pages, chunk_size=10000, chunk_overlap=1000):
    """Join the page contents and re-split them into text chunks.

    Args:
        pages: Iterable of objects exposing a ``page_content`` attribute.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to the previously hard-coded 10000.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
            Defaults to the previously hard-coded 1000.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_text`` applied to
        all page contents joined by blank lines.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # `page`, not `p`: the notebook imports `Path as p`, so reusing that name
    # here is confusing even though the generator has its own scope.
    full_text = "\n\n".join(str(page.page_content) for page in pages)
    return splitter.split_text(full_text)

# Compiled once at definition time instead of on every call.
# The ranges are listed explicitly on purpose: a single span from U+24C2 to
# U+1F251 would also cover CJK ideographs, Hangul, typographic ligatures
# (U+FB01 "fi", U+FB02 "fl" - common in PDF-extracted text), box drawing and
# supplemental math operators, silently deleting real content.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # enclosed alphanumeric / ideographic supplements
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(string):
    """Strip emoji characters from ``string``.

    Only code points in the emoji-related blocks listed in ``_EMOJI_PATTERN``
    are removed; all other text (including non-Latin scripts and ligatures)
    is left untouched.

    Args:
        string: Text to clean.

    Returns:
        ``string`` with every run of matching characters removed.
    """
    return _EMOJI_PATTERN.sub("", string)


def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents joined by ``"\\n\\n"``.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


In [10]:
# Load the book and pass its documents through filter_complex_metadata.
document = load_document("../docs/Practical Statistics for Data Scientists.pdf")
filtered_docs = filter_complex_metadata(document)

# Split into chunks, then strip emoji from every chunk.
splits = split_text(filtered_docs)
splits = [remove_emojis(chunk) for chunk in splits]



In [11]:
def build_vector_index(texts, embeddings, persist_directory="./Database", reuse_existing=True):
    """Return a Chroma store for ``texts``, embedding the corpus at most once.

    The store is created directly in ``persist_directory`` and that same store
    is returned, so the texts are not embedded a second time for a separate
    in-memory copy.

    Args:
        texts: Text chunks to index.
        embeddings: Embedding function/client handed to Chroma.
        persist_directory: Directory holding the on-disk Chroma data.
        reuse_existing: When true and ``persist_directory`` already contains
            files, open that store instead of embedding ``texts`` again. In
            that case ``texts`` is NOT consulted, so delete the directory (or
            pass ``reuse_existing=False``) after the source document changes.
            With ``False`` the texts are added to whatever the directory
            already holds.

    Returns:
        The ``Chroma`` vector store backed by ``persist_directory``.
    """
    has_saved_store = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
    if reuse_existing and has_saved_store:
        return Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    store = Chroma.from_texts(texts, embeddings, persist_directory=persist_directory)
    # persist() is deprecated in recent langchain releases; presumably only
    # older Chroma versions need the explicit call to flush to disk (TODO
    # confirm against the installed version). Guarded so its removal does not
    # break this cell.
    persist = getattr(store, "persist", None)
    if callable(persist):
        persist()
    return store


# The builder has its own name so that `vector_index` always means the store
# (later cells use it that way) and never shadows the function that made it.
vector_index = build_vector_index(splits, embeddings)


  warn_deprecated(


In [12]:
def get_retriever(vector_index, k=20):
    """Create a retriever from a vector store.

    Args:
        vector_index: Object exposing ``as_retriever(search_kwargs=...)``.
        k: Value passed as the ``"k"`` entry of ``search_kwargs``.

    Returns:
        Whatever ``vector_index.as_retriever`` returns.
    """
    search_options = {"k": k}
    return vector_index.as_retriever(search_kwargs=search_options)
# Build the retriever over the vector store using get_retriever's default k (20).
retriever = get_retriever(vector_index)

In [None]:
def retrieve_compressed_documents(retriever, question, top_k=5):
    """Retrieve documents for ``question`` and rerank them with Cohere.

    Args:
        retriever: Base retriever supplying the candidate documents.
        question: Query string.
        top_k: Passed to ``CohereRerank`` as ``top_n``.

    Returns:
        The documents returned by the ``ContextualCompressionRetriever``
        built from ``retriever`` and the Cohere reranker.
    """
    reranker = CohereRerank(top_n=top_k)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=reranker, base_retriever=retriever
    )
    # `invoke` is the Runnable entry point (the notebook already uses it for
    # the answer chain); `get_relevant_documents` is deprecated.
    return compression_retriever.invoke(question)

# Usage example: retrieve and rerank context for a sample question,
# using retrieve_compressed_documents' default top_k (5).
question = "What is data science"
compressed_documents = retrieve_compressed_documents(retriever, question)

In [None]:
# Re-ran

def get_reranked_answer(prompt, llm):
    """Compose the answer chain: prompt, then model, then string parsing.

    Args:
        prompt: Prompt runnable that receives the chain input.
        llm: Chat model runnable fed with the rendered prompt.

    Returns:
        The composed chain ``prompt | llm | StrOutputParser()``; it is not
        invoked here.
    """
    return prompt | llm | StrOutputParser()


# Prompt with two slots: the retrieved context and the user question.
prompt_template = """
Answer the following question based on the provided context.

{context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Chain
question = "What is data science"
# get_reranked_answer accepts (prompt, llm) only; passing the documents as a
# third argument raised TypeError. The documents are supplied at invoke time.
rag_chain = get_reranked_answer(prompt, llm)

# Feed the documents' text into the prompt. Passing the Document list itself
# would interpolate its repr (metadata and all) into {context}.
answer = rag_chain.invoke(
    {"context": format_docs(compressed_documents), "question": question}
)
print(answer)

In [16]:
pip freeze > requirements_advanced_rag.txt

Note: you may need to restart the kernel to use updated packages.
