In [None]:
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown

__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import shutil

import os
import chromadb
import re
import time
from pypdf import PdfReader
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
from tqdm.auto import tqdm
# SECURITY: never commit an API key to a notebook. The key is taken from the
# GEMINI_API_KEY environment variable; when it is missing, it is requested
# interactively (the typed value is not echoed and is not stored in the file).
# Any key that was previously hardcoded in this cell must be considered leaked
# and should be revoked/rotated in the Google AI console.
import getpass

if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("GEMINI_API_KEY: ")

In [None]:
def to_markdown(text):
  """Wrap `text` in an IPython `Markdown` object rendered as a block quote.

  Bullet characters ('•') are rewritten as indented Markdown list markers
  ('  *'), then every line, blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes textwrap.indent prefix blank lines too.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Configure the google.generativeai client with the key read from the
# environment; the subscript lookup raises KeyError if GEMINI_API_KEY is unset.
genai.configure(api_key=os.environ['GEMINI_API_KEY'])

In [None]:
# Handle to the "models/gemini-1.5-flash" generative model used by the demo cells below.
model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

In [None]:
# Smoke test: send a free-form prompt (Portuguese for "Give me an example of a
# public-tender notice") and render the reply as a Markdown block quote.
response = model.generate_content("Me forneça um exemplo de edital de licitação")
to_markdown(response.text)

In [None]:
# Token count for a sample prompt (Portuguese for "Write me an audit report
# about company XPTO"); as the cell's last expression, the result is displayed.
model.count_tokens("Me faça um relatório de auditoria sobre a empresa XPTO")

In [None]:
# Token count for the text of the response generated earlier in the notebook.
model.count_tokens(response.text)

In [None]:
# Renders the same `response.text` again; this repeats the display already
# produced by the generation cell. NOTE(review): likely a leftover, consider removing.
to_markdown(response.text)

## RAG with Gemini


In [1]:
def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Page texts are joined with a newline so that the last line of one page is
    never fused with the first line of the next one (the downstream splitter
    cuts on newlines). A page whose extraction yields nothing contributes an
    empty string instead of breaking the concatenation.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The text of all pages separated by newlines ("" when the PDF has no pages).
    """
    reader = PdfReader(file_path)

    # `or ""` guards against an extractor returning None for a page without text;
    # str.join avoids repeated string re-allocation from `+=` in a loop.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

In [2]:
def split_text(text: str):
    r"""
    Splits a text string into chunks, one chunk per non-blank line.

    The text is cut at every newline character ("\n"). Lines that are empty or
    contain only whitespace are discarded, so no blank chunk is ever handed to
    the embedding step. Kept lines are returned unmodified (not stripped).

    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: The non-blank lines of `text`, in their original order
      (an empty list for empty or all-blank input).
    """
    return [line for line in text.split("\n") if line.strip()]

In [5]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function backed by the Gemini embedding API.

    Calling an instance embeds the given documents with the
    "models/text-embedding-004" model, using the "retrieval_document" task type
    and the fixed title "Custom query". The API key is read from the
    GEMINI_API_KEY environment variable on every call.

    NOTE(review): a collection created with this function presumably uses it for
    query texts as well, so queries would also be embedded as
    "retrieval_document" — confirm this is intended.
    """
    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed `input` and return the resulting embeddings.

        Parameters:
        - input (Documents): A collection of documents to be embedded.

        Returns:
        - Embeddings: The "embedding" entry of the API response.

        Raises:
        - ValueError: If GEMINI_API_KEY is missing or empty.
        """
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=api_key)
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [6]:
def create_chroma_db(documents: list, path:str, name:str, batch_size: int = 100):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Documents are inserted in batches so that each `add` call (and therefore each
    embedding request) covers up to `batch_size` documents instead of one.
    Document IDs are the stringified positions ("0", "1", ...) of the documents
    in the input order.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.
    - batch_size (int): Maximum number of documents sent per `add` call. Defaults to 100.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.

    Raises:
    - ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so any iterable (not only sized sequences) is accepted.
    documents = list(documents)

    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    # The progress bar advances once per batch; an empty input performs no `add`.
    for start in tqdm(range(0, len(documents), batch_size)):
        batch = documents[start:start + batch_size]
        batch_ids = [str(i) for i in range(start, start + len(batch))]
        db.add(documents=batch, ids=batch_ids)

    return db, name

In [7]:
def load_chroma_collection(path, name):
    """
    Opens a collection that already exists in a persistent Chroma database.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The collection, bound to a GeminiEmbeddingFunction.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

In [8]:
def get_relevant_passage(query, db, n_results):
  """Return the documents retrieved from `db` for a single query.

  Parameters:
  - query (str): The query text.
  - db: Object exposing `query(query_texts=..., n_results=...)`.
  - n_results (int): Number of results requested from `db`.

  Returns:
  - The first entry of the result's 'documents' field, i.e. the documents
    matched for `query` (the only query submitted).
  """
  results = db.query(query_texts=[query], n_results=n_results)
  # Exactly one query text was sent, so its matches are at index 0.
  return results['documents'][0]

In [9]:
# Location of the PDF to index. The default is the original absolute path on the
# JupyterHub storage; set the EDITAL_PDF_PATH environment variable to run the
# notebook against a file stored elsewhere without editing this cell.
PDF_PATH = os.environ.get(
    "EDITAL_PDF_PATH",
    "/var/projetos/Jupyterhubstorage/victor.silva/HelBERTModel/Codigos/LLMs/Documentos/EDITAL CORRETO.pdf",
)
pdf_text = load_pdf(PDF_PATH)

In [None]:
# Split the extracted PDF text into chunks.
chunked_text = split_text(text=pdf_text)

# Always start from an empty ./db directory: remove a previous one if present,
# then create it afresh.
if os.path.exists("./db"):
    shutil.rmtree("./db")
os.makedirs("./db")

# Index the chunks into a new "rag_experiment" collection stored under ./db.
db, name = create_chroma_db(documents=chunked_text, path="./db", name="rag_experiment")

In [None]:
def make_rag_prompt(query, relevant_passage):
  """Build the Portuguese RAG prompt for `query` grounded on `relevant_passage`.

  The passage is sanitized before insertion: single and double quotes are
  removed and newlines become spaces. The query is inserted unchanged.

  Parameters:
  - query (str): The user's question.
  - relevant_passage (str): The reference text retrieved for the question.

  Returns:
  - str: The complete prompt, ending with the 'RESPOSTA:' cue.
  """
  # One-pass equivalent of removing ' and " and turning newlines into spaces.
  cleanup_table = str.maketrans({"'": None, '"': None, "\n": " "})
  escaped = relevant_passage.translate(cleanup_table)

  template = """Você é um bot prestativo e informativo que responde a perguntas usando o texto do trecho de referência incluído abaixo. \
  Certifique-se de responder de forma completa, sendo abrangente, incluindo todas as informações relevantes de contexto. \
  No entanto, você está falando com um público não técnico, por isso, é importante simplificar conceitos complicados e \
  manter um tom amigável e conversacional. \
  Se o trecho não for relevante para a resposta, você pode ignorá-lo.
  PERGUNTA: '{query}'
  TRECHO: '{relevant_passage}'

  RESPOSTA:
  """
  return template.format(query=query, relevant_passage=escaped)

In [None]:
def generate_answer(query, db, n_results=3):
    """
    Answers `query` with Gemini, grounded on passages retrieved from `db`.

    The `n_results` best-matching passages are fetched, joined with a single
    space (so the last word of one passage is not fused with the first word of
    the next), wrapped in the RAG prompt and sent to the
    'models/gemini-1.5-flash' model.

    Parameters:
    - query (str): The user's question.
    - db: The Chroma collection to search.
    - n_results (int): Number of passages to retrieve. Defaults to 3.

    Returns:
    - str: The text of the model's answer.

    Raises:
    - ValueError: If GEMINI_API_KEY is missing or empty.
    """
    # Fail fast on a missing key, before any retrieval work is attempted.
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

    relevant_text = get_relevant_passage(query, db, n_results=n_results)
    prompt = make_rag_prompt(query, relevant_passage=" ".join(relevant_text))

    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('models/gemini-1.5-flash')
    answer = model.generate_content(prompt)
    return answer.text

In [None]:
# Re-open the persisted "rag_experiment" collection from ./db and rebind `db` to it,
# so the question cell can run without rebuilding the index.
db=load_chroma_collection(path="./db", name="rag_experiment")

In [None]:
# Ask a question about the indexed document (Portuguese for "What is the object
# of the tender?") and print the generated answer as plain text.
answer = generate_answer(db=db, query="Qual é o objeto da licitação?")
print(answer)