# Loading PDF file

In [2]:
import os
import unstructured
os.environ["GEMINI_API_KEY"]="your_key"

In [28]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The concatenated text content of all pages in the PDF.

    Raises:
    - FileNotFoundError: If the specified file_path does not exist.
    - PyPDF2.utils.PdfReadError: If the PDF file is encrypted or malformed.

    Example:
    >>> pdf_text = load_pdf("example.pdf")
    >>> print(pdf_text)
    "This is the text content extracted from the PDF file."
    """
    # Logic to read pdf
    reader = PdfReader(file_path)

    # Loop over each page and store it in a variable
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text

#for i in range(20):
 #   
 #   text.append(load_pdf(file_path=f"scraping\pdf_downloads\child_page_{i+1}.pdf"))
text = load_pdf(file_path=f"scraping\pdf_downloads\child_page_1.pdf")

  text = load_pdf(file_path=f"scraping\pdf_downloads\child_page_1.pdf")


In [29]:
text

'Celiaki\n\x01\x02\x03\x04\x05\x06\x04\x07\x08\x07\t\x03\n\x0b\x02\x0c\x04\x0c\x0b\r\x03\x02\x0e\x05\x0c\x0f\nCeliaki\tinnebär\tatt\tdu\tinte\ttål\tproteinet\tgluten\tsom\tfinns\ti\tvete,\tråg\toch\tkorn.\tDin\ttunntarm\tblir\tinflammerad\noch\tkan\tinte\tta\tupp\tnäring\tsom\tden\tska.\tDe\tvanligaste\tsymtomen\tär\tmagbesvär,\ttrötthet\toch\tatt\tgå\tner\ti\nvikt.\tBehandlingen\tgår\tut\tpå\tatt\tsluta\täta\tgluten.\tDet\tgör\tatt\ttarmen\tläker\toch\tsymtomen\tgår\töver\tinom\nnågra\tmånader.\nCeliaki\tkallas\tockså\tför\tglutenintolerans.\tDet\tgår\tatt\tfå\tceliaki\tsom\tbarn\teller\tsom\tvuxen.\tDen\thär\ttexten\thandlar\tom\nceliaki\thos\tvuxna.\tHär\tkan\tdu\tläsa\tmer\tom\t\nceliaki\thos\tbarn.\n\x10\x11\x12\x0b\r\x12\nDu\tkan\tfå\tett\teller\tflera\tav\tföljande\tsymtom\tom\tdu\thar\tobehandlad\tceliaki:\nVilka\tsymtom\tman\tfår\toch\thur\tkraftiga\tde\tär\tkan\tskilja\tsig\tfrån\tperson\ttill\tperson.\tSymtomen\tkan\tockså\tändras\tmed\nåldern.\t\t\n\x13\x04\x05\x0e\x0e\x14\

# Splitting the text

In [32]:
import re

def clean_text(text):
    # Replace all \n and \t characters with a space
    clean_text = re.sub(r'[\n\t]', ' ', text)
    # Optionally remove extra spaces
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    clean_text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', clean_text)  # Remove control characters
    return clean_text

In [33]:
pdf_text = clean_text(text)
pdf_text

'Celiaki       Celiaki innebär att du inte tål proteinet gluten som finns i vete, råg och korn. Din tunntarm blir inflammerad och kan inte ta upp näring som den ska. De vanligaste symtomen är magbesvär, trötthet och att gå ner i vikt. Behandlingen går ut på att sluta äta gluten. Det gör att tarmen läker och symtomen går över inom några månader. Celiaki kallas också för glutenintolerans. Det går att få celiaki som barn eller som vuxen. Den här texten handlar om celiaki hos vuxna. Här kan du läsa mer om celiaki hos barn.   Du kan få ett eller flera av följande symtom om du har obehandlad celiaki: Vilka symtom man får och hur kraftiga de är kan skilja sig från person till person. Symtomen kan också ändras med åldern.         Det är vanligt att ha besvär från magen och tarmarna vid obehandlad celiaki. Det kan till exempel vara diarré som inte går över, gaser i magen och ibland förstoppning . Att ha besvär från magen och tarmarna behöver inte bero på celiaki. Ibland kan det vara symtom på s

In [6]:
# not used
import re
def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings based on the specified pattern.
    The "\n \n" pattern will split the document para by para
    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: A list containing non-empty substrings obtained by splitting the input text.

    """
    split_text = re.split('\n \n', text)
    return [i for i in split_text if i != ""]

In [7]:
#text = split_text(pdf_text)

# Embedding the text

In [8]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.

    Raises:
    - ValueError: If the Gemini API Key is not provided as an environment variable (GEMINI_API_KEY).

    Example:
    >>> gemini_embedding_function = GeminiEmbeddingFunction()
    >>> input_documents = Documents(["Document 1", "Document 2", "Document 3"])
    >>> embeddings_result = gemini_embedding_function(input_documents)
    >>> print(embeddings_result)
    Embeddings for the input documents generated by the Gemini AI API.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]


  from .autonotebook import tqdm as notebook_tqdm


# Storing vectors into DB

In [9]:
import chromadb
def create_chroma_db(documents, path, name):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=d, ids=str(i))

    return db, name


In [10]:
# delete old "contents"
#import shutil
#shutil.rmtree("contents")

In [11]:
db,name =create_chroma_db(documents=text, path="contents", name="rag_experiment")

UniqueConstraintError: Collection rag_experiment already exists

In [12]:
def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db


In [13]:
db=path=load_chroma_collection("contents", name="rag_experiment")

# Retrieval

In [14]:
def get_relevant_passage(query, db, n_results):
  passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
  return passage

In [15]:
relevant_text = get_relevant_passage("anpassa träningen",db,3)
relevant_text

[' ', ' ', ' ']

# Generation

In [16]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [17]:
import google.generativeai as genai
def generate_response(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text

# Bringing it all together

In [18]:

def generate_answer(db,query):
    #retrieve top 3 relevant text chunks
    relevant_text = get_relevant_passage(query,db,n_results=3)
    prompt = make_rag_prompt(query, 
                             relevant_passage="".join(relevant_text)) # joining the relevant chunks to create a single passage
    answer = generate_response(prompt)

    return answer
    
    
    

In [19]:
db=load_chroma_collection(path="contents", #replace with path of your persistent directory
                          name="rag_experiment") #replace with the collection name

answer = generate_answer(db,query="Hur kan man anpassa träningen?")
print(answer)


I'm afraid I can't answer that question because the reference passage does not have any information on how to customize training.
