# Using OpenAI

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("data").load_data()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
documents

[Document(id_='0a6ae125-aed9-4ba3-893c-8c287955301c', embedding=None, metadata={'page_label': '1', 'file_name': 'Xangars.pdf', 'file_path': 'c:\\Users\\soham\\Desktop\\LangChain\\RAG Pipeline\\data\\Xangars.pdf', 'file_type': 'application/pdf', 'file_size': 81884, 'creation_date': '2024-05-24', 'last_modified_date': '2024-05-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Home\nCaution:\nXangars\nInfratech\nSolutions\nPvt.\nLtd.\nis\nthe\nonly\nXangars™.\nDo\nnot\nmistake\nfraudulent\nimitation\ncompanies\noffering\nemployment\nand\nservices\nusing\nthe\nXangars™\nname.\nNavigation\n●\nHome\n●\nApply\nNow\n●\nWork@Xangars\n●\nLife@Xangars\n●\nAbout\n●\nWho\nWe\nAre\n●\nOur\nClients\n●\nLocations\n●\nTechnology\n●\nInsure\nTech\n●\nContact\nUs\nWhy\

In [5]:
index = VectorStoreIndex.from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 10/10 [00:00<00:00, 416.52it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  7.67it/s]


In [6]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x17a9a1f6810>

In [7]:
query_engine = index.as_query_engine()

In [7]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine

retriever = VectorIndexRetriever(index=index, similarity_top_k=4)
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.75)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[postprocessor],
)

In [22]:
# response = query_engine.query("On which social media platforms we can see Xangars?, Can you also give the link for that")
# response = query_engine.query("Who is Tanvi Sharma?")
# response = query_engine.query("Why should I work at Xangars?")
# response = query_engine.query("How can I apply to Xangars?, Can you also provide link for that?")
response = query_engine.query("What are the positions we can apply in Xangars?")
# response = query_engine.query("Who are the clients of Xangars?")
# response = query_engine.query("What benefits does Xangars offer to full-time employees?")

In [23]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)
# print(response)

Final Response: Software Developer and HR Team positions are available
to apply for at Xangars.
______________________________________________________________________
Source Node 1/4
Node ID: fe155118-9cef-4ff6-ae8b-5809916f7aaf
Similarity: 0.8422941093761509
Text: culture in any other company. They will listen! That is a really
good thing." - Divya Kanparia, Xangars Software Developer Moreover, we
offer the following benefits to full-time employees: ● 24 leaves per
working year ● 6 months maternity leave ● Medical insurance Diversity
As a global company, diversity is in our DNA. We actively recruit and
fo...
______________________________________________________________________
Source Node 2/4
Node ID: 66db7cd5-eb7b-46ba-aba3-e302f515a8de
Similarity: 0.8262494565502201
Text: Home Caution: Xangars Infratech Solutions Pvt. Ltd. is the only
Xangars™. Do not mistake fraudulent imitation companies offering
employment and services using the Xangars™ name. Navigation ● Home ●
Apply Now ● Wor

# Using Gemini

In [2]:
os.environ['GEMINI_API_KEY'] = os.getenv("GEMINI_API_KEY")

In [3]:
import os
import google.generativeai as genai
import re
from pypdf import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


Load Pdf Function

In [4]:
def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The concatenated text content of all pages in the PDF.
    """
    reader = PdfReader(file_path)

    # Loop over each page and store it in a variable
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text


Load PDFs from Directory Function

In [5]:
def load_pdfs_from_directory(directory_path):
    """
    Reads all PDF files from a directory and concatenates their text content.

    Parameters:
    - directory_path (str): The path to the directory containing PDF files.

    Returns:
    - str: The concatenated text content of all PDFs in the directory.
    """
    combined_text = ""
    
    # Loop over each file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory_path, filename)
            combined_text += load_pdf(file_path)
    
    return combined_text


Split Text Function

In [6]:
def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings based on the specified pattern.
    The "\n \n" pattern will split the document para by para

    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: A list containing non-empty substrings obtained by splitting the input text.
    """
    split_text = re.split(r'\n\s*\n', text)
    return [i for i in split_text if i.strip() != ""]


Load and Process PDFs

In [7]:
# Replace the path with your directory path
directory_path = r"data"
all_pdfs_text = load_pdfs_from_directory(directory_path)
chunked_text = split_text(text=all_pdfs_text)


Verify the Chunks

In [8]:
# Print chunked text for verification (optional)
for i, chunk in enumerate(chunked_text):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*50}\n")

Chunk 1:
Piyush Malviya
Web Developer
About Me
Professional Experience
Projects"As a dedicated Computer Engineering student, I am
passionate about leveraging technology to solve
complex problems. Proficient in programming
languages such as C,Java,JavaScript. My coursework
has equipped me with knowledge in Data Structures,
Operating Systems, DBMS, Analysis of Algorithm etc
and I am adept at collaborating in team environments.
Hard Skill
Java
 C
 Data Structures
  MERN Stack
 Web development
 Artificial Intelligence, Deep learning
Soft Skill
Communication
Time Management
Leadership
Adaptability
Education BackgroundMy Contact
malviyapiyush744@gmail.com
Mumbai.India 9004530891
http://www.linkedin.com/in/piyush-
malviya-9a819724a
Snappy — Real time chat application "Developed a real-time
chat application using React for the front end, Node.js and
Express for the server, Socket.io for real-time communication,
and MongoDB for data storage. The application is responsive
and styled using Styled

In [9]:
class GeminiEmbeddingFunction:
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class uses a list to store documents and directly passes it to the API.
    """
    def __call__(self, input: list) -> list:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        response = genai.embed_content(model=model,
                                       content=input,  # Pass documents directly
                                       task_type="retrieval_document",
                                       title=title)
        return response["embedding"]


In [10]:
# Example usage (assuming you have your documents in a list named `chunked_text`)
embedding_function = GeminiEmbeddingFunction()
embeddings = embedding_function(chunked_text)
# The `embeddings` dictionary will contain the generated embedding information

print(embeddings)

[[0.023856727, -0.07120652, -0.022611776, -0.04398759, 0.060492326, 0.0073271706, 0.017052762, -0.00013126034, 0.016264765, 0.02745753, -0.008973525, 0.037594035, -0.052452285, -0.011599275, 0.012249292, -0.040934615, -0.032409064, -0.012297112, 0.0053334176, -0.009393403, 0.035250947, -0.019576162, 0.01453795, -0.0357358, 0.01931854, -0.0077099097, -0.008313586, -0.029181631, -0.05460287, 0.06586123, -0.029698078, 0.044256885, -0.046699967, 0.01033952, -0.013325517, -0.044093862, -0.040156946, 0.052303, -0.010392453, 0.04251214, 0.02462576, -0.020578634, -0.029886896, -0.0099197235, -0.015694449, -0.021038488, -0.023698919, 0.017678924, 0.02874683, -0.039521933, 0.025878336, 0.012548636, 0.016351493, -0.018169269, 0.04341597, 0.05271816, 0.04631879, 0.0014580679, 0.032588996, 0.010418512, 0.02306712, 0.010210882, -0.01773213, 0.052713856, -0.017365012, -0.039551467, -0.060051773, 0.03140145, 0.010923344, -0.009293717, -0.013929951, -0.014081405, 0.10500787, -0.06947021, -0.08266426, -

In [11]:
%pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [22]:
from typing import List
import chromadb

In [23]:
def create_chroma_db(documents: List, path: str, name: str):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=[d], ids=[str(i)])  # Adjusted to match expected input format

    return db, name


In [25]:
db, name = create_chroma_db(documents=chunked_text, 
                            path="contents",  # Replace with your path
                            name="Dimple1")


In [27]:
def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db

db=load_chroma_collection(path="contents", name="Dimple1")

In [28]:
def get_relevant_passage(query, db, n_results):
    passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
    return passage


In [29]:
# Assuming you have a database object `db` that you can query
relevant_text = get_relevant_passage(query="Who is Piyush?", db=db, n_results=3)
print(relevant_text)


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


['Piyush Malviya\nWeb Developer\nAbout Me\nProfessional Experience\nProjects"As a dedicated Computer Engineering student, I am\npassionate about leveraging technology to solve\ncomplex problems. Proficient in programming\nlanguages such as C,Java,JavaScript. My coursework\nhas equipped me with knowledge in Data Structures,\nOperating Systems, DBMS, Analysis of Algorithm etc\nand I am adept at collaborating in team environments.\nHard Skill\nJava\n C\n Data Structures\n  MERN Stack\n Web development\n Artificial Intelligence, Deep learning\nSoft Skill\nCommunication\nTime Management\nLeadership\nAdaptability\nEducation BackgroundMy Contact\nmalviyapiyush744@gmail.com\nMumbai.India 9004530891\nhttp://www.linkedin.com/in/piyush-\nmalviya-9a819724a\nSnappy — Real time chat application "Developed a real-time\nchat application using React for the front end, Node.js and\nExpress for the server, Socket.io for real-time communication,\nand MongoDB for data storage. The application is responsive

In [35]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [43]:
def generate_text_answer(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = "models/generative-001"
    response = genai.generate_text(model=model, prompt=prompt)
    return response['generated_text']


In [44]:
def get_answer(db, query):
    relevant_text = get_relevant_passage(query, db, n_results=3)
    prompt = make_rag_prompt(query, relevant_passage="".join(relevant_text))
    answer = generate_text_answer(prompt)
    return answer


In [45]:
db = load_chroma_collection(path="contents", name="Dimple1")  # Replace with your path and collection name
query = "what is the purpose of SOP?"
answer = get_answer(db, query)
print(answer)


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


NotFound: 404 models/generative-001 is not found for API version v1beta, or is not supported for generateText. Call ListModels to see the list of available models and their supported methods.