<a href="https://colab.research.google.com/github/tararajagopalan/WebsiteQueryingProject/blob/main/RAGCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai

In [None]:
import os
from openai import OpenAI

In [None]:
from google.colab import userdata
from openai import OpenAI

# Create the OpenAI client. The API key is read from Colab's secrets store under the
# name 'openapikey', so it never has to be written into the notebook itself.
openai_client = OpenAI(api_key=userdata.get('openapikey'))

In [None]:
#milvus installations

!pip install -U pymilvus
!pip install --upgrade pymilvus
!pip install "pymilvus[model]"

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files stored there.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# SETTING UP VECTOR DATABASE: open the Milvus database kept on the mounted Google Drive.
# The database lives in the project_work.db file under MyDrive, so the Drive mount cell
# must have been run before this one.
from pymilvus import MilvusClient

milvus_client = MilvusClient("/content/drive/MyDrive/project_work.db")

In [None]:
# Display the description of "project_collection" as a check that the collection is available.
# NOTE(review): no visible cell creates or fills this collection, so it is presumably
# populated by a separate ingestion step — confirm before running against a fresh database file.
milvus_client.describe_collection("project_collection")

In [None]:
# Embedding function used below to turn the question into a query vector.
# NOTE(review): this presumably has to be the same embedding model that produced the vectors
# stored in project_collection — confirm against the ingestion code.
from pymilvus import model

embedding_fn = model.DefaultEmbeddingFunction()

In [None]:
# The question to answer: it is embedded for the Milvus search and later inserted into the LLM prompt.

question = "How to train a dog?"

In [None]:
# Embed the question into query vectors and define the settings for the Milvus search.

query_vectors = embedding_fn.encode_queries([question])

# "metric_type" depends on the collection's index ("L2" here, or "IP").
search_params = dict(metric_type="L2", params=dict(nprobe=10))

In [None]:
# Query the project_collection Milvus collection for the entries most similar to the question.

search_res = milvus_client.search(
    collection_name="project_collection",
    anns_field="embedding",  # field the similarity search runs against
    data=query_vectors,
    search_params=search_params,
    output_fields=["text", "FilePath", "FileExtension"],  # fields returned with each hit (like the SELECT list in SQL)
    limit=5,  # return at most the 5 most similar entries
)

# Print the raw search result (up to 5 hits for the query).
print(search_res)

In [None]:
# Pair each hit's text with its distance to the query, then pretty-print the pairs as JSON.
import json

# search_res[0] is indexed because a single query vector was submitted.
retrieved_lines_with_distances = [
    (hit["entity"]["text"], hit["distance"]) for hit in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

In [None]:
# Using the LLM to get a RAG response

# Join the retrieved texts (dropping the distances) into one newline-separated context string.
context = "\n".join(
    text for text, _distance in retrieved_lines_with_distances
)

In [None]:
# Define the system and user prompts for the language model. The user prompt is assembled
# from the documents retrieved from Milvus.

# SYSTEM_PROMPT: instructs the model on how to behave.
# USER_PROMPT: tells the model to answer the question of interest using the retrieved
# documents as context. It is an f-string, so it captures the values of `context` and
# `question` when this cell runs; re-run this cell after changing either of them.

SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [None]:
# Send the system and user prompts to the OpenAI chat model and print the generated answer.

response = openai_client.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=USER_PROMPT),
    ],
    model="gpt-4o",
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)