In [1]:
# Import the libraries
import os, json, ast
import openai
import pandas as pd
import csv
from pathlib import Path
from tenacity import retry, wait_random_exponential, stop_after_attempt
from llama_index.core import VectorStoreIndex,Document
from llama_index.readers.file import CSVReader
from llama_index.llms.openai import OpenAI
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings  # <-- NEW instead of ServiceContext
from json2html import *
from IPython.display import display, HTML

### **Installing the required libraries (uncomment the cell below when running in Google Colab)**

In [2]:
#!pip install -qU openai
#!pip install json2html
#!pip install llama-index


**Mounting Google Drive for Colab**<br>
If you are testing in Colab, uncomment this section and make sure the folders referenced in the COLAB PATHS section exist.

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

**COLAB PATHS: uncomment the cell below (and create the folders as applicable) if you are running in Colab**

In [4]:
# base_path = "/content/drive/MyDrive/01OpenAI/"
# path_books=base_path+"Books/books_gpt.csv" #Source: Chatgpt
# path_api_key=base_path+"Resources/OpenAI_API_Key.txt"

### Configuring the File Paths

In [5]:
# Build the input file locations from the current working directory with
# os.path.join so they resolve on Windows, Linux and macOS alike (a hard-coded
# "\\" separator only produces valid paths on Windows).
base_path = os.path.join(os.getcwd(), "")  # trailing separator kept so base_path + "name" still works
path_books = os.path.join(base_path, "books_gpt.csv")  # Source: Chatgpt
path_api_key = os.path.join(base_path, "OpenAI_API_Key.txt")


**Reading the OpenAI API key from the key file**

In [6]:
# Read the key file as a single string and strip surrounding whitespace, so a
# trailing newline (or blank lines) in the file does not become part of the key.
with open(path_api_key, "r") as f:
  openai.api_key = f.read().strip()

# Stage 1 - Initialization , Intent Clarity and Confirmation

**Initializes the conversation with the end user**
* Transparency: Tells the user they are interacting with an LLM-based system.
* Explanation: Tells the user what they can expect from this HelpMate bot and how to phrase their prompt for a better response.
* Disclaimer & Warning: Discourages unwarranted inputs (for example, violent or otherwise harmful requests).




In [7]:
def initialize_conversation():
  """Print the BookBot welcome banner, usage guidance and disclaimer.

  The banner tells the user they are talking to an AI system, lists the
  attributes the bot scores on, shows example queries, and states which
  requests will not be served. Nothing is returned; all output goes to stdout.
  """
  separator = "="*50
  # Example prompts shown to the user. The genre is spelled the way a query
  # would have to spell it to match a catalogue entry.
  example_queries = [
      "can you recommend a Physics book",
      "can you recommend a book for research",
      "can you recommend a book of price say 1000 INR",
      "can you recommend a book on Physics of price around 1500 INR",
      "can you recommend a book on Physics of price around 1500 INR and of 450 pages",
      "can you recommend a book on Physics of price around 1500 INR and of 450 pages published within 5 years",
  ]

  print("Welcome to BookBot! Ask me for book recommendations.\n")
  print("You are Interacting with an AI system that will help you to fetch books from specific catalogue and return top n matches\n")
  print("This bot scores your result basis these attributes :Genre,Published Year, Pages Count,Price :\n")
  print("For Faster and better results please ensure to include these attributes in your prompt.")
  print("Recommended way of interacting with this bot are as follows:\n")
  print(separator)
  for example in example_queries:
    print(example)
  print(separator)
  print("Any query out of scope of the Books Catalogue will not be served")
  print("Any unwarranted questions that endangers the society, unethical etc., will not be answered by the Bot")

**Gets Source Data and Does Necessary Formatting**

In [8]:
def getSourceData(path_books):
  """Load the book catalogue CSV and normalise the price column to integers.

  Args:
    path_books: Path of the catalogue CSV file (UTF-8, double-quoted fields).

  Returns:
    A DataFrame with every column of the file; 'Price (INR)' is converted to
    int with any thousands separators (",") removed.

  Raises:
    KeyError: if the file has no 'Price (INR)' column.
    ValueError: if a price cannot be converted to an integer.
  """
  col_Price='Price (INR)'
  df=pd.read_csv(path_books, quoting=csv.QUOTE_ALL,quotechar='"', encoding='utf-8')
  prices=df[col_Price]
  # When no price contains a comma, pandas already parses the column as a
  # number and the .str accessor would raise; only clean up text columns.
  if not pd.api.types.is_numeric_dtype(prices):
    prices=prices.astype(str).str.replace(',', '', regex=False).str.strip()
  df[col_Price]=prices.astype(int)
  return df

**Manages User Input and entire Chat flow**

In [9]:
def HelpMateAiBookBot():
  """Run the interactive BookBot chat loop.

  Loads the catalogue, prints the welcome banner, then repeatedly reads a
  query from the user. Each query is moderated, passed through the intent
  confirmation layer, scored, and the matching books are printed. The loop
  ends when the user types 'exit' or when a query is flagged by moderation.
  """
  _message_prompt="\nPlease enter your query , or type 'exit' to  quit:"
  _recommended_attributes="'Genre', 'Publication Year', 'Page Count', 'Price (INR)'"
  _message_flagged="Sorry, I cannot process this as this query seems inappropriate from Books Context (Message flagged)\n. Quitting."
  _const_Flagged='Flagged'
  df=getSourceData(path_books)
  initialize_conversation()

  while True:
      user_input = input(_message_prompt)
      # Handle the quit command before any API call is made, so 'exit' is
      # never moderated, embedded or scored as if it were a real query.
      if user_input.strip().lower() == "exit":
        print("Quitting...")
        break
      if not user_input.strip():
        # Nothing to search for; ask again instead of sending an empty query.
        continue
      print("Processing you Query='",user_input,"'...")
      if moderation_check(user_input) == _const_Flagged:
        # print() renders the newline in the message; display() on a plain
        # string would show its quoted repr instead.
        print(_message_flagged)
        break
      response_intent,retrieved_nodes=llamaindex_intent_confirmation_layer(user_input,df)
      scored_results=mapProductsByScoring(retrieved_nodes,user_input)
      # NOTE(review): a 'No' answer only prints a notice; scored matches are
      # still shown below. Confirm whether out-of-scope queries should stop here.
      if response_intent.response=='No':
        print("Sorry I cant process this, it may be out of scope")
      if len(scored_results)==0:
        print("Please reword your Prompt it may be missing the recommended attributes:{}".format(_recommended_attributes))
      else:
        filtered_df=information_extraction(scored_results,df)
        showProductRecommendation(filtered_df)

**Intent Clarity - Moderation Check**

**Checks whether the user input is unwarranted and, if so, returns a flag**
* Uses the `openai.moderations.create` API.
* Checks whether the user input contains any unwarranted content.
* Returns "Flagged" if it does, otherwise "Not Flagged".

In [10]:
def moderation_check(user_input):
    """Classify a user query with the OpenAI moderation endpoint.

    Args:
        user_input: The raw text typed by the user.

    Returns:
        "Flagged" when the first moderation result is flagged,
        otherwise "Not Flagged".
    """
    # Call the OpenAI API to perform moderation on the user's input.
    response = openai.moderations.create(input=user_input)
    # Only the first result is inspected; a single input string is submitted.
    flagged = response.results[0].flagged
    return "Flagged" if flagged else "Not Flagged"

**Intent Confirmation** - llamaindex_intent_confirmation_layer

* Reads the data source and creates one Document per catalogue row.
* Creates a VectorStoreIndex for querying based on the prompt.
* Configures the LLM with temperature zero to reduce hallucination.
* Templating – uses a binary prompt that returns "No" if the query is out of context for the catalogued data set.
* Executes the query based on the user prompt.
* Returns the query response and the retrieved nodes for scoring down the line.


In [11]:
def llamaindex_intent_confirmation_layer(prompt_query,df,similarity_top_k=10):
  """Index the catalogue and answer a Yes/No intent check for the query.

  Each catalogue row becomes one Document containing its 'Genre',
  'Publication Year', 'Page Count' and 'Price (INR)' values, one
  "column: value" pair per line. The documents are indexed, the query is run
  through a binary (Yes/No) prompt, and the nodes that were retrieved for
  that answer are returned for scoring.

  Args:
    prompt_query: The user's query text.
    df: Catalogue DataFrame containing the four columns listed above.
    similarity_top_k: Number of most similar entries to retrieve (default 10).

  Returns:
    A tuple (response, retrieved_nodes): the query engine response and the
    list of source nodes it was built from.
  """
  important_columns = ['Genre', 'Publication Year', 'Page Count', 'Price (INR)']
  # One document per row; the row index is kept as metadata.
  documents = [
      Document(
          text="\n".join(f"{col}: {row[col]}" for col in important_columns),
          metadata={"row": idx},
      )
      for idx, row in df.iterrows()
  ]
  # Create an index
  index = VectorStoreIndex.from_documents(documents)
  # Configure the LLM globally using Settings
  Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
  binary_prompt = PromptTemplate(
      "You are verifying product specifications based ONLY on the following context.\n"
      "If the information is found, answer 'Yes'.\n"
      "If the information is missing or unclear, answer 'No'.\n"
      "Be strictly based on context. Do not guess.\n"
      "Context:\n{context_str}\n\n"
      "Question: {query_str}\n"
      "Answer:"
  )

  query_engine = index.as_query_engine(
      text_qa_template=binary_prompt,
      similarity_top_k=similarity_top_k,  # how many of the most similar entries to search
  )
  # Query
  response = query_engine.query(prompt_query)
  # The response already carries the nodes retrieved for this query, so a
  # second retrieve() round-trip for the same query is not needed.
  retrieved_nodes = list(response.source_nodes)

  return response,retrieved_nodes





# Stage 2 - Product Mapping and Information Extraction

**Scoring - compute_match_score**
* Extracts each attribute value from the document text.
* Matches those values against the query for the set of important columns in the data ('Genre', 'Publication Year', 'Page Count', 'Price (INR)').
* Each matched attribute contributes 1, 2 or 3 points (1 being the weakest match); the returned score is the sum over all attributes.


In [12]:
def compute_match_score(doc_text, query_text,important_columns=('Genre', 'Publication Year', 'Page Count', 'Price (INR)')):
    """Score how well a catalogue entry matches a user query.

    doc_text is expected to hold one "attribute: value" pair per line. For
    every attribute in important_columns whose label appears in doc_text, the
    value is compared (case-insensitively) with the query:

    * 3 points - the full value occurs in the query
    * 2 points - every word of the value occurs in the query
    * 1 point  - at least one word of the value occurs in the query

    Numbers must match as whole numbers: a value of "150" does not match a
    query that contains "1500". Text values keep substring matching, so
    "thriller" still matches "thrillers".

    Args:
        doc_text: Text of the catalogue entry.
        query_text: The user's query.
        important_columns: Attribute labels to score on.

    Returns:
        The sum of the points over all attributes (0 when nothing matches).
    """
    import re  # local import so this cell does not depend on the import cell

    doc_lower = doc_text.lower()
    query_lower = query_text.lower()

    def _occurs_in_query(term):
        # A term that starts/ends with a digit must not be glued to further
        # digits in the query; otherwise this is a plain substring search.
        prefix = r'(?<!\d)' if term[0].isdigit() else ''
        suffix = r'(?!\d)' if term[-1].isdigit() else ''
        return re.search(prefix + re.escape(term) + suffix, query_lower) is not None

    total_score = 0
    for attr in important_columns:
        attr_lower = attr.lower()
        label_pos = doc_lower.find(attr_lower)
        if label_pos == -1:
            continue

        # The value runs from the end of the label to the end of that line;
        # searching for the newline from the label end keeps an empty value
        # from spilling into the next line.
        value_start = label_pos + len(attr_lower)
        line_end = doc_lower.find('\n', value_start)
        if line_end == -1:
            line_end = len(doc_lower)
        attr_value = doc_lower[value_start:line_end].lstrip(' :').strip()
        if not attr_value:
            continue

        words = attr_value.split()
        # 3 points → Strong match: full attribute value found
        if _occurs_in_query(attr_value):
            total_score += 3
        # 2 points → Medium match: all keywords from attribute found separately
        elif all(_occurs_in_query(word) for word in words):
            total_score += 2
        # 1 point → Weak match: any keyword from attribute found
        elif any(_occurs_in_query(word) for word in words):
            total_score += 1
    return total_score

**Product Mapping**
* Using the retrieved nodes from the LlamaIndex search query and the user's query, calculates a score with the compute_match_score method explained above.
* If the score meets or exceeds the defined threshold, the entry is added to the list of scored results.
* The scored results are returned as a DataFrame for product information extraction.


In [13]:
def mapProductsByScoring(retrieved_nodes,query,score_threshold=2):
  """Score the retrieved nodes against the query and keep the good matches.

  Args:
    retrieved_nodes: Iterable of retrieval results; the entry text is read
      from `node.node.text` and is expected to hold "key: value" lines.
    query: The user's query text.
    score_threshold: Minimum compute_match_score value for an entry to be
      kept (default 2).

  Returns:
    A DataFrame with one row per kept entry: a 'Score' column plus one column
    per "key: value" line of the entry text (values are strings). The frame
    is empty when no entry reaches the threshold.
  """
  records = []
  for node in retrieved_nodes:
      text = node.node.text
      match_score = compute_match_score(text, query)
      if match_score < score_threshold:
        continue
      item = {"Score": match_score}
      for line in text.split("\n"):
        # Split on the first colon only, so a value that itself contains ":"
        # is kept whole; lines without a colon carry no pair and are skipped.
        key, separator, value = line.partition(":")
        if not separator:
          continue
        item[key.strip()] = value.strip()
      records.append(item)

  return pd.DataFrame(records)

**information_extraction**
* The Scored Result Dataframe is JOINED with the original dataframe to return the Extracted product Information


In [14]:
def information_extraction(dfScoredResult,df):
  """Join the scored matches back onto the catalogue to get full book rows.

  Args:
    dfScoredResult: Scored matches with 'Score', 'Genre', 'Publication Year',
      'Page Count' and 'Price (INR)' columns (values may be strings). It is
      not modified.
    df: The full catalogue DataFrame.

  Returns:
    The catalogue rows whose four key columns match a scored entry, with the
    'Score' column appended. Empty when dfScoredResult is empty.
  """
  key_columns = ['Genre', 'Publication Year', 'Page Count', 'Price (INR)']

  if len(dfScoredResult) == 0:
    # Nothing was scored: return an empty frame with the same columns a
    # successful join would produce.
    empty = df.head(0).copy()
    empty['Score'] = pd.Series(dtype='int64')
    return empty

  # astype() returns a new frame, so the caller's scored frame keeps its
  # original dtypes. Identical key combinations are collapsed first because
  # each duplicate would repeat every matching catalogue row in the join.
  scored = (
      dfScoredResult
      .astype({'Publication Year': int, 'Page Count': int, 'Price (INR)': float})
      .drop_duplicates(subset=key_columns)
  )

  new_df = pd.merge(
    left=df,
    right=scored,
    how='inner',
    left_on=key_columns,
    right_on=key_columns,
   )
  return (new_df)


# Stage 3 - Product Recommendation

---




**The extracted product information is displayed to the user as a prompt response in human-readable form, not as JSON or a table.**

In [15]:
def showProductRecommendation(filtered_df):
  """Print every recommended book as a short, human-readable block.

  Args:
    filtered_df: DataFrame with 'Book Title', 'Genre', 'Publication Year',
      'Price (INR)' and 'Page Count' columns; one block is printed per row,
      followed by a divider line.
  """
  # (label, column) pairs in the order they are shown for each book.
  field_layout = (
      ("Title :", 'Book Title'),
      ("Genre:", 'Genre'),
      ("Published in the Year:", 'Publication Year'),
      ("Priced at :", 'Price (INR)'),
      ("of Pages :", 'Page Count'),
  )
  divider = "="*50

  print("Here's your current recommendation basis your prompt:")
  for _, book in filtered_df.iterrows():
    print("".join(label + str(book[column]) + " \n " for label, column in field_layout))
    print(divider)
  print("Query again as recommended for more")

In [16]:
#debugging llamaindex_intent_confirmation_layer
# Debugging: run one sample query through retrieval, scoring, extraction and display
query="I need a book on Data Science"
df=getSourceData(path_books)
r1,r2=llamaindex_intent_confirmation_layer(query,df)
#print(r1.response)
#print(r2)
scored_results=mapProductsByScoring(r2,query)
#print(scored_results)
# Only extract and display when something was scored; with no scored rows
# there is nothing to join against the catalogue.
if len(scored_results)>0:
  extracted_data=information_extraction(scored_results,df)
  showProductRecommendation(extracted_data)
else:
  print("No catalogue entry reached the score threshold for this query")



Here's your current recommendation basis your prompt:
Title :Data Science in Practice 
 Genre:Data Science 
 Published in the Year:2019 
 Priced at :2100 
 of Pages :500 
 
Title :Introduction to Data Science 
 Genre:Data Science 
 Published in the Year:2022 
 Priced at :2000 
 of Pages :550 
 
Title :Data Science for Business 
 Genre:Data Science 
 Published in the Year:2020 
 Priced at :2300 
 of Pages :650 
 
Title :R for Data Science 
 Genre:Data Science 
 Published in the Year:2021 
 Priced at :1700 
 of Pages :400 
 
Title :Machine Learning Yearning 
 Genre:Data Science 
 Published in the Year:2021 
 Priced at :1100 
 of Pages :150 
 
Title :Data-Driven Science and Engineering 
 Genre:Data Science 
 Published in the Year:2019 
 Priced at :2100 
 of Pages :550 
 
Query again as recommended for more


# Entry Point

In [None]:
# Start the interactive BookBot session. The call reads queries with input()
# and returns once the user types 'exit' or a query is flagged by moderation.
HelpMateAiBookBot()