## Install and import modules/libraries

In [3]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [6]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import nltk

In [7]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [8]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/varunraghav/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# price is a factor for our company, so we're going to use a low cost model

MODEL = "gpt-4o-mini"
db_name = "polity_db"

In [10]:
# Load environment variables in a file called .env

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

## Load File and Split it into Chunks

In [11]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

pdf_path = 'Indian_polity.pdf'  # Update with your file path
loader = PyPDFLoader(pdf_path)

documents = loader.load()


In [12]:
# Let's take a look at the 10th page of extracted text
print(documents[100].page_content)

of seats for the scheduled caste s and scheduled tribes to ensure adequate
representation to them.
12. Universal Adult Franchise
The Indian Constitution adopt s universal adult franchise as a basis of
elections to the Lok Sabha and the state legislative assemblies. Every
citizen who is not less than 18 years of age has a right to vote without any
discrimination of caste, race, religion, sex, literacy , wealth and so on. The
voting age was reduced to 18 years from 21 years in 1989 by the 61st
Constitutional Amendment Act of 1988.
The introduction of universal adult franchise by the Constitution-makers
was a bold experiment and highly remarkable in view of the vast size of the
country , its huge population, high poverty , social inequality and
overwhelming illiteracy .14
Universal adult franchise make s democracy broad-based, enhances the
self-respect and prestige of the common people, upholds the principle of
equality , enables minorities to protect their interests and opens up new
hope

In [13]:
from langchain.text_splitter import CharacterTextSplitter

# Use a text splitter to split the document into manageable chunks. Chunk size has been kept at manageable levels
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Display the 10th chunk for reference
print(chunks[10])


page_content='Preface to the
Sixth Edition
I am pleased to place before the readers a thoroughly revised, enlar ged and
updated edition of this widely read book on Indian Polity .
In 2011  and 2013, the UPSC changed the pattern and syllabus of the
preliminary and main examinations, respectively . Both times, the scope of
Indian Polity has been consider ably increased. Hence, this new edition of
the book is more relevant now and is aimed to meet the expanded needs of
the aspirants.
In the course of revision and updation of this edition of the book, various
new developments related to the subject, like recent constitutional
amendments, parliamentary legislations, executive decisions and supreme
court judgments, have been taken into account.
Changes in this Edition:
1.  Addition of 6 new chapters.
2.  Inclusion of 2017, 2018 and 2019 preliminary questions with
answers.
3.  Inclusion of 2016, 2017, 2018 and 2019 mains questions.
4.  Updation of the year-wise break-up of the UPSC questions 

## Create Embedding, Vectorstore and Setup RAG

In [14]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLLite

embeddings = OpenAIEmbeddings()

# Delete if already exists

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 1546 documents


In [15]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [16]:
# Let's try a simple question

query = "Please explain what fundamental rights is in a couple of sentences"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Fundamental Rights are a set of rights guaranteed by the Constitution that ensure individual freedom and equality for all persons, protecting them from arbitrary actions by the state and private individuals. They are enshrined in Part III of the Constitution and include rights such as the right to equality, freedom, and protection against exploitation, among others.


## Setup Chat Function

#### Using both conventional LLM 


1. The user's input (question) is tokenized into individual words. Stop words (common words like "the", "is", etc.) are filtered out using the NLTK stopwords corpus. The filtered query is then re-embedded into a vector space using the embeddings.embed_query(filtered_query) method.
2. Both the query and document embeddings are normalized to unit length
3. A similarity threshold is set at 0.8. If the maximum similarity score between the query and any document is less than 0.8, the chatbot assumes that the query is not content-specific and sends it directly to the general LLM for a response.
4. If the similarity score is greater than or equal to 0.8, the chatbot assumes that the query is content-based and uses the Conversational Retrieval Chain (RAG) to generate the answer by retrieving relevant content from the vector store.

##### This method ensures that casual or less content-relevant queries are handled by a general LLM, while more specific queries that match document content are handled using the retrieval mechanism.

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def chat(question, history):
    # Threshold for similarity score to determine if query is content-based
    similarity_threshold = 0.8

    
    stop_words = set(stopwords.words('english'))
    
    # Tokenize the query
    query_tokens = word_tokenize(question.lower())
    
    # Filter out stop words
    filtered_query = ' '.join([word for word in query_tokens if word not in stop_words])
    
    # Re-embed the filtered query
    query_embedding = embeddings.embed_query(filtered_query)
    
    # Get embeddings for all documents
    all_docs = vectorstore._collection.get()

    doc_embeddings = vectorstore._collection.get(include=['embeddings'])['embeddings']

    print(doc_embeddings[5])

    # Normalize query embedding
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    
    # Normalize document embeddings
    doc_embeddings = [doc_embedding / np.linalg.norm(doc_embedding) for doc_embedding in doc_embeddings]
    
    # Calculate cosine similarity between query and document embeddings
    similarities = [np.dot(query_embedding, doc_embedding) for doc_embedding in doc_embeddings]

    # Check if maximum similarity score exceeds threshold
    max_similarity = max(similarities)
    print(max_similarity)
    if max_similarity < similarity_threshold:
        result = llm.invoke(question)
        return result.content  # Access the content of the AIMessage
    else:
        # Content-based query, use ConversationalRetrievalChain
        print("entered rag")
        result = conversation_chain.invoke({"question": question})
        return result["answer"]  # This remains the same for the RAG path


In [18]:
view = gr.ChatInterface(chat).launch()

Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.


--------
