In [2]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from rouge import Rouge

In [4]:
# Read the whole knowledge document into memory. The context manager closes
# the file handle as soon as the read finishes (the original left it open).
with open("KnowledgeDocument(pan_card_services).txt", "r") as fileObject:
    data = fileObject.read()

In [5]:
################ Information retrieval (IR) system improvement by tuning text splitting ######################
###### After testing different chunk_size and chunk_overlap params, I found 1500 and 500 to be the most
###### effective at retaining information in each chunk, as seen in the result ###############################

def get_text_chunks(text,size=1500,overlap=500):
    """Split ``text`` into overlapping chunks on newline boundaries.

    Args:
        text: The full document text to split.
        size: Passed to the splitter as ``chunk_size`` (measured with ``len``).
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``CharacterTextSplitter.split_text`` for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": size,
        "chunk_overlap": overlap,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

# Chunk the knowledge document, then print every chunk with a ruled divider
# between consecutive chunks so the split points can be inspected by eye.
chunks = get_text_chunks(data)
print(*chunks, sep="\n\n _____________________________________________________________________________________\n\n")

# About Pan Card

### What is Pan card?

The PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.

### Who needs a Pan card?

All individuals/non-individuals (including foreign citizens/entities) earning taxable income in India must have a PAN card.

### Types of PAN cards

In India, two types of PAN cards are available: e-PAN card and physical PAN card.

1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the same PAN details as a physical PAN card but is available in a digital format. It can be downloaded online and used as a valid identification document for various purposes. The e-PAN card is usually issued in a PDF format.
2. Physical PAN card: A

In [6]:
# Load environment variables (including the OpenAI API key) from a .env file
# so the key never has to be hard-coded in the notebook.
load_dotenv()

def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` with ``OpenAIEmbeddings`` and index them in FAISS.

    Args:
        text_chunks: The texts to embed and store.

    Returns:
        The vector store produced by ``FAISS.from_texts``.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())

# Build the index over the document chunks once; later cells reuse this
# module-level ``vectorstore`` for scoring and for the retrieval chains.
vectorstore = get_vectorstore(chunks)

In [10]:
####### Detecting when our Information Retrieval returns 0 documents ############################
#
# This is done by comparing the similarity scores for the questions provided in the
# sample (relevant to the task of the chatbot) against the scores for irrelevant
# questions (generic questions that are in no way related to the knowledge document).


def _collect_similarity_scores(store, questions):
    """Return one list of similarity scores per question.

    Args:
        store: Object exposing ``similarity_search_with_score(query)``, which
            returns a sequence of ``(document, score)`` pairs.
        questions: Iterable of query strings.

    Returns:
        A list with one inner list per question, holding the score of every
        hit in the order the store returned them.
    """
    return [
        [hit[1] for hit in store.similarity_search_with_score(question)]
        for question in questions
    ]


def _report_score_range(label, scores):
    """Print the overall minimum and maximum of a DataFrame of scores.

    The reduction runs over every column. Reading column 0 for the minimum
    and column 3 for the maximum is only correct when every query yields at
    least four hits ordered by ascending score, and it raises ``KeyError``
    when fewer than four hits come back.
    """
    print("Minimum score observed in the", label, "questions is", scores.min().min(),
          "and Maximum is", scores.max().max())


# checking the similarity score for given sample questions
sample_questions = pd.read_excel('SampleQuestions.xlsx')

print("checking the similarity score for given sample questions")

allscores1 = _collect_similarity_scores(vectorstore, sample_questions['Question'])
allrelevantscores = pd.DataFrame(allscores1)
_report_score_range("sample", allrelevantscores)

# calculating the scores for the irrelevant questions
irrelevant_questions = pd.read_excel('irrelevant_questions.xlsx')

print("checking the similarity score for irrelevant questions")

allscores = _collect_similarity_scores(vectorstore, irrelevant_questions['questions'])
irrelevant_scores = pd.DataFrame(allscores)
_report_score_range("irrelevant", irrelevant_scores)

checking the similarity score for given sample questions
Minimum score observed in the sample questions is 0.1529625 and Maximum is 0.44510525
checking the similarity score for irrelevant questions
Minimum score observed in the irrelevant questions is 0.5810295 and Maximum is 0.72412884


In [3]:
# Three candidate QA prompts. Each one exposes the same two placeholders,
# {context} (retrieved document text) and {question}.

# Prompt 1: PAN-card assistant persona with a fixed "not sure" fallback and an
# explicit instruction to decline questions that are not about PAN cards.
template1 = """
You are an AI assistant for answering questions about PAN card in English.
You are given the following extracted parts of a long document and a question.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about pan card, politely inform them that you are tuned to 
only answer questions about pan card.

CONTEXT:
{context}
=========
QUESTION: {question}

"""

# Prompt 2: minimal follow-up style prompt; context first, then the question,
# then the "don't make up an answer" instruction.
template2 = """
Given the following context and a follow up question, answer in English
CONTEXT: 
{context}

Follow Up Input: {question}
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Please answer the following question using the context provided.
Your answer:
"""

# Prompt 3: same instructions wrapped in <|SYSTEM|>/<|USER|>/<|ASSISTANT|> role markers.
# NOTE(review): these markers are sent as plain text inside the prompt; confirm
# they are intended for the ChatOpenAI models this notebook evaluates.
template3= """
<|SYSTEM|>
- You are a helpful, polite, fact-based agent for answering questions about pan card. 
- Your answers include enough detail for someone to follow through on your suggestions. 
<|USER|>
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Please answer the following question using the context provided in English. 

CONTEXT: 
{context}
=========
QUESTION: {question} 
ANSWER: <|ASSISTANT|>
"""
# Wrap each raw template in a PromptTemplate. The list position is the prompt
# index that get_conversation_chain receives as ``i``.
QA_PROMPT = [
    PromptTemplate(template=prompt_text, input_variables=["question", "context"])
    for prompt_text in (template1, template2, template3)
]

In [7]:
def get_conversation_chain(llm,vectorstore,i):
    """Create a conversational retrieval chain with its own fresh memory.

    Args:
        llm: The language model handed to ``ConversationalRetrievalChain.from_llm``.
        vectorstore: Store whose ``as_retriever()`` supplies the retriever.
        i: Index into the module-level ``QA_PROMPT`` list selecting the prompt
            used when combining the retrieved documents.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    # A new buffer per chain, so no chat history is shared between chains.
    chat_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT[i]},
    )


In [8]:
# Candidate chat models to compare; they differ only in sampling temperature.
llm = [ChatOpenAI(temperature=temp) for temp in (0.7, 0.5)]

In [11]:
# Answer every sample question with every (llm, prompt) combination.
# compiled_answers1 receives one list of answers per combination, with the
# llm as the outer loop and the prompt as the inner loop.
compiled_answers1 = []
question_column = sample_questions['Question']
for model in llm:
    for prompt_index, _ in enumerate(QA_PROMPT):
        answers_for_combo = []
        for row in range(sample_questions.shape[0]):
            # A new chain per question, so each question starts with empty memory.
            chain = get_conversation_chain(model, vectorstore, prompt_index)
            reply = chain({'question': question_column[row]})
            answers_for_combo.append(reply['answer'])
        compiled_answers1.append(answers_for_combo)

In [13]:
# Score each (llm, prompt) combination's answers against the reference answers.
rouge = Rouge()
human_baseline=np.array(sample_questions['Ideal Answer'])

# One averaged ROUGE result per entry of compiled_answers1, in the same order.
# Iterating over the list itself (instead of a hard-coded range(0, 6)) keeps
# this cell correct when models or prompts are added or removed.
# The name keeps its original spelling ("rogue") because a later cell reads it.
rogue_values = [
    rouge.get_scores(np.array(answers), human_baseline, avg=True)
    for answers in compiled_answers1
]


In [14]:
# Show the averaged ROUGE results: one entry per (llm, prompt) combination,
# in the order compiled_answers1 was filled (llm outer loop, prompt inner loop).
rogue_values

[{'rouge-1': {'r': 0.6390414059979216,
   'p': 0.710323441242082,
   'f': 0.6433658306321098},
  'rouge-2': {'r': 0.5162173906613079,
   'p': 0.5822512309630012,
   'f': 0.5185260783160213},
  'rouge-l': {'r': 0.6232599117687422,
   'p': 0.6911300734220012,
   'f': 0.6274397413655285}},
 {'rouge-1': {'r': 0.5563686481860747,
   'p': 0.692493914803144,
   'f': 0.583993027891705},
  'rouge-2': {'r': 0.42537604547190555,
   'p': 0.5303019132740581,
   'f': 0.4404712529874416},
  'rouge-l': {'r': 0.5242390460854863,
   'p': 0.6599527939611954,
   'f': 0.5532342348868652}},
 {'rouge-1': {'r': 0.7027433858425861,
   'p': 0.6067668705846492,
   'f': 0.6264468978647415},
  'rouge-2': {'r': 0.533677826489152,
   'p': 0.4726569341697459,
   'f': 0.47916847430333076},
  'rouge-l': {'r': 0.681298964166118,
   'p': 0.5923067586866915,
   'f': 0.6099929146310835}},
 {'rouge-1': {'r': 0.6145818630836734,
   'p': 0.7157558069052742,
   'f': 0.6255867046770297},
  'rouge-2': {'r': 0.5042908394732502,
 