In [2]:
import streamlit as st
from dotenv import load_dotenv
import os
from htmlTemplates import css, bot_template, user_template
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import PyPDF2
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from langchain_community.llms import WatsonxLLM
from elasticsearch import Elasticsearch
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util


In [3]:
# Load variables from a local .env file into the process environment; the
# functions in this notebook read their Elasticsearch and watsonx settings
# from os.environ.
load_dotenv()

# Model id handed to ElasticsearchStore.SparseVectorRetrievalStrategy.
es_model_id = '.elser_model_2_linux-x86_64'
# Elasticsearch index the document chunks are written to and queried from.
index_name = "elser_index_vb_13"
# watsonx.ai model id and endpoint URL passed to WatsonxLLM.
llm_model_id = "meta-llama/llama-2-13b-chat"
wx_url = "https://us-south.ml.cloud.ibm.com"
# Project id passed to WatsonxLLM. WATSONX_PROJECT_ID (environment or .env)
# takes precedence so the notebook can target another project without a code
# edit; the previously hard-coded value remains the fallback.
wx_project_id = os.environ.get("WATSONX_PROJECT_ID", "b33db82c-437e-4d87-8b9c-719e9919003e")


In [12]:
# Instruction prompt in Llama-2 [INST] format with a single {question} placeholder.
# The apostrophe in "don't" used to be written as shell-style escaping
# (backslash + three quotes), which ended up verbatim in the text sent to the model.
# NOTE(review): the prompt states two different limits (150 words / 2-3 sentences,
# then 200 words) - confirm which one is intended.
template = """[INST]You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Be brief in your answers. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.If you don't know the answer to a question, please do not share false information. \n Answer with no more than 150 words, in 2 or 3 sentences. If you cannot base your answer on the given document, please state that you do not have an answer.\n\n{question} Answer with no more than 200 words. If you cannot base your answer on the given document, please state that you do not have an answer. do not include a question in your response. dont prompt to make select correct answers[/INST]"""

In [4]:

def prepare_docs(pdf_docs):
    """Extract the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources that ``PyPDF2.PdfReader`` can open.
            The notebook passes file-path strings; objects exposing a ``name``
            attribute (e.g. ``pathlib.Path`` or an uploaded file) are labelled
            with that name.

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists with one entry
        per page, in reading order: ``content[i]`` is the extracted page text
        (``""`` when extraction yields nothing) and ``metadata[i]`` is
        ``{"title": "<source> page <n>"}`` with ``n`` counted from 1.
    """
    content = []
    metadata = []

    for pdf in pdf_docs:
        # Build the label via str() so non-string sources do not raise
        # TypeError on concatenation; plain strings are used unchanged.
        source_name = str(getattr(pdf, "name", pdf))
        pdf_reader = PyPDF2.PdfReader(pdf)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Normalise a missing extraction result to "" so the text splitter
            # downstream always receives strings.
            content.append(page.extract_text() or "")
            metadata.append({"title": f"{source_name} page {page_number}"})

    return content, metadata



In [5]:
def get_text_chunks(content, metadata, chunk_size=512, chunk_overlap=256):
    """Split page texts into overlapping chunks for indexing.

    Args:
        content: List of text strings to split.
        metadata: List of metadata dicts, parallel to ``content``; each chunk
            is created with the metadata of the text it came from.
        chunk_size: Maximum chunk length handed to the tiktoken-based
            splitter. Defaults to 512, the value previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 256,
            the value previously hard-coded.

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.create_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    # Progress feedback for the notebook reader.
    print(f"Split documents into {len(split_docs)} passages")
    return split_docs


In [6]:
def ingest_and_get_vector_store(split_docs):
    """Index the chunks in Elasticsearch and return the store that holds them.

    Connection settings come from the ``elastic_search_url`` and
    ``elastic_search_api_key`` environment variables; the target index and the
    sparse-retrieval model id come from the module-level ``index_name`` and
    ``es_model_id``.

    Args:
        split_docs: Documents to ingest (the output of ``get_text_chunks``).

    Returns:
        The ``ElasticsearchStore`` created by ``from_documents``.

    Raises:
        KeyError: If either environment variable is not set.
    """
    # from_documents is a classmethod that builds the store and ingests in one
    # step, so a separately constructed store (and second client) is unnecessary.
    return ElasticsearchStore.from_documents(
        split_docs,
        es_url=os.environ["elastic_search_url"],
        es_api_key=os.environ["elastic_search_api_key"],
        index_name=index_name,
        strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=es_model_id),
    )


In [7]:

def get_conversation_chain(vector_store):
    """Assemble a conversational retrieval chain backed by a watsonx LLM.

    The LLM is configured from the module-level ``llm_model_id``, ``wx_url``
    and ``wx_project_id`` plus the ``WATSONX_APIKEY`` environment variable.
    Chat history is kept in a ``ConversationBufferMemory`` and the chain is
    asked to return the retrieved source documents alongside the answer.

    NOTE(review): the module-level ``template`` is supplied as the
    condense-question prompt; it is not passed to the answer-generation step
    here - confirm this wiring is intended.

    Args:
        vector_store: Store exposing ``as_retriever()``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    # Sampling-based decoding settings for the watsonx model.
    generation_params = {
        GenParams.DECODING_METHOD: "sample",
        GenParams.MAX_NEW_TOKENS: 100,
        GenParams.MIN_NEW_TOKENS: 1,
        GenParams.TEMPERATURE: 0.5,
        GenParams.TOP_K: 50,
        GenParams.TOP_P: 1,
    }
    llm = WatsonxLLM(
        model_id=llm_model_id,
        url=wx_url,
        project_id=wx_project_id,
        params=generation_params,
        apikey=os.environ["WATSONX_APIKEY"],
    )

    condense_prompt = PromptTemplate.from_template(template)
    # output_key='answer' tells the memory which of the chain's outputs to
    # record, since the chain also returns source documents.
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(),
        condense_question_prompt=condense_prompt,
        memory=chat_memory,
        return_source_documents=True,
    )


In [8]:

# Loaded embedding models keyed by model name, so repeated validations reuse
# the same instance instead of reloading it on every call.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def validate_answer_against_sources(response_answer, source_documents, similarity_threshold=0.5):
    """Check whether an answer is semantically close to any source document.

    The answer and each document's ``page_content`` are embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer and compared by cosine similarity.

    Args:
        response_answer: Generated answer text.
        source_documents: Documents exposing a ``page_content`` attribute.
        similarity_threshold: Score a document must strictly exceed for the
            answer to count as supported. Defaults to 0.5.

    Returns:
        True if at least one document scores above the threshold; False
        otherwise, including when the answer is blank or there are no documents.
    """
    # Nothing to compare: treat as unsupported without loading the model.
    if not source_documents or not response_answer or not response_answer.strip():
        return False

    model = _get_embedding_model('all-MiniLM-L6-v2')
    source_texts = [doc.page_content for doc in source_documents]

    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

    # Row 0 holds the answer's similarity to each source document.
    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)

    return any(score.item() > similarity_threshold for score in cosine_scores[0])



In [9]:
# Run the ingestion pipeline on the sample PDF (path relative to the working
# directory): extract per-page text and titles, then split the text into chunks.
pdf_docs=["Industry accelerators - IBM Documentation.pdf"]
content, metadata = prepare_docs(pdf_docs)
split_docs = get_text_chunks(content, metadata)

Split documents into 3 passages


In [10]:
# Ingest the chunks into Elasticsearch and keep the store for retrieval.
vectorstore = ingest_and_get_vector_store(split_docs)

In [13]:
# Build the conversational retrieval chain on top of the vector store.
conversation_chain=get_conversation_chain(vectorstore)

In [17]:
# Question sent to the chain in the next cell.
user_question = "who is vikram bhat?"

In [18]:
# Ask the question. The result dict carries 'answer' and, because the chain is
# built with return_source_documents=True, 'source_documents'.
# .invoke() is used because calling the chain object directly is deprecated
# in LangChain.
response = conversation_chain.invoke({"question": user_question})
print(response['answer'])

   Vikram Bhat is not mentioned in the provided document, therefore, there is no answer.


In [16]:
# Post-process the chain output: keep the generated answer only when it is
# semantically close to at least one retrieved source document.
# A single fallback message covers both the "no sources" and the
# "answer not supported by the sources" cases so they always read the same.
FALLBACK_ANSWER = "Sorry, I cannot answer the question based on the given documents"

response_answer = response['answer']
source_docs = response['source_documents']

# Short-circuit: with no retrieved sources there is nothing to validate against.
is_valid_answer = bool(source_docs) and validate_answer_against_sources(response_answer, source_docs)
if not is_valid_answer:
    response['answer'] = FALLBACK_ANSWER

print(response['answer'])

Sorry I can not answer the question based on the given documents
