In [1]:
import os

In [2]:
from dotenv import load_dotenv, find_dotenv
# Populate os.environ from the ".env" file (path is relative to the working
# directory). The boolean result is bound to `_` so the cell renders no output.
# Presumably this supplies the OpenAI credentials used further down - confirm.
_ = load_dotenv(dotenv_path=".env")

In [3]:
# Location of the PDF to index. The path can be overridden without editing the
# notebook by setting QA_DOCUMENT_PATH (in the shell or in ".env"); when the
# variable is absent the original hard-coded location is used unchanged.
document_path = os.environ.get(
    "QA_DOCUMENT_PATH",
    "/Users/vishalverma/Vishal/Learning - Fractal/GenAi/docs/Doc A - MDL-785_0.pdf",
)

In [4]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.linalg import norm
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings

In [5]:
def cosine_similarity(vec1,vec2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length; anything accepted by ``np.dot`` and
        ``numpy.linalg.norm``.

    Returns
    -------
    scalar
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector
        has zero norm the direction is undefined and ``0.0`` is returned
        instead of ``nan``.
    """
    denominator = norm(vec1) * norm(vec2)
    # Guard the division: a zero-norm input would otherwise produce nan along
    # with a RuntimeWarning, which silently poisons any downstream comparison.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [6]:
# Sentence encoder used only to score the generated answer against the
# reference answer (see the similarity cell at the end of the notebook).
# NOTE(review): the upstream model card reportedly marks this checkpoint as
# deprecated - confirm, and remember that swapping it changes every score.
sbert_model = SentenceTransformer(model_name_or_path="bert-base-nli-mean-tokens")

In [7]:
# Read the PDF found at `document_path` through the PyMuPDF-backed loader;
# `documents` holds whatever list the loader's load() call returns.
loader1 = PyMuPDFLoader(file_path=document_path)
documents = loader1.load()

In [8]:
# Break the loaded documents into smaller chunks for embedding and retrieval.
# NOTE(review): CharacterTextSplitter splits on a single separator, so chunks
# may exceed chunk_size if the PDF text rarely contains it - confirm; the
# already-imported RecursiveCharacterTextSplitter is the usual alternative,
# but switching would change which passages get retrieved.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=10,
)
docs = text_splitter.split_documents(documents=documents)

In [9]:
# Reference question / expected-answer pair used to score the QA chain output.
test_case = dict(
    question="What is the purpose of the act?",
    answer="the purpose of the act is to establish regulations and guidelines for credit for reinsurance, including requirements for collateralization of unauthorized reinsurance recoverable and the establishment of trust funds as security for insureds, claimants, ceding insurers, assuming insurers, and the public. It also includes provisions for the control and distribution of funds by U.S. regulators and applies to all cessions under reinsurance agreements with an inception, anniversary, or renewal date not less than six months after the effective date of the act.",
)

In [10]:
# Embedding client built with library defaults; it is handed to the Chroma
# vector store below to embed the document chunks.
# Presumably picks up its API key from the environment loaded via .env - confirm.
embeddings = OpenAIEmbeddings()

In [11]:
# Build the Chroma vector store from the chunked documents, embedding each
# chunk with the `embeddings` client created above.
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [12]:
# "gpt-3.5-turbo" is a chat-completions model, so it must be driven through
# the chat wrapper (ChatOpenAI, already imported at the top of the notebook)
# rather than the completion-style `OpenAI` class, which targets the legacy
# completions endpoint and warns or fails for chat models.
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name, temperature=0.2)

# "stuff" chain: all retrieved chunks are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")



In [18]:
# Ask exactly the question the reference answer in `test_case` was written
# for, so the query and the expected answer cannot drift apart.
query = test_case["question"]

# Retrieve the chunks most similar to the query, then let the QA chain answer
# from those chunks only.
relevant_docs = db.similarity_search(query)
answer = chain.run(input_documents=relevant_docs, question=query)
answer

'The purpose of the act is to protect the interest of insureds, claimants, ceding insurers, assuming insurers, and the public generally, and to ensure adequate regulation of insurers and reinsurers and adequate protection for those to whom they owe obligations. It also mandates that upon the insolvency of a non-U.S. insurer or reinsurer that provides security to fund its U.S. obligations in accordance with this Act, the assets representing the security shall be maintained in the United States and claims shall be filed with and valued by the state insurance commissioner with regulatory oversight, and the assets shall be distributed, in accordance with the insurance laws of the state in which the trust is domiciled that are applicable to the liquidation of domestic U.S. insurance companies.'

In [15]:
# Embed the generated answer and the reference answer with the same sentence
# encoder, then score how close they are with cosine similarity.
predicted_vectore = sbert_model.encode(answer)
expected_ans_vector = sbert_model.encode(test_case["answer"])

similarity = cosine_similarity(
    predicted_vectore,
    expected_ans_vector,
)
print(f"similarity : {similarity}")

similarity : 0.8110518455505371
