In [1]:
import os

In [2]:
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the local ".env" file. The return value is
# bound to `_` because it is not used afterwards.
# NOTE(review): presumably this supplies the API credentials used by the
# OpenAI / HuggingFaceHub clients below — confirm the expected variable names.
_ = load_dotenv(".env")

In [3]:
# Location of the PDF that every model below is evaluated on.
# Set DOCUMENT_PATH (in the environment or in .env, which is loaded above) to
# run the notebook on another machine; the original local path stays the default.
document_path = os.getenv(
    "DOCUMENT_PATH",
    "/Users/vishalverma/Vishal/Learning - Fractal/GenAi/docs/Doc A - MDL-785_0.pdf",
)

In [4]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.linalg import norm

In [5]:
def cosine_similarity(vec1,vec2):
    """Return the cosine similarity of two vectors.

    Computed as dot(vec1, vec2) / (norm(vec1) * norm(vec2)).

    Args:
        vec1: First vector (array-like accepted by ``np.dot`` / ``norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to nan
        with a divide-by-zero warning).
    """
    denominator = norm(vec1)*norm(vec2)
    if denominator == 0:
        # No direction to compare against; report "no similarity" instead of nan.
        return 0.0
    return np.dot(vec1,vec2)/denominator

In [6]:
# Sentence encoder used only for scoring: the evaluation cells below encode the
# expected answer and each model's answer with it and compare the two vectors
# with cosine_similarity.
# NOTE(review): this checkpoint appears to be marked as deprecated on its model
# card — confirm before relying on the absolute similarity values.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [7]:
## Load the PDF at document_path.
## `documents` feeds the text splitter in the next cell; `loader1` itself is
## passed again to VectorstoreIndexCreator in the Model 4 section.
loader1 = PyMuPDFLoader(document_path)
documents = loader1.load()

In [8]:
# Split the loaded documents into chunks (chunk_size 700, chunk_overlap 50).
# `docs` is the chunk list indexed by the FAISS and Chroma stores below.
text_splitter = CharacterTextSplitter(chunk_size = 700,chunk_overlap = 50)
docs = text_splitter.split_documents(documents)

In [9]:
# Single evaluation case: the question asked of every model and the reference
# answer each model's output is scored against.
# Only test_case['answer'] is read by the scoring cells; the query cells
# re-type the question string instead of reading test_case['question'].
test_case = {'question' : "What is the purpose of the act?",
            "answer" : "the purpose of the act is to establish regulations and guidelines for credit for reinsurance, including requirements for collateralization of unauthorized reinsurance recoverable and the establishment of trust funds as security for insureds, claimants, ceding insurers, assuming insurers, and the public. It also includes provisions for the control and distribution of funds by U.S. regulators and applies to all cessions under reinsurance agreements with an inception, anniversary, or renewal date not less than six months after the effective date of the act."}

# Model - 1

Model Info: 
<br>
Embedding : HuggingFaceEmbeddings
<br>
LLM : T5 model (declare-lab/flan-alpaca-large, called through HuggingFaceHub)

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings 
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA

In [11]:
## Load the Hugging Face embedding model.
## No model name is passed, so whatever default HuggingFaceEmbeddings ships with is used.
embeddings = HuggingFaceEmbeddings()

In [12]:
## Build a FAISS vector store from the document chunks using the embeddings above
db = FAISS.from_documents(docs, embeddings)

In [13]:
## T5-based LLM served through HuggingFaceHub (repo declare-lab/flan-alpaca-large),
## configured with temperature 0 and max_length 512.
llm=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
## Question-answering chain of type "stuff" built on that LLM; it is run below
## with the retrieved chunks as input_documents.
chain = load_qa_chain(llm, chain_type="stuff")

In [14]:
# Model 1 run: retrieve chunks similar to the query from the FAISS store, then
# let the QA chain answer from those chunks.
query =  "What is the purpose of the act?"
# Alternative prompt that asks for a longer answer (currently disabled):
#prompt = f"Answer it in atleast 70 words : ```{query}```"
prompt = query
# No `k` is passed, so the store's default number of results is returned.
similar_document = db.similarity_search(query)
model_1_ans = chain.run(input_documents=similar_document, question=prompt)
print(model_1_ans)

The purpose of the act is to provide credit for incorporated underwriters.


In [15]:
# Score Model 1: encode the reference answer and the model's answer with the
# SBERT encoder and compare the two vectors by cosine similarity.
expected_ans_vector = sbert_model.encode(test_case['answer'])
predicted_ans_m1 = sbert_model.encode(model_1_ans)
m1_similarity = cosine_similarity(predicted_ans_m1,expected_ans_vector)
print(f"similarity : {m1_similarity}")

similarity : 0.6067509055137634


# Model - 2

Model Info: 
<br>
Embedding : OpenAIEmbeddings
<br>
LLM : T5 model (declare-lab/flan-alpaca-large, called through HuggingFaceHub)

In [16]:
from langchain.embeddings import OpenAIEmbeddings
# Rebinds `embeddings`: from here on it is the OpenAI embedding client, not the
# Hugging Face one created for Model 1.
embeddings = OpenAIEmbeddings()

In [17]:
## Rebuild the FAISS vector store from the same chunks, now with OpenAI embeddings
## (rebinds `db`, replacing the Model 1 store).
db = FAISS.from_documents(docs, embeddings)

In [18]:
## Same T5-based LLM and "stuff" QA chain as Model 1 (identical arguments);
## only the embeddings behind `db` differ for Model 2.
llm=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
chain = load_qa_chain(llm, chain_type="stuff")

In [19]:
# Model 2 run: same question, chunks retrieved from the OpenAI-embedded FAISS store.
query =  "What is the purpose of the act?"
# Alternative prompt that asks for a longer, summarized answer (currently disabled):
# prompt = f"Answer it in atleast 100 words and summarize it : ```{query}```"
prompt = query
# No `k` is passed, so the store's default number of results is returned.
similar_document = db.similarity_search(query)
model_2_ans = chain.run(input_documents=similar_document, question=prompt)
print(model_2_ans)

The purpose of this Act is to protect the interest of insureds, claimants, ceding insurers, assuming insurers and the public generally.


In [20]:
# Score Model 2 against the reference answer (SBERT encodings + cosine similarity).
expected_ans_vector = sbert_model.encode(test_case['answer'])
predicted_ans_m2 = sbert_model.encode(model_2_ans)
m2_similarity = cosine_similarity(predicted_ans_m2,expected_ans_vector)
print(f"similarity : {m2_similarity}")

similarity : 0.672654390335083


# Model - 3

Model Info: 
<br>
Embedding : OpenAIEmbeddings
<br>
LLM : OpenAI (ChatOpenAI)
<br>
Document search method : RetrievalQA over a Chroma retriever

In [21]:
from langchain.chains import RetrievalQA 
from langchain.indexes import VectorstoreIndexCreator 
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI

In [22]:
# Fresh OpenAI embedding client for Model 3 (same constructor call as in Model 2).
embeddings = OpenAIEmbeddings()

In [23]:
# Index the document chunks in a Chroma vector store (rebinds `db`, which held
# the FAISS store until now).
db = Chroma.from_documents(docs,embeddings)

In [24]:
# Expose the Chroma store through the retriever interface (similarity search)
# so it can be handed to RetrievalQA below.
retriever = db.as_retriever(search_type="similarity")

In [25]:
# OpenAI chat model with temperature 0; no model name is given, so the
# ChatOpenAI default is used.
llm = ChatOpenAI(temperature=0)

In [26]:
# Create a RetrievalQA chain: the retriever supplies the chunks and the chat
# model answers from them. verbose=True prints the chain's start/finish banners.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
                                 retriever=retriever, 
                                 verbose=True)

# Model 3 run: unlike Models 1 and 2, retrieval happens inside the chain, so
# only the question is passed.
query =  "What is the purpose of the act?"
# Alternative prompt that asks for a longer answer (currently disabled):
# prompt = f"Answer it in atleast 70 words: ```{query}```"
prompt = query
model_3_ans = qa.run(prompt)
print(model_3_ans)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
The purpose of the Credit for Reinsurance Model Law is to protect the interest of insureds, claimants, ceding insurers, assuming insurers, and the public generally. The legislature declares its intent to ensure adequate regulation of insurers and reinsurers and adequate protection for those to whom they owe obligations. The matters contained in this Act are fundamental to the business of insurance.


In [27]:
# Score Model 3 against the reference answer (SBERT encodings + cosine similarity).
expected_ans_vector = sbert_model.encode(test_case['answer'])
predicted_ans_m3 = sbert_model.encode(model_3_ans)
m3_similarity = cosine_similarity(predicted_ans_m3,expected_ans_vector)
print(f"similarity : {m3_similarity}")

similarity : 0.7591108083724976


# Model - 4

In [28]:
from langchain.vectorstores import DocArrayInMemorySearch

Model Info: 
<br>
Embedding : OpenAIEmbeddings
<br>
LLM : OpenAI (ChatOpenAI)
<br>
Document search method : VectorstoreIndexCreator (DocArrayInMemorySearch)

In [29]:
from langchain.vectorstores import DocArrayInMemorySearch

In [30]:
# Build an in-memory vector index straight from the PDF loader.
# This passes `loader1`, not the pre-split `docs`, so the 700/50 chunking done
# earlier is not what gets indexed here.
# NOTE(review): the index creator presumably applies its own default splitter — confirm.
# `embeddings` is the OpenAIEmbeddings instance created in the Model 3 section.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings).from_loaders([loader1])

In [31]:
# Same chat model configuration as Model 3: ChatOpenAI with temperature 0.
llm = ChatOpenAI(temperature=0)

In [32]:
# Model 4 run: query the index directly; the chat model is passed in per call.
query =  "What is the purpose of the act?"
# Alternative prompt that asks for a longer answer (currently disabled):
# prompt = f"Answer it in atleast 70 words: ```{query}```"
prompt = query
model_4_ans = index.query(prompt, llm=llm)

In [33]:
# Show the Model 4 answer produced in the previous cell.
print(model_4_ans)

The purpose of the act is to protect the interest of insureds, claimants, ceding insurers, assuming insurers and the public generally, and to ensure adequate regulation of insurers and reinsurers and adequate protection for those to whom they owe obligations.


In [34]:
# Score Model 4 against the reference answer (SBERT encodings + cosine similarity).
expected_ans_vector = sbert_model.encode(test_case['answer'])
predicted_ans_m4 = sbert_model.encode(model_4_ans)
m4_similarity = cosine_similarity(predicted_ans_m4,expected_ans_vector)
print(f"similarity : {m4_similarity}")

similarity : 0.7095122337341309


# Model - 5

In [35]:
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [36]:
# OpenAI embeddings + Chroma store for Model 5 (rebinds `embeddings` and `db`).
# NOTE(review): the Model 3 section already called Chroma.from_documents on the
# same `docs` in this kernel; confirm the two stores do not share a default
# collection, which would leave duplicate chunks in the search results.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(docs,embeddings)

In [37]:
# gpt-3.5-turbo is a chat-completions model, so it is driven through the chat
# wrapper (ChatOpenAI, imported with the Model 3 imports) instead of the
# completion-style OpenAI class, which is not meant for chat models.
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name,temperature = 0)

# "stuff" QA chain on that model; it is run below with the chunks returned by
# db.similarity_search as input_documents.
chain = load_qa_chain(llm, chain_type="stuff")



In [38]:
# Model 5 run: retrieve similar chunks from the Chroma store and answer from them.
# No `k` is passed to similarity_search, so the store's default result count applies.
query = "What is the purpose of the act?"
answer = chain.run(input_documents=db.similarity_search(query), question=query)
# Bare last expression: the notebook displays the answer string as the cell output.
answer

'The purpose of the act is to protect the interest of insureds, claimants, ceding insurers, assuming insurers, and the public generally, and to ensure adequate regulation of insurers and reinsurers and adequate protection for those to whom they owe obligations. It also mandates that upon the insolvency of a non-U.S. insurer or reinsurer that provides security to fund its U.S. obligations in accordance with this Act, the assets representing the security shall be maintained in the United States and claims shall be filed with and valued by the state insurance commissioner with regulatory oversight, and the assets shall be distributed, in accordance with the insurance laws of the state in which the trust is domiciled that are applicable to the liquidation of domestic U.S. insurance companies.'

In [39]:
# Score Model 5: encode the reference answer and the Model 5 answer (`answer`)
# with the SBERT encoder and compare the two vectors by cosine similarity.
expected_ans_vector = sbert_model.encode(test_case['answer'])
# Kept under its own name so the Model 4 vector (predicted_ans_m4) is not overwritten.
predicted_ans_m5 = sbert_model.encode(answer)
m5_similarity = cosine_similarity(predicted_ans_m5,expected_ans_vector)
print(f"similarity : {m5_similarity}")

similarity : 0.807817280292511
