# Question and Answer RAG system

In [2]:
packages_to_check = [
    'langchain',
    'openai',
    'weaviate-client',
    'tiktoken',
    'pypdf',
    'python-docx',
    'docx2pdf',
    'rapidocr-onnxruntime',
    'sentence-transformers',
    'python-dotenv',
    'langchain_openai'
    
]

def check_package(package_name):
    try:
        __import__(package_name)
        print(f"{package_name} is installed.")
    except ImportError:
        print(f"{package_name} is not installed.")

for package in packages_to_check:
    check_package(package)


langchain is installed.
openai is installed.
weaviate-client is not installed.
tiktoken is installed.
pypdf is installed.
python-docx is not installed.
docx2pdf is not installed.
rapidocr-onnxruntime is not installed.
sentence-transformers is not installed.
python-dotenv is not installed.
langchain_openai is installed.


### Weaviate vector DB

In [3]:
import os
import weaviate
from langchain.vectorstores import Weaviate

auth_config = weaviate.AuthApiKey(api_key = os.environ["WEAVIATE_API_KEY"])

client = weaviate.Client(
  url=WEAVIATE_URL,
  additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
  auth_client_secret=auth_config,
  startup_period=10
)

In [4]:
WEAVIATE_URL = os.environ['WEAVIATE_CLUSTER']
WEAVIATE_API_KEY = os.environ['WEAVIATE_API_KEY']


In [5]:
"""from langchain.vectorstores import Weaviate
import weaviate


client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY)
)
"""

'from langchain.vectorstores import Weaviate\nimport weaviate\n\n\nclient = weaviate.Client(\n    url=WEAVIATE_URL,\n    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY)\n)\n'

### Load data as pdf

In [11]:
from docx import Document

#doc_path = '/content/Raptor Contract.docx'
doc_path = '/home/tema/10X/week11/Legal_Expert_Contract_Advisor_RAG/data/evaluation_set/Raptor Contract.docx'
doc = Document(doc_path)


pages = []
for paragraph in doc.paragraphs:
    # For simplicity, consider each paragraph as a 'page'
    pages.append(paragraph.text)


for page_number, page_content in enumerate(pages, start=1):
    print(page_content)


STOCK PURCHASE AGREEMENT
BY AND AMONG
[BUYER],
[TARGET COMPANY],
THE SELLERS LISTED ON SCHEDULE I HERETO
AND
THE SELLERS’ REPRESENTATIVE NAMED HEREIN
Dated as of [●]

[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.

This document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]


TABLE OF CONTENTS
ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION	2
Section 1.01	Definitions	2
Section 1.02	Certain Matters of Construction	13
ARTICLE II PURCHASE AND SALE OF SHARES AND WARRANTS; TREATMENT OF OPTIONS; CLOSING.	14
S

In [12]:
from langchain_community.document_loaders import PyPDFLoader

doc_path = '/home/tema/10X/week11/Legal_Expert_Contract_Advisor_RAG/data/evaluation_set/Raptor Contract.docx.pdf'

loader = PyPDFLoader(doc_path)
pages = loader.load_and_split()

pages

[Document(page_content='[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGREEMENT\nBY\nAND\nAMONG\n[BUYER],\n[TARGET\nCOMP ANY],\nTHE\nSELLERS\nLISTED\nON\nSCHEDULE\nI\nHERET O\nAND\nTHE\nSELLERS’\nREPRESENT ATIVE\nNAMED\nHEREIN\nDated\nas\nof\n[●]\n[This\ndocument\nis\nintended\nsolely\nto\nfacilitate\ndiscussions\namong\nthe\nparties\nidentified\nherein. \nNeither\nthis\ndocument\nnor\nsuch\ndiscussions\nare\nintended\nto\ncreate,\nnor\nwill\neither\nor\nboth\nbe \ndeemed\nto\ncreate,\na\nlegally\nbinding\nor\nenforceable\noffer\nor\nagreement\nof\nany\ntype\nor\nnature, \nunless\nand\nuntil\na\ndefinitive\nwritten\nagreement\nis\nexecuted\nand\ndelivered\nby\neach\nof\nthe\nparties \nhereto.\nThis\ndocument\nshall\nbe\nkept\nconfidential\npursuant\nto\nthe\nterms\nof\nthe\nConfidentiality \nAgreement\nentered\ninto\nby\nthe\nparties\nand,\nif\napplicable,\nits\naffiliates\nwith\nrespect\nto\nthe\nsubject \nmatter\nhereof.]\n112923184_5', metadata={'source': '/home/tema/10X/week11/Legal_E

In [13]:
length = 0
count = 0
for page in pages:
    length += len(page.page_content)
    count += 1

length, count
print(f"content: {length} Page size: {count}")


content: 225087 Page size: 76


In [14]:
print(len(pages[0].page_content))

801


### Chunking methods

Fixed size chunking

In [15]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter

CHUNK_SIZE = 300
CHUNK_OVERLAP = 30

text_splitter = CharacterTextSplitter(chunk_size = CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(pages)


Recursive Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
  
    chunk_size = 270,
    chunk_overlap  = 40
)

#docs = text_splitter.create_documents(pages)

Specialized Chunking

In [None]:
#LaTeX 
from langchain.text_splitter import LatexTextSplitter
latex_text = "..."
latex_splitter = LatexTextSplitter(chunk_size=100, chunk_overlap=0)
#docs = latex_splitter.create_documents([latex_text])

“Content-aware” Chunking

In [None]:
#naive Splitting

text = pages 
naive_docs = text.split(".")

In [None]:
#The Natural Language Toolkit (NLTK)
from langchain.text_splitter import NLTKTextSplitter


text_splitter = NLTKTextSplitter()
nltk_docs = text_splitter.split_text(text)

In [None]:
#spacy
from langchain.text_splitter import SpacyTextSplitter

text_splitter = SpaCyTextSplitter()
spacy_docs = text_splitter.split_text(pages)


### loading the doc to the vector database



In [16]:
vector_db = Weaviate.from_documents(
    docs, embeddings, client=client, by_text=False
)

/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


### similarity search Retrival

In [24]:
vector_db.similarity_search("what is Action?", k=3)

/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


[Document(page_content='Section\nIV.08\nGoverning\nLaw\n.\nThis\nAgreement,\nthe\nrights\nof\nthe\nparties\nhereunder\nand \nall\nActions\narising\nin\nwhole\nor\nin\npart\nunder\nor\nin\nconnection\nherewith\n(including\nany\nAction \nbased\nupon,\narising\nout\nof,\nor\nrelated\nto\nany\nrepresentation\nor\nwarranty\nmade\nin\nconnection\nwith \nthis\nAgreement\nor\nas\nan\ninducement\nto\nenter\ninto\nthis\nAgreement),\nwill\nbe\ngoverned\nby\nand \nconstrued\nand\nenforced\nin\naccordance\nwith\nthe\ndomestic\nsubstantive\nlaws\nof\nthe\nState\nof \nDelaware,\nincluding\nits\nstatute\nof\nlimitations,\nwithout\ngiving\neffect\nto\nany\nchoice\nor\nconflict\nof \nlaw\nprovision\nor\nrule\nthat\nwould\ncause\nthe\napplication\nof\nthe\nlaws\nof\nany\nother\njurisdiction.\nSection\nIV.09\nJurisdiction;\nVenue;\nService\nof\nProcess.\n(a)\nJurisdiction\n.\nSubject\nto\nthe\nprovisions\nof\nSections\n2.06\n,\neach\nof\nthe\nparties\nto\nthis \nAgreement,\nby\nits\nexecution\nhereof,\n(i

In [26]:
content = vector_db.similarity_search("what is Action?", k=3)[0].page_content
content = content.replace("\n", " ").strip()

print(content)

Section IV.08 Governing Law . This Agreement, the rights of the parties hereunder and  all Actions arising in whole or in part under or in connection herewith (including any Action  based upon, arising out of, or related to any representation or warranty made in connection with  this Agreement or as an inducement to enter into this Agreement), will be governed by and  construed and enforced in accordance with the domestic substantive laws of the State of  Delaware, including its statute of limitations, without giving effect to any choice or conflict of  law provision or rule that would cause the application of the laws of any other jurisdiction. Section IV.09 Jurisdiction; Venue; Service of Process. (a) Jurisdiction . Subject to the provisions of Sections 2.06 , each of the parties to this  Agreement, by its execution hereof, (i) hereby irrevocably submits to the exclusive jurisdiction  of the state courts located in Wilmington, Delaware or the courts of the United States located in  W

In [41]:
content = vector_db.similarity_search("what is Action?", k=3)[1].page_content
clean_content = content.replace("\n", " ").strip()

print(clean_content)


/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


AGREEMENT NOW THEREFORE, in consideration of the premises and mutual promises herein made,  and in consideration of the representations, warranties and covenants herein contained, the  parties to this Agreement hereby agree as follows: ARTICLE I  DEFINITIONS; CER TAIN RULES OF CONSTRUCTION Section I.01 Definitions . In addition to the other terms defined throughout this  Agreement, the following terms shall have the following meanings when used in this Agreement: [“ Accounting Principles ” means GAAP as in effect on the Most Recent Balance Sheet  Date and, to the extent consistent with GAAP , using the same accounting methods, principles,  practices, procedures and estimation methodologies as those utilized in the preparation of the  Most Recent Balance Sheet]. “ Acquired Companies ” means, collectively , the Company and each of its Subsidiaries. “ Action ” means any claim, action, suit, litigation, mediation, arbitration, known  investigation, known opposition, interference, audit, as

In [43]:
print(len(content))

2829


### Generate output from LLM

In [27]:
from langchain.prompts import ChatPromptTemplate

template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.

Question: {question}
Context: {context}
Answer:
"""

In [29]:
prompt=ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\n\nQuestion: {question}\nContext: {context}\nAnswer:\n"))])

#### openai embeddings

In [None]:

import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [None]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

MODEL = "gpt-3.5-turbo"


In [None]:

#model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
#embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

print(embeddings)


In [57]:
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model= MODEL)
parser = StrOutputParser()
rag_chain = model| parser

rag_chain.invoke("Tell me a joke related to law?")

'Why was the lawyer always calm in court? \n\nBecause he knew how to keep his lawsuit-ions under control!'

In [33]:
retriever=vector_db.as_retriever()

In [34]:

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

In [35]:
rag_chain.invoke('what is Action mean?')

/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


'Action means any claim, action, suit, litigation, mediation, arbitration, known investigation, known opposition, interference, audit, assessment, hearing, complaint, charge, demand, or other legal proceeding that is commenced, brought, conducted, tried or heard by or before, or otherwise involving, any Governmental Authority.'

In [36]:
print(rag_chain.invoke('what does Change of Control Payment mean?'))

/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


A "Change of Control Payment" refers to any bonus, severance, or other form of compensation that becomes payable to individuals associated with an acquired company as a result of a change in control, excluding certain types of severance payments.


### mistral LLm model

In [None]:
# specify embedding model (using huggingface sentence transformer)
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
#model_kwargs = {"device": "cuda"}
huggingface_embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model_name,
  #model_kwargs=model_kwargs
)

In [None]:
import os
from langchain import HuggingFaceHub

huggingfacehub_api_token= os.environ['HUGGINGFACE_TOKEN']

In [None]:
model = HuggingFaceHub(
    huggingfacehub_api_token=huggingfacehub_api_token,
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    model_kwargs={"temperature":1, "max_length":180}
)

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

output_parser=StrOutputParser()

In [None]:
retriever=vector_db.as_retriever()

In [None]:

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | model
    | output_parser
)

In [None]:
print(rag_chain.invoke("what is Action mean?"))

### Q&A chain generation 

In [44]:
import openai
from langchain.llms import OpenAIChat
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model= MODEL)
parser = StrOutputParser()
rag_chain = model| parser


In [47]:
query = [
    "What steps are outlined for resolving disputes between Buyer and Sellers' Representative?",
    "How long after delivery of a Dispute Notice can either party elect to submit disputed items to an Accounting Firm?",
    #"What criteria are used to select the Accounting Firm for resolving disputes?",
    "How does the Accounting Firm review and resolve disputed items according to the Agreement?",
    "Who bears the fees, costs, and expenses of the Accounting Firm, and how are they allocated?",
    #"What is the role of the Final Closing Statement in the dispute resolution process?",
    #"What efforts are parties expected to make in cooperating with the Accounting Firm during the dispute resolution process?"
]


In [48]:
retriever=vector_db.as_retriever()

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

In [51]:
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


rag_chain = rag_chain | RunnablePassthrough()

# Loop through each question in the query list/dictionary and invoke the pipeline
for q in query:
    response = rag_chain.invoke(q)
    print(f"Question: {q}")
    print(f"Response: {response}\n")


/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


Question: What steps are outlined for resolving disputes between Buyer and Sellers' Representative?
Response: The steps outlined for resolving disputes between Buyer and Seller's Representative include the Buyer and Seller's Representative attempting to resolve the matters raised in any Dispute Notice in good faith. If they cannot resolve the dispute, they may elect to submit the disputed items to a nationally recognized independent accounting firm for review and resolution. The decision of the Accounting Firm will be final, conclusive, and binding on the parties.



/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


Question: How long after delivery of a Dispute Notice can either party elect to submit disputed items to an Accounting Firm?
Response: Either party can elect to submit disputed items to an Accounting Firm beginning ten (10) Business Days after the delivery of any Dispute Notice.



/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


Question: How does the Accounting Firm review and resolve disputed items according to the Agreement?
Response: The Accounting Firm reviews and resolves disputed items by jointly selecting a nationally recognized independent accounting firm with the Sellers' Representative. The Accounting Firm reviews unresolved items specifically objected to in the Dispute Notice and provides a written decision detailing the basis for its decision. The fees, costs, and expenses of the Accounting Firm are allocated between the Buyer and the Sellers' Representative based on the outcome of the dispute resolution process.



/home/tema/10X/week11/.venv/lib/python3.12/site-packages/pydantic/main.py:1086: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/


Question: Who bears the fees, costs, and expenses of the Accounting Firm, and how are they allocated?
Response: The fees, costs, and expenses of the Accounting Firm will be allocated between the Buyer and the Sellers’ Representative (on behalf of the Company Securityholders) based on the percentage which the portion of the contested amount not awarded to each party bears to the amount actually contested by each party, as determined by the Accounting Firm.



In [52]:


from langchain.chains.qa_with_sources import load_qa_with_sources_chain

chain = load_qa_with_sources_chain(OpenAIChat(temperature=0.2, model_name='gpt-3.5-turbo'), chain_type='stuff')

answer = chain({'input_documents': content, 'question': query}, return_only_outputs=True)



  warn_deprecated(


AttributeError: 'str' object has no attribute 'page_content'

In [None]:
from langchain.chains import RetrievalQA 
from langchain.chains import RetrievalQAWithSourcesChain 

qa = RetrievalQA.from_chain_type(llm=OpenAIChat(temperature=0.2, model_name='gpt-3.5-turbo'), chain_type='stuff', retriever=qdrant.as_retriever())

answer = qa.run(query)

In [None]:
chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAIChat(temperature=0.2, model_name='gpt-3.5-turbo'), chain_type='stuff', retriever=qdrant.as_retriever())

answer = chain({'question': query}, return_only_outputs=True)
