In [8]:
import requests
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader


# Path to the contract PDF that the rest of the notebook indexes and queries.
doc_path = '../data/evaluation_set/Raptor Contract.docx.pdf'

# Build the loader before using it. The original cell called `loader.load()`
# ahead of this assignment, which raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
loader = PyPDFLoader(doc_path)
pages = loader.load()

# Keep the `documents` name bound for any code that still refers to it; it now
# points at the same list as `pages` instead of a second, redundant load.
documents = pages

pages

[Document(page_content='[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGREEMENT\nBY\nAND\nAMONG\n[BUYER],\n[TARGET\nCOMP ANY],\nTHE\nSELLERS\nLISTED\nON\nSCHEDULE\nI\nHERET O\nAND\nTHE\nSELLERS’\nREPRESENT ATIVE\nNAMED\nHEREIN\nDated\nas\nof\n[●]\n[This\ndocument\nis\nintended\nsolely\nto\nfacilitate\ndiscussions\namong\nthe\nparties\nidentified\nherein. \nNeither\nthis\ndocument\nnor\nsuch\ndiscussions\nare\nintended\nto\ncreate,\nnor\nwill\neither\nor\nboth\nbe \ndeemed\nto\ncreate,\na\nlegally\nbinding\nor\nenforceable\noffer\nor\nagreement\nof\nany\ntype\nor\nnature, \nunless\nand\nuntil\na\ndefinitive\nwritten\nagreement\nis\nexecuted\nand\ndelivered\nby\neach\nof\nthe\nparties \nhereto.\nThis\ndocument\nshall\nbe\nkept\nconfidential\npursuant\nto\nthe\nterms\nof\nthe\nConfidentiality \nAgreement\nentered\ninto\nby\nthe\nparties\nand,\nif\napplicable,\nits\naffiliates\nwith\nrespect\nto\nthe\nsubject \nmatter\nhereof.]\n112923184_5\n', metadata={'source': '../data/evaluation_set/Rapt

Character text splitter

In [9]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter

# Splitter settings passed to CharacterTextSplitter below.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# NOTE(review): CharacterTextSplitter splits on its `separator` argument, which
# is not set here; confirm the library default suits this PDF's text so that
# chunks actually come out near CHUNK_SIZE.
text_splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# The metadata-tagging and vector-store cells read `chunks`, but the split
# result was only ever bound to `docs`, so those cells failed with NameError on
# a fresh kernel. Bind the result to `chunks`.
chunks = text_splitter.split_documents(pages)

# `docs` stays available as an alias of the same list for backward compatibility.
docs = chunks

In [None]:
# RAGAS expects a `file_name` key in each document's metadata dict,
# so mirror the existing `source` value under that key.
for chunk in chunks:
    chunk_meta = chunk.metadata
    chunk_meta['file_name'] = chunk_meta['source']

In [10]:

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv


# Load environment variables from the .env file located by find_dotenv()
# (presumably this supplies the OpenAI API key -- confirm against the .env file).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Weaviate client configured with embedded options.
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Populate the vector database with the chunked documents, embedded via OpenAI.
vectorstore = Weaviate.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    client=client,
    by_text=False,
)

# Expose the vector store as a retriever to enable semantic search.
retriever = vectorstore.as_retriever()

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


Binary /home/tema/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.23.7/weaviate-v1.23.7-Linux-amd64.tar.gz


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Prompt text for the answering step. The trailing spaces and the final newline
# are part of the string and are spelled out explicitly here.
template = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Use two sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to generate answers; temperature is set to 0.
model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# RAG pipeline: the input question is passed through unchanged and also sent to
# the retriever; both fill the prompt, whose output goes to the model and is
# then parsed to a plain string.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
from datasets import Dataset

# Evaluation questions; each one is paired by position with its reference
# answer in `ground_truths`.
# NOTE(review): these questions mention PromptlyTech, while the PDF loaded at
# the top of this notebook is a stock purchase agreement -- confirm the
# evaluation set matches the indexed document.
questions = [
    "What is the business objective PromptlyTech",
    "What are the key services provided by PromptlyTech?",
    "What is prompt engineering?",
]
ground_truths = [
    ["PromptlyTech aims to revolutionize how businesses interact with LLMs, making the technology more accessible, efficient, and effective."],
    ["PromptlyTech focuses on Automatic Prompt Generation, Automatic Evaluation Data Generation, and Prompt Testing and Ranking services."],
    ["Prompt engineering is the craft of designing queries or statements to guide LLMs to produce desired outcomes."],
]

# Inference: for every question, record the generated answer and the text of
# the documents the retriever returns for that same question.
answers = []
contexts = []
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved = retriever.get_relevant_documents(query)
    contexts.append([hit.page_content for hit in retrieved])

# Column-oriented mapping: one list per dataset column.
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths,
}

# Build the dataset from the column mapping.
dataset = Dataset.from_dict(data)