### Weaviate Vector DB Creation

In [1]:
!docker compose up -d

 Container rag-comprehensive-weaviate-1  Starting
Error response from daemon: driver failed programming external connectivity on endpoint rag-comprehensive-weaviate-1 (3392f371bb4bf9d5a78c0b7318e032cb0a1c4e81ff8c4ebd44e94a4ff6f16d32): Bind for 0.0.0.0:50051 failed: port is already allocated


In [2]:
import os
import weaviate
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The Google key is forwarded to Weaviate as a request header below and is
# reused when configuring the Gemini LLM, so fail fast with a clear message
# when it is missing instead of silently building a header whose value is None.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to a .env file "
        "before creating the Weaviate client."
    )

# Create the Weaviate client against the locally running instance.
client = weaviate.Client(
    url="http://localhost:8080",  # Replace with your endpoint
    additional_headers={
        "X-Google-Api-Key": google_api_key,  # Replace with your inference API key
    },
)

# Last expression of the cell: shows whether the server reports itself ready.
client.is_ready()

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


True

### Data ingestion

In [3]:
from llama_index.core import SimpleDirectoryReader
# Load the source documents from ./data.
# The labelled evaluation set (./data/eval_dataset.json) lives in the same
# folder; exclude it so its queries and reference answers are not ingested
# and indexed together with the documents being evaluated.
documents = SimpleDirectoryReader(
    input_dir="./data",
    exclude=["eval_dataset.json"],
).load_data()

### Data chunking

Define semantic chunker

In [6]:
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)
from llama_index.embeddings.gemini import GeminiEmbedding

# Embedding model handed to the splitter.
gemini_embed_model = GeminiEmbedding(model_name="models/embedding-001")

# Semantic splitter: buffer size 1, breakpoint percentile threshold 95.
semantic_splitter = SemanticSplitterNodeParser(
    embed_model=gemini_embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

Chunk documents into nodes

In [7]:
# Run the semantic splitter over the loaded documents to obtain the nodes.
nodes = semantic_splitter.get_nodes_from_documents(documents)

### Data Indexing

In [8]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

# Global LlamaIndex defaults picked up by the index and query engine.
# The model is selected with `model_name`; the previous `models=` keyword is
# not the model-selection parameter, so the requested model was never
# explicitly applied.
Settings.llm = Gemini(model_name="models/gemini-pro", api_key=google_api_key)
Settings.embed_model = GeminiEmbedding(model_name="models/embedding-001")
# Fallback parser settings: 512-sized chunks with an overlap of 20.
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

In [10]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# Wrap the existing Weaviate client in a LlamaIndex vector store.
vector_store = WeaviateVectorStore(weaviate_client=client)

# Storage context that points at that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the already-chunked nodes.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

### Data Querying

##### Query with Default Vector Search

In [11]:
from llama_index.core.response.notebook_utils import display_response

# Query engine over the index, using its default settings.
query_engine = index.as_query_engine()

# Ask a single question and render the answer in the notebook.
response = query_engine.query(
    "Tell me about the author's experience in YC"
)
display_response(response)


**`Final Response:`** The author was the president of YC and retired in March 2014.

### Evaluation with ragas

In [44]:
from datasets import Dataset
from llama_index.core.llama_dataset import LabelledRagDataset

# Read the labelled evaluation set from its JSON file.
eval_dataset = LabelledRagDataset.from_json(path="./data/eval_dataset.json")


In [21]:
# Convert the labelled examples to a Hugging Face Dataset (via pandas).
data = Dataset.from_pandas(eval_dataset.to_pandas())

# Send every evaluation query through the query engine, keeping the raw
# response objects in dataset order.
generated_answers = [query_engine.query(row["query"]) for row in data]

In [47]:
# Rebuild the Hugging Face Dataset from the labelled examples and show its schema.
data = Dataset.from_pandas(eval_dataset.to_pandas())
print(data)

# Pull out the columns needed for the evaluation dataset.
question = list(data["query"])
contexts = list(data["reference_contexts"])
print(data["reference_answer"])
# Each reference answer is wrapped in its own single-element list.
ground_truth = [[reference] for reference in data["reference_answer"]]

# Turn the query-engine responses into plain strings.
generated_answers = list(map(str, generated_answers))

# Assemble the columns under the key names used for evaluation.
formatted_data = Dataset.from_dict(
    {
        "question": question,
        "contexts": contexts,
        "answer": generated_answers,
        "ground_truths": ground_truth,
    }
)


Dataset({
    features: ['query', 'reference_contexts', 'reference_answer', 'reference_answer_by', 'query_by'],
    num_rows: 44
})
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [12]:
file_path = "data.txt"

# Dump every column of the formatted dataset as one "name: values" line.
# A Hugging Face `Dataset` is not a mapping and has no `.items()`, so it is
# converted to a plain {column: list} dict before iterating.
# UTF-8 is set explicitly so non-ASCII text is written the same on every platform.
with open(file_path, "w", encoding="utf-8") as out_file:
    for key, value in formatted_data.to_dict().items():
        out_file.write(f"{key}: {value}\n")

NameError: name 'formatted_data' is not defined

In [13]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# The ragas metric objects imported above, collected into one list.
# NOTE(review): presumably passed to ragas' evaluate() in a later cell — confirm.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]