### Weaviate Vector DB Creation

In [1]:
!docker compose up -d

 Container rag-comprehensive-weaviate-1  Starting
Error response from daemon: driver failed programming external connectivity on endpoint rag-comprehensive-weaviate-1 (5e50f0752e195469cde8b8dcd29bfc0ba2c8a7f0d2a47aafd74277baa912e587): Bind for 0.0.0.0:50051 failed: port is already allocated


In [2]:
import os
import weaviate
from dotenv import load_dotenv
load_dotenv()

# Create weaviate client
google_api_key=os.getenv("GOOGLE_API_KEY")
client = weaviate.Client(
    url = "http://localhost:8080",  # Replace with your endpoint
    additional_headers = {
        "X-Google-Api-Key": google_api_key # Replace with your inference API key
    }
)
client.is_ready()

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


True

### Data ingestion

In [3]:
from llama_index.core import SimpleDirectoryReader
# Load documents
documents=SimpleDirectoryReader("./data").load_data()

### Data Indexing

In [4]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

Settings.llm = Gemini(models='gemini-pro', api_key=google_api_key)
Settings.embed_model = GeminiEmbedding(model_name="models/embedding-001")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# construct vector store 
vector_store = WeaviateVectorStore(weaviate_client=client)

# set up storage for embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# set up the index 
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context
)

### Data Querying

##### Query with Default Vector Search

In [6]:
from llama_index.core.response.notebook_utils import display_response

query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

display_response(response)


**`Final Response:`** The author wrote short stories and programmed on an IBM 1401 computer.

### Evaluation with ragas

In [7]:
from datasets import Dataset
from llama_index.core.llama_dataset import LabelledRagDataset

eval_dataset = LabelledRagDataset.from_json("./eval_data/eval_dataset.json")


In [8]:
data = Dataset.from_pandas(eval_dataset.to_pandas())
generated_answers = []
for example in data:
    answer =  query_engine.query(example['query'])
    generated_answers.append(answer)

StopCandidateException: index: 0
finish_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: HIGH
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}


In [None]:
data = Dataset.from_pandas(eval_dataset.to_pandas())
print(data)
question = [x for x in data['query']]
contexts = [x for x in data['reference_contexts']]
print(data['reference_answer'])
ground_truth = [[x] for x in data['reference_answer']]

generated_answers = [str(x) for x in generated_answers]

formatted_data =  Dataset.from_dict({
    "question": question,
    "contexts": contexts,
    "answer": generated_answers,
    "ground_truths": ground_truth
})


Dataset({
    features: ['query', 'reference_contexts', 'reference_answer', 'reference_answer_by', 'query_by'],
    num_rows: 44
})
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [None]:
file_path = "data.txt"

# Open the file in write mode
with open(file_path, "w") as file:
    # Iterate over the dictionary items and write them to the file
    for key, value in formatted_data.items():
        file.write(f"{key}: {value}\n")

In [None]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    harmfulness,
]

In [None]:
from ragas import evaluate

result = evaluate(dataset=formatted_data,metrics=metrics)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'formatted_data' is not defined