In [1]:
# Mount Google Drive into the Colab runtime so files stored on Drive are
# available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install langchain-openai langchain-core langchain-chroma langchain unstructured datasets ragas
%pip install --upgrade --quiet  rank_bm25 > /dev/null

Collecting langchain-openai
  Downloading langchain_openai-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting unstructured
  Downloading unstructured-0.16.4-py3-none-any.whl.metadata (24 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting ragas
  Downloading ragas-0.2.3-py3-none-any.whl.metadata (7.9 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0 (from langchain-chroma)
  Downloading chromadb-0.5.17-py3-none-any.whl.metadata (6.8 kB)
Collecting fastapi<1,>=0.95.2 (from langchain-chroma)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting filetype (from unstructu

**Učitavanje seta pitanja i seta ground_truth vrednosti za svako pitanje iz klase Evaluation koju smo definisali u evaluation fajlu.**

In [3]:
import sys
# Make the project's packages stored on Drive importable.
sys.path.append("/content/drive/MyDrive/project_work/code/")
from dotenv import load_dotenv
# Load environment variables from a .env file; OPENAI_API_KEY is read from the
# environment further down in this notebook.
load_dotenv()
from  text_and_tables_retrieval.evaluation.evaluation import Evaluation, ground_truth # make sure the env file and data_dir are loaded first

# Evaluation is instantiated with two None arguments; only its `questions`
# attribute is used here.
eval_instance = Evaluation(None, None)
questions = eval_instance.questions # all questions
ground_truths = ground_truth  # all ground truths

**Učitati 4 retriever-a: MultiVectorRetriever, ParentRetriever, ContextualCompressionRetriever, EnsembleRetriever**

In [4]:
from databases.chroma_db_connection import create_retriever
from databases.retrievers import create_ensemble_retriever, create_parent_retriever, create_retriever_with_cc, Element

# Build the ensemble retriever from the strategy-4 chunk pickle and raw-table pickle.
# The trailing 5 is presumably the number of documents to return (top-k) — TODO confirm
# against databases.retrievers.
# NOTE(review): the first path is absolute (/content/...) while the second is relative
# (./...); they point at the same directory only when the working directory is /content.
ensemble_retriever = create_ensemble_retriever("/content/data_dir/strategy_4/chunked_elements.pkl", "./data_dir/strategy_4/raw_table_elements.pkl", 5)

In [5]:
# Build the contextual-compression retriever from the same strategy-4 pickles.
# The trailing 5 is presumably the top-k — TODO confirm against databases.retrievers.
retriever_cc = create_retriever_with_cc("./data_dir/strategy_4/chunked_elements.pkl", "./data_dir/strategy_4/raw_table_elements.pkl", 5)

In [6]:
# Build the parent retriever from the same strategy-4 pickles; it is evaluated
# below under the name `retriever`.
# The trailing 5 is presumably the top-k — TODO confirm against databases.retrievers.
retriever = create_parent_retriever("./data_dir/strategy_4/chunked_elements.pkl", "./data_dir/strategy_4/raw_table_elements.pkl", 5)

  vectorstore = Chroma(


**Potrebno je da LLM asistira u kreiranju finalnih odgovora na osnovu ground_truth vrednosti za svako pitanje. Ground_truth vrednosti ćemo pretvoriti u odgovore kako bismo omogućili pristup evaluaciji preciznosti koji je invarijantan u odnosu na veličinu chunk-a.**

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
import os
# Read the API key from the environment (None if the variable is not set).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Prompt that turns a ground-truth passage plus its question into a full answer,
# restricted to the supplied context.
template = """
Your task is to generate complete answer based on the provided document or concatenated documents.
You need to use complete context provided to you in order to make informative answers.
You are not allowed to use external knowledge in the answer generation process.
This is your context: {context}
This is input question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# NOTE(review): no `model` is passed, so whichever default ChatOpenAI ships with is used,
# whereas the evaluator functions below pin "gpt-4o-mini" — consider pinning here too.
chat = ChatOpenAI(temperature = 0, api_key = OPENAI_API_KEY)

# prompt -> chat model -> plain string output.
chain = prompt | chat | StrOutputParser()

# One LLM call per (question, ground truth) pair; the ground truth is passed as the context.
# zip() stops at the shorter of the two lists.
answers = [chain.invoke({"context":gt, "question":question}) for question, gt in zip(questions, ground_truths) ]


In [None]:
# Cache the generated answers on disk so they do not have to be regenerated.
# NOTE(review): this writes to the relative path "answers.pkl", while the loading cell
# reads "/content/answers.pkl" — the same file only when the working directory is /content.
import pickle
with open("answers.pkl", "wb") as file:
  pickle.dump(answers, file)

In [8]:
# Restore previously cached answers; this replaces any `answers` already in memory.
# pickle.load can execute arbitrary code — only load a file this notebook produced.
import pickle
with open("/content/answers.pkl", "rb") as file:
  answers = pickle.load(file)

**Prilagoditi evaluacione metrike precision i recall, kao i F1 score za evaluaciju retrieve faze.**

In [9]:
import ragas
from ragas import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithReference, LLMContextRecall
import numpy as np
from langchain_core.documents.base import Document
from ragas.llms import LangchainLLMWrapper


def prepare_docs(docs: "list[Document | str]") -> "list[str]":
  """Normalize retrieved items into a list of plain-text strings.

  Args:
    docs: Retrieved items. Each item is either already a string or an object
      exposing its text via a `page_content` attribute (e.g. a LangChain Document).

  Returns:
    A new list with one string per input item, in the same order. An empty or
    missing input yields an empty list instead of raising IndexError.
  """
  if not docs:
    # A retriever may legitimately return nothing for a question.
    return []
  # Decide per item rather than from docs[0] so mixed lists are handled too.
  return [doc if isinstance(doc, str) else doc.page_content for doc in docs]

def evaluate_one_retrieval_precision(question:str, retrieved_context:list, answer:str):
  """Score the context precision of one retrieval with ragas.

  Args:
    question: The user question the retrieval was run for.
    retrieved_context: Retrieved items (strings or documents); they are converted
      to plain text with `prepare_docs`.
    answer: Reference answer the retrieved contexts are judged against.

  Returns:
    The value returned by `LLMContextPrecisionWithReference.single_turn_score`
    for this sample (presumably a float in [0, 1] — confirm in the ragas docs).
  """
  # The judge model is gpt-4o-mini at temperature 0, created anew on every call.
  judge_model = ChatOpenAI(
      model="gpt-4o-mini",
      temperature=0,
      api_key=os.getenv("OPENAI_API_KEY"),
  )
  metric = LLMContextPrecisionWithReference(llm=LangchainLLMWrapper(judge_model))
  context_texts = prepare_docs(retrieved_context)
  sample = SingleTurnSample(
      user_input=question,
      retrieved_contexts=context_texts,
      reference=answer,
  )
  return metric.single_turn_score(sample)

def evaluate_one_retrieval_recall(question:str,retrieved_context:list, answer:str):
  """Score the context recall of one retrieval with ragas.

  Args:
    question: The user question the retrieval was run for.
    retrieved_context: Retrieved items (strings or documents); they are converted
      to plain text with `prepare_docs`.
    answer: Reference answer the retrieved contexts are judged against.

  Returns:
    The value returned by `LLMContextRecall.single_turn_score` for this sample
    (presumably a float in [0, 1] — confirm in the ragas docs).
  """
  # The judge model is gpt-4o-mini at temperature 0, created anew on every call.
  judge_model = ChatOpenAI(
      model="gpt-4o-mini",
      temperature=0,
      api_key=os.getenv("OPENAI_API_KEY"),
  )
  metric = LLMContextRecall(llm=LangchainLLMWrapper(judge_model))
  context_texts = prepare_docs(retrieved_context)
  sample = SingleTurnSample(
      user_input=question,
      retrieved_contexts=context_texts,
      reference=answer,
  )
  return metric.single_turn_score(sample)


def calculate_f1_score(precision, recall):
  """Return the F1 score (harmonic mean) of precision and recall.

  Args:
    precision: Precision value.
    recall: Recall value.

  Returns:
    2 * precision * recall / (precision + recall), or 0.0 when both inputs sum
    to zero, where the formula would otherwise divide by zero.
  """
  denominator = precision + recall
  if denominator == 0:
    # Nothing relevant retrieved and nothing recalled: F1 is 0 by convention.
    return 0.0
  return (2*precision*recall)/denominator

def evaluate_all_retrievals(retriever):
  """Evaluate a retriever over every (question, answer) pair of the notebook.

  Args:
    retriever: A retriever already populated with documents. It is called as
      `retriever.invoke(question)` with the question passed as a plain string.

  Returns:
    A tuple `(mean_precision, mean_recall, f1)`, where `f1` is computed from the
    two means rather than averaged per question.
  """
  precisions = []
  recalls = []
  # zip() stops at the shorter of the module-level `questions` / `answers` lists.
  for question, answer in zip(questions, answers):
    # Retrieve once per question so both metrics judge the same contexts and the
    # retriever (which may itself call an LLM) is not run twice.
    retrieved = retriever.invoke(question)
    precisions.append(evaluate_one_retrieval_precision(question, retrieved, answer))
    recalls.append(evaluate_one_retrieval_recall(question, retrieved, answer))
  mean_precision = np.mean(precisions)
  mean_recall = np.mean(recalls)
  return mean_precision, mean_recall, calculate_f1_score(mean_precision, mean_recall)


**Parent Retriever Evaluacije**

In [10]:
# Evaluate the parent retriever (`retriever`) over all questions and report the aggregate scores.
parent_precision, parent_recall, parent_f1 = evaluate_all_retrievals(retriever)
print(f"Precision:{parent_precision}, Recall: {parent_recall}, F1: {parent_f1}")

Precision:0.911220760198081, Recall: 0.8514325586694008, F1: 0.8803126684793736


**Contextual Compression Retriever**

In [11]:
# Evaluate the contextual-compression retriever over all questions and report the aggregate scores.
cc_precision, cc_recall, cc_f1 = evaluate_all_retrievals(retriever_cc)
print(f"Precision:{cc_precision}, Recall: {cc_recall}, F1: {cc_f1}")

Precision:0.6885964912006944, Recall: 0.48679660982292566, F1: 0.5703733281410913


**Ensemble Retriever**

In [12]:
# Evaluate the ensemble retriever over all questions and report the aggregate scores.
ens_precision, ens_recall, ens_f1 = evaluate_all_retrievals(ensemble_retriever)
print(f"Precision:{ens_precision}, Recall: {ens_recall}, F1: {ens_f1}")

Precision:0.8076542546614934, Recall: 0.8779477101845523, F1: 0.8413352835236318


**MultiVectorRetriever**

In [15]:
# NOTE(review): create_retriever is already imported in the retriever-loading cell
# earlier in this notebook; this repeated import is redundant.
from databases.chroma_db_connection import create_retriever

# Inputs for the multi-vector retriever: the strategy-4 chunks plus the text and
# table summaries.
chunks_path = "./data_dir/strategy_4/chunked_elements.pkl"
txt_summaries_path = "./data_dir/strategy_4/4_txt_summaries.pkl"
# NOTE(review): unlike the two paths above, this one has no "./" prefix; it resolves
# to the same directory but is inconsistent.
tbl_summaries_path = "data_dir/strategy_4/4_table_summaries.pkl"
# "similarity" and 5 are presumably the search type and the top-k — TODO confirm
# against databases.chroma_db_connection.
mv_retriever = create_retriever(chunks_path,txt_summaries_path,tbl_summaries_path,"similarity",5)

588 loaded from ./data_dir/strategy_4/chunked_elements.pkl.
87 loaded from data_dir/strategy_4/4_table_summaries.pkl.
501 loaded from ./data_dir/strategy_4/4_txt_summaries.pkl.
221 loaded from data_dir/1_image_summaries.pkl.
268 loaded from data_dir/formula_descriptions.pkl.
588 documents stored.


In [16]:
# Evaluate the multi-vector retriever over all questions and report the aggregate scores.
mv_precision, mv_recall, mv_f1 = evaluate_all_retrievals(mv_retriever)
print(f"Precision:{mv_precision}, Recall: {mv_recall}, F1: {mv_f1}")

Precision:0.9301900584435129, Recall: 0.889180337206653, F1: 0.9092230057283154


****