In [None]:
!pip install ragatouille langchain-openai langchain-core langchain



In [None]:
import pandas as pd
from langchain_core.prompts import FewShotPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from dotenv import load_dotenv
load_dotenv()


# Load the formulas dataset from the Colab working directory.
dataset = pd.read_csv("/content/formulas_dataset (1).csv")

# True for rows whose "upgraded_answer" is already filled in.
has_upgraded_answer = dataset["upgraded_answer"].notna()

# Rows with an answer: used as few-shot examples.
filtered_dataset = dataset.loc[has_upgraded_answer]

# Rows without an answer: these still need to be filled.
nan_upgraded_answer = dataset.loc[~has_upgraded_answer]

# System instruction for the question-generation prompt.
template = """ Your task is to create appropriate question based on the provided context and formula, where the answer of the created question is going to be provided formula.
"""

# Build both example lists in one pass over the rows that already have an
# upgraded answer: `examples` keeps the raw fields, `examples_for_fsp` holds
# the human/ai message pairs fed to the few-shot prompt.
examples = []
examples_for_fsp = []
for row in filtered_dataset.to_dict(orient="records"):
    example = {
        "context": row["text_chunk"],
        "formula": row["formula_summary"],
        "question": row["question"],
    }
    examples.append(example)
    examples_for_fsp.append(
        {
            "input": f"Context:\n{example['context']}\nFormula:{example['formula']}",
            "output": example["question"],
        }
    )

# One few-shot example is rendered as a human message followed by the ai reply.
example_messages = [("human", "{input}"), ("ai", "{output}")]
example_prompt = ChatPromptTemplate.from_messages(example_messages)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples_for_fsp,
)

# Full prompt: system instruction, then the few-shot examples, then the new input.
final_messages = [
    ("system", template),
    few_shot_prompt,
    ("human", "{input}"),
]
final_prompt = ChatPromptTemplate.from_messages(final_messages)


# The API key is read from the environment (populated by load_dotenv() above).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
chat = ChatOpenAI(temperature=0, api_key=OPENAI_API_KEY)

# Prompt -> chat model -> plain string output.
chain = final_prompt | chat | StrOutputParser()

# Generate one question per row that is still missing an upgraded answer,
# using the same "Context/Formula" layout as the few-shot examples.
questions = []
for row in nan_upgraded_answer.to_dict(orient="records"):
    prompt_input = f"Context:\n{row['text_chunk']}\nFormula:{row['formula_summary']}"
    questions.append(chain.invoke({"input": prompt_input}))


In [None]:
# Questions generated by the model, attached column-wise to the rows that lacked an upgraded answer.
generated_questions = pd.Series(questions, name="new_questions")
questions_for_new_formulas = pd.concat(
    [nan_upgraded_answer.reset_index(drop=True), generated_questions],
    axis=1,
)

In [None]:
# System instruction for the question+formula -> full-sentence answer prompt.
template_qa = """Your task is to create concatenation of question and answer based on the provided question and answer. The answer is provided as the mathematical formula
which gives an answer of the provided question.
"""

# Few-shot pairs taken from the rows that already have an upgraded answer:
# input is the question plus its formula, output is the upgraded answer.
examples_for_q_a_concatenation = []
for row in filtered_dataset.to_dict(orient="records"):
    examples_for_q_a_concatenation.append(
        {
            "input": f"Question:{row['question']}\nFormula:{row['formula_summary']}",
            "output": row["upgraded_answer"],
        }
    )

# One few-shot example is rendered as a human message followed by the ai reply.
qa_example_messages = [("human", "{input}"), ("ai", "{output}")]
examples_for_qa_concatenation_prompt = ChatPromptTemplate.from_messages(qa_example_messages)

few_shot_prompt_qa = FewShotChatMessagePromptTemplate(
    example_prompt=examples_for_qa_concatenation_prompt,
    examples=examples_for_q_a_concatenation,
)

# Full prompt: system instruction, then the few-shot examples, then the new input.
qa_messages = [
    ("system", template_qa),
    few_shot_prompt_qa,
    ("human", "{input}"),
]
final_prompt_q_a = ChatPromptTemplate.from_messages(qa_messages)

# The API key is read from the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

chat = ChatOpenAI(temperature=0, api_key=OPENAI_API_KEY)

# Prompt -> chat model -> plain string output.
chain_qa = final_prompt_q_a | chat | StrOutputParser()

# Generate one answer per row of questions_for_new_formulas.
# - The question comes from "new_questions" (the model-generated question),
#   because that is the question stored next to this answer in the final
#   dataset; reading the "question" column here would pair each answer with
#   a different question than the one it was written for.
# - The input uses the same "Question:<q>\nFormula:<f>" layout as the few-shot
#   examples above, so the model sees one consistent format.
upgraded_answers = [
    chain_qa.invoke(
        {"input": f"Question:{row['new_questions']}\nFormula:{row['formula_summary']}"}
    )
    for row in questions_for_new_formulas.to_dict(orient="records")
]


In [None]:
# Two-column frame pairing each generated question with its generated answer.
result_dataset = pd.DataFrame(
    {
        "question": questions_for_new_formulas["new_questions"].reset_index(drop=True),
        "answer": pd.Series(upgraded_answers, name="answer"),
    }
)

In [None]:
# Hand-written examples, reduced to the same question/answer schema as result_dataset.
original_few_shot_examples = filtered_dataset[["question", "upgraded_answer"]].rename(
    columns={"upgraded_answer": "answer"}
)

In [None]:
# Stack the hand-written examples on top of the generated ones.
# ignore_index=True renumbers the rows 0..n-1; without it both parts keep
# their own 0-based labels and the combined frame has duplicate index values.
dataset_for_model = pd.concat(
    [original_few_shot_examples, result_dataset],
    axis=0,
    ignore_index=True,
)

In [None]:
# Display the combined question/answer table.
dataset_for_model

Unnamed: 0,question,answer
0,What is the formula for the support vector cla...,The formula for the support vector classifier ...
1,What is the formula for calculating the Bayes ...,The formula for calculating the Bayes error ra...
2,What is the purpose of the tuning parameter \(...,The purpose of the tuning parameter \( \lambda...
3,How do you calculate the posterior probability...,The posterior probability that a given observa...
4,What is the function F(X) derived in the conte...,The function F(X) derived in the context of us...
...,...,...
197,What is the formula for calculating the probab...,The formula for calculating the probability of...
198,What is the formulation of the loss function i...,The relationship between the loss function L(X...
199,How is the variance of a dataset calculated in...,The Proportion of Variance Explained (PVE) in ...
200,What is the formula for approximating the func...,The significance of the formula for the approx...


In [None]:
# Save the combined dataset (without the index column) to the current working directory.
# NOTE(review): a later cell reads "/content/final_formuals_dataset.csv", so this
# presumably relies on the working directory being /content — confirm.
dataset_for_model.to_csv("final_formuals_dataset.csv", index=False)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the %cp cells further down copy the
# checkpoint and the index into /content/drive/MyDrive/...
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Reload the saved question/answer dataset.
dataset_for_model = pd.read_csv("/content/final_formuals_dataset.csv")
formulas_list = dataset_for_model.to_dict(orient="records")

# q_a_dataset: (question, answer) pairs; full_corpus: the answers only.
q_a_dataset = []
full_corpus = []
for record in formulas_list:
    q_a_dataset.append((record["question"], record["answer"]))
    full_corpus.append(record["answer"])

In [None]:
from ragatouille import RAGPretrainedModel,RAGTrainer

# Trainer that fine-tunes the pretrained ColBERT checkpoint.
trainer = RAGTrainer(
    model_name="colbert-formulas-model-5",
    pretrained_model_name="colbert-ir/colbertv2.0",
)

# Training data is built from the (question, answer) pairs, with the full list
# of answers passed as the document collection.
trainer.prepare_training_data(
    raw_data=q_a_dataset,
    data_out_path="./data/",
    all_documents=full_corpus,
)

# Hyperparameters for this run, collected in one place.
training_options = dict(
    batch_size=2,
    dim=128,
    doc_maxlen=256,
    use_relu=False,
    learning_rate=2.3e-5,
    nbits=2,
    maxsteps=540000,
)
trainer.train(**training_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...
Building hard negative index for 240 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated
#> Starting...
#> Joined...


In [None]:
# NOTE(review): this checkpoint path contains what looks like a run timestamp
# (2024-10/16/08.25.43), so it presumably has to be updated after each new
# training run — confirm.
model_path = "/content/.ragatouille/colbert/none/2024-10/16/08.25.43/checkpoints/colbert/"

# Load the checkpoint stored at model_path.
RAG = RAGPretrainedModel.from_pretrained(model_path)

# Build the index "colbert-formulas-index-5" over the list of answers.
RAG.index(
    collection=full_corpus,
    index_name="colbert-formulas-index-5",
    max_document_length = 256,
    split_documents = False,
)

This is a behaviour change from RAGatouille 0.8.0 onwards.
This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations.
If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour.
--------------------


[Oct 16, 08:28:36] #> Creating directory .ragatouille/colbert/indexes/colbert-formulas-index-5 




  self.scaler = torch.cuda.amp.GradScaler()


[Oct 16, 08:28:37] [0] 		 #> Encoding 243 passages..


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


[Oct 16, 08:28:37] [0] 		 avg_doclen_est = 82.60082244873047 	 len(local_sample) = 243
[Oct 16, 08:28:37] [0] 		 Creating 2,048 partitions.
[Oct 16, 08:28:37] [0] 		 *Estimated* 20,071 embeddings.
[Oct 16, 08:28:37] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/colbert-formulas-index-5/plan.json ..


  sub_sample = torch.load(sub_sample_path)


used 16 iterations (0.2325s) to cluster 19069 items into 2048 clusters
[Oct 16, 08:28:37] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


[Oct 16, 08:28:38] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')


[0.036, 0.031, 0.03, 0.033, 0.031, 0.034, 0.032, 0.03, 0.032, 0.032, 0.031, 0.033, 0.033, 0.031, 0.033, 0.032, 0.03, 0.031, 0.034, 0.031, 0.031, 0.035, 0.031, 0.035, 0.031, 0.03, 0.033, 0.031, 0.03, 0.033, 0.03, 0.028, 0.034, 0.029, 0.031, 0.029, 0.031, 0.034, 0.03, 0.037, 0.036, 0.032, 0.032, 0.031, 0.031, 0.029, 0.03, 0.038, 0.035, 0.032, 0.033, 0.032, 0.03, 0.031, 0.031, 0.036, 0.042, 0.032, 0.034, 0.033, 0.032, 0.031, 0.03, 0.034, 0.035, 0.033, 0.036, 0.032, 0.03, 0.032, 0.032, 0.035, 0.036, 0.029, 0.03, 0.034, 0.034, 0.032, 0.032, 0.035, 0.032, 0.033, 0.031, 0.033, 0.032, 0.035, 0.032, 0.035, 0.03, 0.032, 0.033, 0.032, 0.031, 0.031, 0.03, 0.031, 0.032, 0.032, 0.032, 0.031, 0.031, 0.034, 0.031, 0.033, 0.031, 0.029, 0.031, 0.032, 0.032, 0.028, 0.032, 0.035, 0.035, 0.029, 0.033, 0.031, 0.035, 0.032, 0.034, 0.03, 0.032, 0.035, 0.034, 0.034, 0.032, 0.033, 0.033, 0.032]


0it [00:00, ?it/s]

[Oct 16, 08:28:38] [0] 		 #> Encoding 243 passages..


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
1it [00:00,  1.98it/s]
  return torch.load(codes_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 615.00it/s]

[Oct 16, 08:28:38] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 16, 08:28:38] #> Building the emb2pid mapping..
[Oct 16, 08:28:38] len(emb2pid) = 20072



100%|██████████| 2048/2048 [00:00<00:00, 43073.08it/s]

[Oct 16, 08:28:38] #> Saved optimized IVF to .ragatouille/colbert/indexes/colbert-formulas-index-5/ivf.pid.pt
Done indexing!





'.ragatouille/colbert/indexes/colbert-formulas-index-5'

In [None]:
# Query the index with a single example question and display the results.
RAG.search("Retrieve to me formula for support vector classifier using polynomial kernel?")
# Alternative example queries:
#Retrieve to me formula for Multivariate Gaussian distribution.
#Retrieve to me formula for BIC.
#Retrieve to me formula for MSE (mean squared error).
# Retrieve to me formula for support vector classifier using polynomial kernel?
# Formula for the output of a neuron.

Loading searcher for index colbert-formulas-index-5 for the first time... This may take a few seconds
[Oct 16, 08:28:44] #> Loading codec...
[Oct 16, 08:28:44] #> Loading IVF...
[Oct 16, 08:28:44] #> Loading doclens...


  self.scaler = torch.cuda.amp.GradScaler()
  ivf, ivf_lengths = torch.load(os.path.join(self.index_path, "ivf.pid.pt"), map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 1737.49it/s]

[Oct 16, 08:28:44] #> Loading codes and residuals...



  return torch.load(residuals_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 492.35it/s]

Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Retrieve to me formula for support vector classifier using polynomial kernel?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1, 12850,  2000,  2033,  5675,  2005,  2490,  9207,  2465,
        18095,  2478, 17505, 16293,  1029,   102,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')






[{'content': 'The formula for the support vector classifier (SVC) using a polynomial kernel, as described in Equation 9.22 in the context of machine learning, is: K of x sub i and x sub i prime equals one plus the sum from j equals one to p of x sub i j times x sub i prime j, all raised to the power of d.',
  'score': 23.09375,
  'rank': 1,
  'document_id': '730cf306-af09-413f-bf21-e1931a51130b',
  'passage_id': 0},
 {'content': 'The formula for the support vector classifier (SVC) using a polynomial kernel, as described in Equation 9.22 in the context of machine learning, is: K of x sub i and x sub i prime equals one plus the sum from j equals one to p of x sub i j times x sub i prime j, all raised to the power of d.',
  'score': 23.09375,
  'rank': 2,
  'document_id': 'fc9931c8-c116-4a0e-b64a-62bbade39abf',
  'passage_id': 208},
 {'content': 'The formula for the piecewise cubic polynomial regression discussed in the section on regression splines in the machine learning book is: Formul

In [None]:
# Copy the checkpoint directory to Google Drive.
%cp -r /content/.ragatouille/colbert/none/2024-10/16/08.25.43/checkpoints/colbert/ /content/drive/MyDrive/project_work/code/formulas_retrieval/FormulasColBERTCheckpointsFinal/

In [None]:
# Copy the index directory to Google Drive.
%cp -r /content/.ragatouille/colbert/indexes/colbert-formulas-index-5/ /content/drive/MyDrive/project_work/code/formulas_retrieval/FormulasColBERTIndexFinal/

**Corpus**

In [None]:
# Corpus keyed by str(document id): for every (pid, document id) pair stored on
# the model, look up the passage text at position `pid` in full_corpus.
pid_to_doc_id = vars(RAG.model)["pid_docid_map"]
corpus_dict = {
  str(doc_id): {"text": full_corpus[pid], "title": ""}
  for pid, doc_id in pid_to_doc_id.items()
}

**List of queries**

In [None]:
# Query id (row position as a string) -> question text.
queries = {}
for position, question_text in enumerate(dataset_for_model["question"]):
  queries[str(position)] = question_text

In [None]:
def find_doc_id(element:str, corpus:dict):
  """Return the key of the first corpus entry whose "text" equals `element`.

  Args:
    element: text to look for.
    corpus: mapping of document id -> dict that has a "text" entry.

  Returns:
    The first matching key in iteration order, or None when nothing matches.
  """
  matching_ids = (doc_id for doc_id, entry in corpus.items() if entry["text"] == element)
  return next(matching_ids, None)

# Document id for each answer, in dataset row order. find_doc_id returns the
# first corpus entry with identical text, so answers that appear more than once
# all resolve to the same id, and an answer with no match yields None.
answer_doc_ids = [find_doc_id(desc,corpus_dict) for desc in dataset_for_model["answer"] ]

**Qrels dictionary**

In [None]:
# Relevance judgements: every query id maps to its single relevant document id
# with relevance 1, i.e. {"<query id>": {"<document id>": 1}}.
qrels_dict = {}
for question_id, document_id in zip(queries, answer_doc_ids):
  qrels_dict[str(question_id)] = {str(document_id): 1}

**Search on the training set**

In [None]:
# Run all questions through the index at once, requesting 10 results per query (k=10).
search_results = RAG.search(list(queries.values()),k=10)

243it [00:01, 124.91it/s]


In [None]:
def normalize_data(search_results):
  """Min-max normalise the scores of one query's search results.

  Args:
    search_results: list of result dicts, each with a "score" and a
      "document_id" entry.

  Returns:
    Dict mapping str(document_id) to the rescaled score: the lowest score
    maps to 0 and the highest to just under 1. Returns an empty dict when
    `search_results` is empty.
  """
  if not search_results:
    # No results for this query: nothing to normalise.
    return {}

  # Take min/max over all scores rather than trusting the first/last position,
  # so the result does not depend on the input being sorted.
  all_scores = [result["score"] for result in search_results]
  min_score = min(all_scores)
  # The small epsilon keeps the division defined when every score is equal.
  score_range = max(all_scores) - min_score + 1e-10

  return {
    str(result["document_id"]): (result["score"] - min_score) / score_range
    for result in search_results
  }
# Normalised scores for the first query only; `scores` is not referenced again in the visible cells.
scores = normalize_data(search_results[0])

In [None]:
# Query id -> {document id: normalised score}, pairing each query id with its result list by position.
results_of_retrieval = {
  str(query_id): normalize_data(query_results)
  for query_id, query_results in zip(queries, search_results)
}

**Evaluation**

In [None]:
!pip install beir



In [None]:
from beir.retrieval.evaluation import EvaluateRetrieval
# Score the retrieval results against the qrels at cut-offs 1, 3, 5 and 10.
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels_dict, results_of_retrieval, k_values=[1, 3, 5, 10])

In [None]:
# Print each metric dict on its own line: MAP, recall, precision, then NDCG.
for metric_scores in (_map, recall, precision, ndcg):
  print(metric_scores)

{'MAP@1': 0.84362, 'MAP@3': 0.90466, 'MAP@5': 0.90775, 'MAP@10': 0.9088}
{'Recall@1': 0.84362, 'Recall@3': 0.97531, 'Recall@5': 0.98765, 'Recall@10': 0.99588}
{'P@1': 0.84362, 'P@3': 0.3251, 'P@5': 0.19753, 'P@10': 0.09959}
{'NDCG@1': 0.84362, 'NDCG@3': 0.92294, 'NDCG@5': 0.92825, 'NDCG@10': 0.93086}
