In [23]:
import pandas as pd
import os, json
from tqdm.auto import tqdm
from openai import OpenAI

!pip install hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope



In [24]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
#!pip install minsearch

import minsearch

### Ingestion

In [25]:
df_ck = pd.read_csv('../data/cooking_knowledge.csv')
df_ck.insert(0, 'ID', df_ck.index)
df_ck.columns

Index(['ID', 'type', 'question', 'response'], dtype='object')

In [26]:
documents = df_ck.to_dict(orient='records')

index = minsearch.Index(
    text_fields=['type', 'question', 'response'],
    keyword_fields=['ID']
)
index.fit(documents)

<minsearch.minsearch.Index at 0x15e149350>

### RAG FLOW

In [None]:
# os.environ['OPENAI_API_KEY'] = 'OPENAI_API_KEY' # For testing purposes

In [28]:
client = OpenAI()

entry_template = """
    type: {type}
    question: {question}
    answer: {response}
    """.strip()

prompt_template = """
    You're a cooking course instructor. Answer the QUESTION based on the CONTEXT from the cooking knowledge database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

In [None]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
            
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
    
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [101]:
query = 'give me info on parmesan'

In [102]:
#search_results = search(query)
#prompt = build_prompt(query, search_results)

answer = rag(query)
print(answer)

Parmesan cheese pairs well with a variety of ingredients, including basil, pasta, olive oil, figs, grapes, honey, mushrooms, walnuts, and aged balsamic vinegar. These pairings are staples in Italian cooking. Additionally, for a refined meal starter, you can serve Parmigiano-Reggiano with Prosecco, which offers a palate-cleansing and elegant experience.


### Retrieval Evaluation

In [76]:
df_questions = pd.read_csv('../data/ground_truth_2.csv')
df_questions.head()

Unnamed: 0,id,question
0,0,Can you explain the purpose of HACCP in food s...
1,0,How does HACCP identify hazards in food produc...
2,0,What are the key principles of the HACCP system?
3,0,Why is it important to control hazards in cook...
4,0,How can I implement HACCP in my kitchen?


In [77]:
ground_truth = df_questions.to_dict(orient='records')
ground_truth[0]

{'id': 0, 'question': 'Can you explain the purpose of HACCP in food safety?'}

In [78]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
    
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['ID'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Retrieval 
The basic approach - using non-boosed minsearch

In [79]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/5000 [00:00<?, ?it/s]

{'hit_rate': 0.8016, 'mrr': 0.5945765873015884}

In [80]:
df_validation = df_questions[:2000]
df_test = df_questions[2000:]

## Finding the best parameters

In [81]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

def minsearch_search_boost(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

def objective(boost_params):
    def search_function(q):
        return minsearch_search_boost(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [82]:
gt_val = df_validation.to_dict(orient='records')

param_ranges = {
    'id': (0.0, 3.0),
    'question': (0.0, 3.0),
    'response': (0.0, 3.0)
}

simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

({'id': 1.3712380633786596,
  'question': 1.2123315926140017,
  'response': 2.569534561384091},
 0.6277128968253967)

In [83]:
def minsearch_improved(query):
    boost = {
            'id': 1.3712380633786596,
            'question': 1.2123315926140017,
            'response': 2.569534561384091
        }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/5000 [00:00<?, ?it/s]

{'hit_rate': 0.8422, 'mrr': 0.6336352380952389}

### RAG Evaluation

In [84]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{"Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"}}
""".strip()

In [85]:
df_questions.shape

(5000, 2)

In [88]:
df_sample = df_questions.sample(n=100, random_state=1)
sample = df_sample.to_dict(orient='records')

In [89]:
evaluations = []

for record in tqdm(sample):
        
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm)
    evaluation = llm(prompt)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [90]:
df_eval = pd.DataFrame(evaluations, columns=['record','answer', 'evaluation'])
df_eval.head()

Unnamed: 0,record,answer,evaluation
0,"{'id': 552, 'question': 'Are there any tips fo...","To keep grilled chicken moist, consider brinin...","{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
1,"{'id': 953, 'question': 'Is Parmigiano Reggian...","Yes, Parmigiano Reggiano is suitable for sandw...","{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
2,"{'id': 762, 'question': 'Can you suggest a sim...",A simple dish using Vacherin cheese could be t...,"{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
3,"{'id': 699, 'question': 'Are there any health ...",The provided context does not explicitly menti...,"{""Relevance"": ""NON_RELEVANT"", ""Explanation"": ""..."
4,"{'id': 547, 'question': 'What is the grilling ...",The grilling temperature range for shrimp is n...,"{""Relevance"": ""PARTLY_RELEVANT"", \n ""Explanati..."


In [91]:
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['evaluation'] = df_eval['evaluation'].apply(json.loads)
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,"To keep grilled chicken moist, consider brinin...",552,Are there any tips for grilling chicken to kee...,RELEVANT,The generated answer provides specific tips fo...
1,"Yes, Parmigiano Reggiano is suitable for sandw...",953,Is Parmigiano Reggiano suitable for sandwiches?,RELEVANT,The generated answer directly addresses the qu...
2,A simple dish using Vacherin cheese could be t...,762,Can you suggest a simple dish using Vacherin c...,RELEVANT,The generated answer provides a specific and s...
3,The provided context does not explicitly menti...,699,Are there any health benefits associated with ...,NON_RELEVANT,The generated answer does not address the ques...
4,The grilling temperature range for shrimp is n...,547,What is the grilling temperature range for shr...,PARTLY_RELEVANT,The generated answer provides some relevant in...
...,...,...,...,...,...
95,Aging meat improves its tenderness primarily d...,65,How does aging meat improve its tenderness?,RELEVANT,The generated answer directly addresses the qu...
96,"Yes, Saigon cinnamon is suitable for making ga...",618,Is Saigon cinnamon suitable for making ganaches?,RELEVANT,The generated answer directly addresses the qu...
97,The first step in preparing a basic omelet is ...,56,What is the first step in preparing a basic om...,RELEVANT,The generated answer directly addresses the qu...
98,"To implement HACCP in your kitchen, follow the...",138,How do I implement HACCP in my kitchen?,RELEVANT,The generated answer directly addresses the qu...


In [92]:
df_eval.relevance.value_counts()

relevance
RELEVANT           90
PARTLY_RELEVANT     7
NON_RELEVANT        3
Name: count, dtype: int64

In [93]:
evaluations_4o = []

for record in tqdm(sample):
        
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm)
    evaluation = llm(prompt)

    evaluations_4o.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [96]:
df_eval = pd.DataFrame(evaluations_4o, columns=['record','answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['evaluation'] = df_eval['evaluation'].apply(json.loads)
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,"Yes, there is a tip that involves brining to k...",552,Are there any tips for grilling chicken to kee...,RELEVANT,The generated answer directly addresses the qu...
1,"Yes, Parmigiano Reggiano is suitable for sandw...",953,Is Parmigiano Reggiano suitable for sandwiches?,RELEVANT,The generated answer directly addresses the qu...
2,A simple dish using Vacherin cheese is to serv...,762,Can you suggest a simple dish using Vacherin c...,RELEVANT,The generated answer directly addresses the qu...
3,The provided context does not explicitly menti...,699,Are there any health benefits associated with ...,PARTLY_RELEVANT,The generated answer acknowledges the lack of ...
4,The context provided does not specify the exac...,547,What is the grilling temperature range for shr...,RELEVANT,The generated answer provides a reasonable tem...
...,...,...,...,...,...
95,Aging meat improves its tenderness primarily d...,65,How does aging meat improve its tenderness?,RELEVANT,The generated answer directly addresses the qu...
96,"Yes, Saigon cinnamon is suitable for making ga...",618,Is Saigon cinnamon suitable for making ganaches?,RELEVANT,The generated answer directly addresses the qu...
97,The first step in preparing a basic omelet is ...,56,What is the first step in preparing a basic om...,RELEVANT,The generated answer directly addresses the qu...
98,Implementing HACCP in your kitchen involves se...,138,How do I implement HACCP in my kitchen?,RELEVANT,The generated answer provides a comprehensive ...


In [97]:
df_eval.relevance.value_counts()

relevance
RELEVANT           87
PARTLY_RELEVANT    12
NON_RELEVANT        1
Name: count, dtype: int64