In [1]:
import pandas as pd
import os, json
from tqdm.auto import tqdm
from openai import OpenAI

#!pip install hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [19]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
#!pip install minsearch
import minsearch

### Ingestion

In [20]:
df_ck = pd.read_csv('cooking_knowledge.csv')
df_ck.insert(0, 'ID', df_ck.index)
df_ck.columns

Index(['ID', 'type', 'question', 'response'], dtype='object')

In [21]:
documents = df_ck.to_dict(orient='records')

In [22]:
index = minsearch.Index(
    text_fields=['type', 'question', 'response'],
    keyword_fields=['ID']
)
index.fit(documents)

<minsearch.Index at 0x1382ec690>

In [23]:
#query = 'give me info on ribs'
#index.search(query, num_results=10)

### RAG FLOW

In [24]:
client = OpenAI()

entry_template = """
    type: {type}
    question: {question}
    answer: {response}
    """.strip()

prompt_template = """
    You're a cooking course instructor. Answer the QUESTION based on the CONTEXT from the cooking knowledge database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

In [None]:
os.environ['OPENAI_API_KEY'] = 'OPENAI_API_KEY'

In [26]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

In [27]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
            
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [28]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
    
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [29]:
query = 'give me info on barbecue'

In [30]:
search_results = search(query)
prompt = build_prompt(query, search_results)

In [31]:
answer = rag(query)
print(answer)

Barbecue is a great cooking method for beef ribs among other meats. To enhance the hearty flavor of beef ribs, it's recommended to accompany them with barbecue sauce and spice rubs. Other techniques for cooking beef ribs include braising and dry-roasting.


### Retrieval Evaluation

In [32]:
df_questions = pd.read_csv('ground_truth.csv')
df_questions.head()

Unnamed: 0,id,question
0,0,Can you explain the HACCP system in detail?
1,0,What steps are involved in implementing HACCP?
2,0,How does HACCP contribute to food safety?
3,0,What are some common hazards identified by HACCP?
4,0,Why is HACCP important for food production?


In [33]:
ground_truth = df_questions.to_dict(orient='records')
ground_truth[0]

{'id': 0, 'question': 'Can you explain the HACCP system in detail?'}

In [34]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

In [35]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
    
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [36]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['ID'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Retrieval 
The basic approach - using non-boosed minsearch

In [37]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/500 [00:00<?, ?it/s]

{'hit_rate': 0.836, 'mrr': 0.640483333333333}

In [38]:
df_validation = df_questions[:200]
df_test = df_questions[200:]

## Finding the best parameters

In [39]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [40]:
gt_val = df_validation.to_dict(orient='records')

In [41]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [42]:
param_ranges = {
    'id': (0.0, 3.0),
    'question': (0.0, 3.0),
    'response': (0.0, 3.0)
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [43]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

({'id': 2.3518356647787306,
  'question': 0.7409743639937042,
  'response': 1.2482714535743846},
 0.7507499999999999)

In [44]:
def minsearch_improved(query):
    boost = {
            'id': 2.7692628744506598,
            'question': 0.3243327894022373,
            'response': 0.5744326020937277}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/500 [00:00<?, ?it/s]

{'hit_rate': 0.856, 'mrr': 0.6647404761904759}

### RAG Evaluation

In [45]:
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{"Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"}}
""".strip()

In [46]:
df_sample = df_questions.sample(n=100, random_state=1)
sample = df_sample.to_dict(orient='records')

In [47]:
evaluations = []

for record in tqdm(sample):
        
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm)
    evaluation = llm(prompt)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [48]:
df_eval = pd.DataFrame(evaluations, columns=['record','answer', 'evaluation'])
df_eval.head()

Unnamed: 0,record,answer,evaluation
0,"{'id': 60, 'question': 'Is microwave cooking s...",Microwave cooking is not safe for all types of...,"{""Relevance"": ""RELEVANT"",\n ""Explanation"": ""T..."
1,"{'id': 68, 'question': 'How does the sous vide...",The sous vide method ensures even doneness by ...,"{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
2,"{'id': 9, 'question': 'What is the ideal tempe...",The ideal temperature for deep frying is betwe...,"{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
3,"{'id': 13, 'question': 'How do you achieve the...","To achieve the pale color in a Fricassee, it i...","{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."
4,"{'id': 95, 'question': 'Are there any tips for...","To achieve flavorful results when braising, it...","{""Relevance"": ""RELEVANT"", ""Explanation"": ""The ..."


In [49]:
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['evaluation'] = df_eval['evaluation'].apply(json.loads)
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,Microwave cooking is not safe for all types of...,60,Is microwave cooking safe for all types of food?,RELEVANT,The generated answer directly addresses the qu...
1,The sous vide method ensures even doneness by ...,68,How does the sous vide method ensure even done...,RELEVANT,The generated answer directly addresses how th...
2,The ideal temperature for deep frying is betwe...,9,What is the ideal temperature for deep frying?,RELEVANT,The generated answer directly addresses the qu...
3,"To achieve the pale color in a Fricassee, it i...",13,How do you achieve the pale color in a Fricassee?,RELEVANT,The generated answer directly addresses the qu...
4,"To achieve flavorful results when braising, it...",95,Are there any tips for achieving flavorful res...,RELEVANT,The generated answer provides specific tips fo...
...,...,...,...,...,...
95,"For best flavor, Navarin should be simmered sl...",2,How long should I simmer Navarin for best flavor?,PARTLY_RELEVANT,The generated answer provides a general guidel...
96,"No, Navarin is traditionally made with lamb. W...",38,Can I use other meats instead of lamb in Navarin?,PARTLY_RELEVANT,The generated answer addresses the question by...
97,"Yes, you can add ingredients other than cream ...",18,Can I add ingredients other than cream or ham ...,RELEVANT,The generated answer directly addresses the qu...
98,"To properly beat egg whites for a soufflé, you...",44,How do you properly beat egg whites for a souf...,RELEVANT,The generated answer directly addresses the qu...


In [50]:
df_eval.relevance.value_counts()

relevance
RELEVANT           85
PARTLY_RELEVANT    11
NON_RELEVANT        4
Name: count, dtype: int64

In [51]:
evaluations_4o = []

for record in tqdm(sample):
        
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm)
    evaluation = llm(prompt)

    evaluations_4o.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [52]:
df_eval = pd.DataFrame(evaluations_4o, columns=['record','answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['evaluation'] = df_eval['evaluation'].apply(json.loads)
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,Microwave cooking is generally safe for many t...,60,Is microwave cooking safe for all types of food?,RELEVANT,The generated answer accurately addresses the ...
1,The sous vide method ensures even doneness by ...,68,How does the sous vide method ensure even done...,RELEVANT,The generated answer directly addresses the qu...
2,The ideal temperature for deep frying is typic...,9,What is the ideal temperature for deep frying?,RELEVANT,The generated answer directly addresses the qu...
3,"To achieve the pale color in a Fricassee, main...",13,How do you achieve the pale color in a Fricassee?,RELEVANT,The generated answer directly addresses the qu...
4,"To achieve flavorful results when braising, it...",95,Are there any tips for achieving flavorful res...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...
95,"For best flavor, Navarin should be simmered sl...",2,How long should I simmer Navarin for best flavor?,PARTLY_RELEVANT,The generated answer provides useful informati...
96,"Navarin is traditionally a lamb stew, and its ...",38,Can I use other meats instead of lamb in Navarin?,RELEVANT,The generated answer directly addresses the qu...
97,"Yes, you can add ingredients other than cream ...",18,Can I add ingredients other than cream or ham ...,RELEVANT,The generated answer directly addresses the qu...
98,"To properly beat egg whites for a soufflé, ens...",44,How do you properly beat egg whites for a souf...,RELEVANT,The generated answer provides a clear and accu...


In [53]:
df_eval.relevance.value_counts()

relevance
RELEVANT           85
PARTLY_RELEVANT    12
NON_RELEVANT        3
Name: count, dtype: int64