In [45]:
import json
import time
import pandas as pd

#### Ingestion

In [3]:
df = pd.read_csv('../data/data.csv')

In [4]:
df.head()

Unnamed: 0,id,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instructions
0,0,Push-Ups,Strength,Bodyweight,Upper Body,Push,"Pectorals, Triceps, Deltoids",Start in a high plank position with your hands...
1,1,Squats,Strength,Bodyweight,Lower Body,Push,"Quadriceps, Glutes, Hamstrings",Stand with feet shoulder-width apart. Lower yo...
2,2,Plank,Strength/Mobility,Bodyweight,Core,Hold,"Rectus Abdominis, Transverse Abdominis",Start in a forearm plank position with your el...
3,3,Deadlift,Strength,Barbell,Lower Body,Pull,"Glutes, Hamstrings, Lower Back","Stand with feet hip-width apart, barbell in fr..."
4,4,Bicep Curls,Strength,Dumbbells,Upper Body,Pull,"Biceps, Forearms","Stand with a dumbbell in each hand, arms fully..."


In [5]:
documents = df.to_dict(orient='records')

In [6]:
documents[0]

{'id': 0,
 'exercise_name': 'Push-Ups',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Bodyweight',
 'body_part': 'Upper Body',
 'type': 'Push',
 'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
 'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'}

In [7]:
import minsearch

In [8]:
# Build the search index over the exercise records: every descriptive column is
# registered as a text field and `id` as a keyword field.
# NOTE(review): presumably keyword fields are only used for exact-match
# filtering via `filter_dict` — confirm against the minsearch documentation.
index = minsearch.Index(
    text_fields=['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields=['id']
)

In [9]:
index.fit(documents)

<minsearch.Index at 0x7c3a112881d0>

#### RAG flow

In [10]:
import os
from groq import Groq

In [11]:
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [12]:
def search(query):
    """Return up to 10 index hits for `query`, with no filters and no field boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [13]:
# Top-level prompt sent to the answering model; `{question}` and `{context}` are
# filled in by `build_prompt`.
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Rendering of a single retrieved exercise record inside the CONTEXT section.
entry_template = """
exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}
""".strip()

def build_prompt(query, search_results):
    """Render the final prompt for `query` from the retrieved documents.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts; each must provide every field named
            in `entry_template` (extra keys such as ``id`` are ignored).

    Returns:
        The prompt string, with entries separated by a blank line and no
        leading or trailing whitespace.

    Raises:
        KeyError: If a document lacks a field required by `entry_template`.
    """
    # Join once instead of growing a string in the loop; the outer strip()
    # means the result matches the old "entry + blank line" accumulation.
    context = "\n\n".join(entry_template.format(**doc) for doc in search_results)

    return prompt_template.format(question=query, context=context).strip()


In [14]:
def llm(prompt, model='llama3-8b-8192'):
    """Send `prompt` as a single user message to the Groq chat API.

    Args:
        prompt: Full text of the user message.
        model: Model identifier passed to the API. Defaults to the model this
            notebook previously hard-coded, so existing `llm(prompt)` calls
            behave as before.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [15]:
def rag(query, model='gpt-4o-mini'):
    """Answer `query` by retrieving exercises, building a prompt and calling the LLM.

    NOTE(review): `model` is accepted but never used in this function; the
    call to `llm` does not receive it.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [16]:
question = 'Is the Lat Pulldown considered a strength training activity, and if so, why?'
answer = rag(question)
print(answer)

Based on the context, the Lat Pulldown is considered a strength training activity.


#### Retrieval evaluation

In [17]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [18]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the starting position for doing push-ups?
1,0,Which muscle groups are activated during push-...
2,0,How do you know when to push back up while doi...
3,0,Do you need any equipment to perform push-ups?
4,0,What part of the body do push-ups primarily ta...


In [19]:
ground_truth = df_question.to_dict(orient='records')

In [20]:
ground_truth[0]

{'id': 0, 'question': 'What is the starting position for doing push-ups?'}

In [21]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One list of booleans per query, in rank order, where
            True marks a result that matches the expected document.

    Returns:
        A float in [0, 1]; 0.0 when `relevance_total` is empty (instead of
        raising ZeroDivisionError).
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query, in rank order, where
            True marks a result that matches the expected document.

    Returns:
        The average over queries of 1 / (rank of the first relevant result),
        counting 0 for queries with no relevant result. Returns 0.0 when
        `relevance_total` is empty.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # list with several hits would push the score above 1.
                break

    return total_score / len(relevance_total)

In [22]:
def minsearch_search(query, boost=None):
    """Return up to 10 index hits for `query`.

    Args:
        query: Free-text search string.
        boost: Optional mapping of field name to boost weight, forwarded as
            `boost_dict`. Defaults to no boosting, which is what this function
            did before the parameter existed.

    Returns:
        Whatever `index.search` returns for the query.
    """
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [23]:
def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth records.

    Each record is passed whole to `search_function`; a returned document is
    counted as relevant when its 'id' equals the record's 'id'.

    Returns:
        A dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    relevance_total = [
        [hit['id'] == record['id'] for hit in search_function(record)]
        for record in tqdm(ground_truth)
    ]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [24]:
from tqdm.auto import tqdm

In [25]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9478260869565217, 'mrr': 0.8227612913120158}

#### Finding the best parameters

In [26]:
# Positional split: the first 100 ground-truth rows are used for tuning, the
# rest form the test split.
# NOTE(review): the rows are not shuffled first. The head() preview shows
# consecutive rows sharing the same id, so the validation slice may cover only
# a handful of documents — TODO confirm and consider sampling instead.
df_validation = df_question[:100]
df_test = df_question[100:]

In [27]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximises `objective_function`.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)``
            pair. If both bounds are ints the parameter is drawn with
            ``randint`` (inclusive); otherwise with ``uniform``.
        objective_function: Callable taking a dict of sampled parameters and
            returning a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the search is reproducible; when None,
            the module-level ``random`` functions are used as before.

    Returns:
        ``(best_params, best_score)``. With ``n_iterations <= 0`` nothing is
        evaluated and the result is ``(None, float('-inf'))``.
    """
    # A private generator makes seeded runs repeatable without touching the
    # global random state.
    rng = random.Random(seed) if seed is not None else random

    best_params = None
    best_score = float('-inf')  # maximising, so any real score beats this

    for _ in range(n_iterations):
        # Draw one candidate value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only if it strictly improves on the best so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [28]:
gt_val = df_validation.to_dict(orient='records')

In [29]:
def minsearch_search(query, boost=None):
    """Return up to 10 index hits for `query`, optionally boosting fields.

    `boost` maps field names to weights and is forwarded as `boost_dict`;
    None means no boosting.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [30]:
# Search space for the boost optimisation: every text field gets the same
# continuous range of 0.0 to 3.0.
param_ranges = {
    field: (0.0, 3.0)
    for field in (
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    )
}

def objective(boost_params):
    """Return the MRR on the validation questions for the given field boosts."""
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return metrics['mrr']

In [31]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.9941249333201334,
  'type_of_activity': 0.23861327432056145,
  'type_of_equipment': 0.10772112204503947,
  'body_part': 1.3619845363234218,
  'type': 2.3930789854162255,
  'muscle_groups_activated': 1.5763234239197699,
  'instructions': 1.770016102832984},
 0.8411666666666666)

In [32]:
def minsearch_improved(query):
    """Return up to 10 index hits for `query` using fixed per-field boosts.

    NOTE(review): the weights are hard-coded, presumably copied from a
    `simple_optimize` run. Several differ from the saved output of the
    optimisation cell, so confirm which run they came from.
    """
    field_boosts = dict(
        exercise_name=2.9,
        type_of_activity=2.3,
        type_of_equipment=1.02,
        body_part=0.12,
        type=2.68,
        muscle_groups_activated=2.63,
        instructions=1.76,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        num_results=10,
    )

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9526570048309179, 'mrr': 0.9050854995782533}

#### RAG evaluation

In [33]:
# Prompt for the LLM-as-judge step: it is filled with `{question}` and
# `{answer_llm}` and asks for a JSON verdict with "Relevance" and "Explanation".
# The doubled braces are str.format escapes, so the rendered prompt shows a
# literal JSON object.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [34]:
len(ground_truth)

1035

In [38]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [40]:
print(answer_llm)

Based on the context, the starting position for doing push-ups is:

**Start in a high plank position with your hands under your shoulders.**

This is described in the instructions for the exercise "Push-Ups" under the context.


In [41]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the starting position for doing push-ups?
Generated Answer: Based on the context, the starting position for doing push-ups is:

**Start in a high plank position with your hands under your shoulders.**

This is described in the instructions for the exercise "Push-Ups" under the context.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [43]:
df_sample = df_question.sample(n=200, random_state=1)

In [44]:
sample = df_sample.to_dict(orient='records')

In [46]:
# Upper bound on judge attempts per record, so a persistent failure (for
# example a reply that is never valid JSON) cannot keep the loop spinning forever.
MAX_EVAL_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 10


def _parse_evaluation(raw):
    """Parse the judge's reply and check it carries the fields read later.

    Raises:
        ValueError: If `raw` is not valid JSON (json.JSONDecodeError is a
            ValueError) or is not an object with both "Relevance" and
            "Explanation" keys.
    """
    parsed = json.loads(raw)
    if not isinstance(parsed, dict) or not {'Relevance', 'Explanation'} <= parsed.keys():
        raise ValueError(f"unexpected evaluation payload: {raw!r}")
    return parsed


evaluations = []      # (record, answer_llm, parsed evaluation) for each judged record
skipped_records = []  # records for which no usable verdict was obtained

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
        try:
            evaluation = _parse_evaluation(llm(prompt))
        except Exception as e:
            # Best-effort: API errors and malformed replies are both retried.
            print(f"Error processing document : {e}")
            if attempt < MAX_EVAL_ATTEMPTS:
                print(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            evaluations.append((record, answer_llm, evaluation))
            break
    else:
        # Every attempt failed: record the miss and move on instead of hanging.
        print(f"Giving up on record id={record['id']} after {MAX_EVAL_ATTEMPTS} attempts")
        skipped_records.append(record)

  0%|          | 0/200 [00:00<?, ?it/s]

Error processing document : Expecting ',' delimiter: line 3 column 214 (char 242)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 227 (char 260)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 267 (char 302)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 430 (char 465)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 269 (char 297)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 387 (char 415)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 225 (char 251)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 293 (char 321)
Retrying in 10 seconds...
Error processing document : Expecting ',' delimiter: line 3 column 258 (char 286)
Retrying in 10 seconds...
Error processing document : 

In [48]:
evaluations[0]

({'id': 171,
  'question': 'What is the primary muscle group activated during the Banded Pull-Up?'},
 'Based on the provided context, the primary muscle group activated during the Banded Pull-Up is the Latissimus Dorsi, along with the Biceps.',
 {'Relevance': 'RELEVANT',
  'Explanation': 'The generated answer directly addresses the question, providing the primary muscle group(s) activated during the Banded Pull-Up, which is the main focus of the inquiry.'})

In [49]:
# Flatten each (record, answer, evaluation) triple into one row. The column
# order is: answer, id, question, relevance, explanation.
df_eval = pd.DataFrame(
    [
        {
            'answer': answer,
            'id': record['id'],
            'question': record['question'],
            'relevance': evaluation['Relevance'],
            'explanation': evaluation['Explanation'],
        }
        for record, answer, evaluation in evaluations
    ],
    columns=['answer', 'id', 'question', 'relevance', 'explanation'],
)

In [50]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.765
PARTLY_RELEVANT    0.200
NON_RELEVANT       0.035
Name: proportion, dtype: float64

In [51]:
df_eval.to_csv('../data/rag-eval-groq.csv', index=False)