In [None]:
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py


In [1]:
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Ingestion

In [7]:
df = pd.read_csv('../data/data-500.csv')
df.columns = df.columns.str.lower().str.replace(' ','_')
df = df.fillna("")
df = df.applymap(lambda x: str(x) if isinstance(x, (int)) else x)
df = df.applymap(lambda x: str(int(x*10)) if isinstance(x, (float)) else x)
df.columns

  df = df.applymap(lambda x: str(x) if isinstance(x, (int)) else x)
  df = df.applymap(lambda x: str(int(x*10)) if isinstance(x, (float)) else x)


Index(['title', 'year', 'summary', 'short_summary', 'genres', 'imdb_id',
       'runtime', 'youtube_trailer', 'rating', 'movie_poster', 'director',
       'writers', 'cast'],
      dtype='object')

In [8]:
documents = df.to_dict(orient='records')

In [9]:
import minsearch

In [14]:
index = minsearch.Index(
    text_fields=['title', 'year', 'summary', 'short_summary', 'genres', 'imdb_id',
       'runtime', 'youtube_trailer', 'rating', 'movie_poster', 'director',
       'writers', 'cast'],
    keyword_fields=[]
)

In [15]:
index.fit(documents)

<minsearch.Index at 0x7f86501042e0>

## RAG flow

In [16]:
import os

In [17]:
os.environ['OPENAI_API_KEY'] = '<Your_Key>'

In [18]:
from openai import OpenAI

client = OpenAI()

In [19]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [20]:
prompt_template = """
You're a movie consault. Answer the QUESTION based on the CONTEXT from our movies database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
title: {title}               
year: {year}     
summary: {summary}     
short_summary: {short_summary}     
genres: {genres}     
imdb_id: {imdb_id}     
runtime: {runtime}     
youtube_trailer: {youtube_trailer}     
rating: {rating}     
movie_poster: {movie_poster}     
director: {director}     
writers: {writers}     
cast: {cast}  
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [21]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [22]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer

In [23]:
question = 'Based on the IMDb rating, list top 5 movies.'
answer = rag(question)
print(answer)

Based on the IMDb ratings from the provided context, here are the top 5 movies:

1. **A Street Cat Named Bob** - Rating: 74
   ![A Street Cat Named Bob](https://hydramovies.com/wp-content/uploads/2018/04/A-Street-Cat-Named-Bob-Movie-Poster.jpg)

2. **Notes on Blindness** - Rating: 71
   ![Notes on Blindness](https://hydramovies.com/wp-content/uploads/2018/04/Notes-on-Blindness-Movie-Poster.jpg)

3. **The Redeemed and the Dominant: Fittest on Earth** - Rating: 71
   ![The Redeemed and the Dominant: Fittest on Earth](https://hydramovies.com/wp-content/uploads/2018/04/The-Redeemed-and-the-Dominant-Fittest-on-Earth-Movie-Poster.jpg)

4. **Teen Titans: The Judas Contract** - Rating: 70
   ![Teen Titans: The Judas Contract](https://hydramovies.com/wp-content/uploads/2018/04/Teen-Titans-The-Judas-Contract-Movie-Poster.jpg)

5. **Dead on Arrival** - Rating: 69
   ![Dead on Arrival](https://hydramovies.com/wp-content/uploads/2018/04/Dead-on-Arrival-Movie-Poster.jpg)


## Retrieval evaluation

In [24]:
df_question = pd.read_csv('../data/movies-ground-truth-retrieval-500.csv')

In [25]:
df_question.head()

Unnamed: 0,imdb_id,question
0,tt7026230,What is the main theme of Patton Oswalt: Annih...
1,tt7026230,Who directed the movie Patton Oswalt: Annihila...
2,tt7026230,What personal experiences does Patton Oswalt f...
3,tt7026230,How long is the runtime of Patton Oswalt: Anni...
4,tt7026230,What is the IMDb rating for Patton Oswalt: Ann...


In [31]:
ground_truth = df_question.to_dict(orient='records')

In [32]:
ground_truth[0]

{'imdb_id': 'tt7026230',
 'question': 'What is the main theme of Patton Oswalt: Annihilation?'}

In [26]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [27]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [34]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['imdb_id']
        results = search_function(q)
        relevance = [d['imdb_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [29]:
from tqdm.auto import tqdm

In [35]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2500 [00:00<?, ?it/s]

{'hit_rate': 0.908, 'mrr': 0.8668760317460318}

## Finding the best parameters

In [36]:
df_validation = df_question[:250]
df_test = df_question[250:]

In [37]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [38]:
gt_val = df_validation.to_dict(orient='records')

In [39]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [40]:
param_ranges = {
    'title': (0.0, 3.0),
    'year': (0.0, 3.0),
    'summary': (0.0, 3.0),
    'short_summary': (0.0, 3.0),
    'genres': (0.0, 3.0),
    'rating': (0.0, 3.0),
    'movie_poster': (0.0, 3.0),
    'director': (0.0, 3.0),
    'writers': (0.0, 3.0),
    'cast': (0.0, 3.0),
    'runtime': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [41]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

({'title': 2.73902658146692,
  'year': 0.10140035030235828,
  'summary': 2.714552286097589,
  'short_summary': 2.4220046693825266,
  'genres': 2.5009929641075255,
  'rating': 2.4500545891778245,
  'movie_poster': 1.0587687236474732,
  'director': 0.07892052706461905,
  'writers': 1.40911414178857,
  'cast': 2.5007244900337025,
  'runtime': 0.6518570879470155},
 0.8742444444444446)

In [47]:
def minsearch_improved(query):
    boost = {
        'title': 2.74,
        'year': 0.1,
        'summary': 2.71,
        'short_summary': 2.42,
        'genres': 2.50,
        'rating': 2.45,
        'movie_poster': 1.06,
        'director': 0.79,
        'writers': 1.41,
        'cast': 2.50,
        'runtime': 0.65,
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/2500 [00:00<?, ?it/s]

{'hit_rate': 0.9204, 'mrr': 0.8815895238095239}

## RAG evaluation

In [43]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [44]:
len(ground_truth)

2500

In [45]:
record = ground_truth[0]

In [50]:
import json

In [53]:
df_sample = df_question.sample(n=300, random_state=1)

In [54]:
sample = df_sample.to_dict(orient='records')

In [56]:
evaluations = []
error_records = []
for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)

    try:
        evaluation = json.loads(evaluation)
        evaluations.append((record, answer_llm, evaluation))
    except json.JSONDecodeError as e:
        error_records.append(evaluation)

    

  0%|          | 0/300 [00:00<?, ?it/s]

In [58]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['imdb_id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [63]:
df_eval.relevance.value_counts()

relevance
RELEVANT           264
NON_RELEVANT        20
PARTLY_RELEVANT     15
Name: count, dtype: int64

In [59]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.882943
NON_RELEVANT       0.066890
PARTLY_RELEVANT    0.050167
Name: proportion, dtype: float64

In [60]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [64]:
#df_eval[df_eval.relevance == 'NON_RELEVANT']

In [65]:
error_records

[]