In [None]:
# We are trying to make a simplification to find the relevant document

# for each record in FAQ:
#    generate 5 questions

# Creating the dataset 1000 -> 5000
# Gold standard: human made

The "simplification" mentioned could refer to the certainty 
of having at least one relevant result among the retrieved 
data because of how the data was generated. Generating five 
questions from a single FAQ entry simplifies the evaluation 
of relevance by ensuring at least one question-answer pair is 
directly related to the original entry. 

In [None]:
!pip install tqdm
!pip install mistralai

In [None]:
import requests

# Download the FAQ records. The payload is a list of course entries, each with
# a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors here instead of as a confusing JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one list of documents, tagging each with the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
# The problem is that we don't have an id. When we retrieve documents, we need to know
# which document is relevant for a particular query.

### Creating unique ids

In [None]:
import hashlib 

def generate_document_id(doc):
    """Build a short, content-derived identifier for a FAQ document.

    The course, the question and the first 10 characters of the answer text
    are joined with '-' and hashed with MD5. The first 8 hexadecimal digits of
    the digest are returned as the id.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]


Example:

- Combined will be "CS101-What is AI?-Artificial". *(course-question-text[:10])*
- The MD5 hash of "CS101-What is AI?-Artificial" will be calculated, resulting in a 32-character hexadecimal string, e.g., "e99a18c428cb38d5f260853678922e03" (illustrative value).
- The first 8 characters, "e99a18c4", will be extracted as the document ID.

We didn't use ordered ids (like 1, 2, ...) because adding or deleting a question would shift them and create a mess. That's why the id is derived from the content instead.

In [None]:
# Attach the content-based id to every document in place.
for record in documents:
    record.update(id=generate_document_id(record))

In [None]:
import json

# Persist the documents (now carrying their ids) as pretty-printed JSON.
serialized_documents = json.dumps(documents, indent=2)

with open('doc_with_ids.json', 'wt') as f_out:
    f_out.write(serialized_documents)

### LLM

In [None]:
# Prompt used to generate questions for one FAQ record. The {section},
# {question} and {text} placeholders are filled with str.format from the
# document dict (see generate_five_questions). The surrounding newlines of the
# triple-quoted literal are removed by .strip().
# NOTE(review): "as fewer words" reads like a typo for "as few words"; left
# unchanged because editing the prompt changes what is sent to the model.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
import os

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# The API key comes from the environment so it never lives in the notebook.
secret = os.getenv('MISTRAL_API')
if not secret:
    # os.getenv returns None for an unset variable; stop here with an
    # actionable message rather than building a client without a key.
    raise RuntimeError(
        "Environment variable MISTRAL_API is not set; "
        "export your Mistral API key before running this notebook."
    )

# Model used for every question-generation request.
model = "mistral-medium-latest"

client = MistralClient(api_key=secret)

### Creating questions for one

In [None]:
def generate_five_questions(doc):
    """Ask the LLM for questions about one FAQ record.

    The record's fields are substituted into ``prompt_template`` and sent as a
    single user message. The text content of the first returned choice is
    handed back as-is, without parsing.
    """
    user_message = ChatMessage(role="user", content=prompt_template.format(**doc))
    completion = client.chat(model=model, messages=[user_message])
    return completion.choices[0].message.content

### Creating questions for all

In [None]:
from tqdm.auto import tqdm

In [None]:
results = {}  # doc_id -> raw response text from the model

# Only the first 15 documents are processed here. One pass is sufficient: the
# previous outer `for i in range(15)` loop never used `i`, and every pass after
# the first skipped all documents because their ids were already in `results`.
for doc in tqdm(documents[:15]):
    doc_id = doc['id']
    # Ids are derived from content, so two records can share one; skip repeats
    # to avoid a second API call for the same id.
    if doc_id in results:
        continue
    results[doc_id] = generate_five_questions(doc)

In [None]:
# Inspect the raw model responses, keyed by document id.
results

In [None]:
parsed_results = {}  # doc_id -> decoded JSON value of the model response

for doc_id, json_questions in results.items():
    try:
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as err:
        # Model output is not guaranteed to be valid JSON. Re-raise the same
        # exception type, but say which document failed and show the raw
        # response so it can be fixed or regenerated.
        raise json.JSONDecodeError(
            f"invalid JSON in response for document {doc_id!r} ({err.msg}); "
            f"raw response: {json_questions!r}",
            err.doc,
            err.pos,
        ) from err

In [None]:
# Inspect the decoded responses, keyed by document id.
parsed_results

### Saving to csv

In [None]:
# Look at one document to recall its fields before building the CSV rows.
documents[3]

In [None]:
# Lookup table from document id to the document itself. When two documents
# share an id, the later one wins.
doc_index = dict((entry['id'], entry) for entry in documents)

In [None]:
# Show only a few entries; displaying the whole index would put every document
# into the cell output.
dict(list(doc_index.items())[:3])

In [None]:
# Flatten to one (question, course, doc_id) row per generated question.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [None]:
# Preview the first ten (question, course, doc_id) rows.
final_results[:10]

In [None]:
import pandas as pd

# Put the rows into a table so they can be written out as CSV.
ground_truth_columns = ['question', 'course', 'doc_id']
df = pd.DataFrame(data=final_results, columns=ground_truth_columns)

In [None]:
# Write the ground-truth table to disk without the DataFrame index column.
df.to_csv(path_or_buf='ground_truth.csv', index=False)

In [None]:
# Preview the first rows of the ground-truth table.
df.head()

#### Now we can calculate the quality of our search system...