# video 3.2 - semantic search with elasticsearch

In [None]:
import requests

In [None]:
# Download the course FAQ documents (JSON) from the llm-zoomcamp repository.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [None]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# every record with the course it belongs to (Elasticsearch indexes flat docs).

documents = []

for course_dict in documents_raw:
    for doc in course_dict['documents']:
        doc.update(course=course_dict['course'])
    documents.extend(course_dict['documents'])

## Create Embeddings using Pretrained Models

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentence_transformers==2.7.0

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained 'all-mpnet-base-v2' sentence-embedding model by name.
# NOTE(review): presumably downloaded on first use, which needs network access — confirm.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Sanity check: embed one sentence and look at the shape of the result.
# The embedding length shown here should match "dims" in the index mapping below.
vec = model.encode('This is a simple sentence')
vec.shape

In [None]:
# Embed every FAQ answer and attach the vector to its document.
# All texts go through a single batched encode() call instead of one call per
# document, which lets the model work in batches and is considerably faster.
text_vectors = model.encode([doc['text'] for doc in documents])

operations = []
for doc, text_vector in zip(documents, text_vectors):
    # .tolist() turns the numpy row into plain Python floats (JSON-serializable).
    doc['text_vector'] = text_vector.tolist()
    operations.append(doc)

## Setup ElasticSearch connection

In [None]:
from elasticsearch import Elasticsearch
# Connect to the Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Request the cluster info; displaying it confirms the server is reachable.
es_client.info()

## Create Mappings and Index 

- Mapping is the process of defining how a document, and the fields it contains, are stored and indexed
- Each doc is a collection of fields, which each have their own data type
- We can compare mapping to a database schema in how it describes the fields and properties that documents hold, the datatype of each field (e.g. str, int, date) and how those fields should be indexed and stored

In [None]:
# Index definition: shard/replica settings plus the mapping of each document field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type — intended for exact-value matching (see the Elasticsearch mapping docs)
            "course": {"type": "keyword"},
            # Vector field targeted by the kNN searches further down.
            # NOTE(review): "dims" has to equal the embedding length (see vec.shape) — confirm 768 for this model.
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

In [None]:
index_name = 'course-questions'

# Delete any existing index first so that every run starts from a clean slate;
# ignore_unavailable=True avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Recreate the index with the settings and mappings defined in index_settings.
es_client.indices.create(index=index_name, **index_settings)

## Add docs to the index

In [None]:
# Index the documents one request at a time. A failure on one document must not
# stop the rest, but it should be attributable and visible in a final summary.
failed_count = 0
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        failed_count += 1
        # Name the offending document so the error can be traced back to it.
        print(f"Failed to index {doc.get('question')!r}: {e}")

if failed_count:
    print(f"{failed_count} of {len(operations)} documents could not be indexed")

## Create end user query

In [None]:
# Embed the user's question with the same model that embedded the documents,
# so the query vector and the stored vectors can be compared.
search_term = "windows or mac?"
vector_search_term = model.encode(search_term)

In [None]:
# kNN search clause over the "text_vector" field.
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,  # return the 5 nearest neighbours of the query vector
    "num_candidates": 10000,  # NOTE(review): presumably the candidate pool examined per shard — confirm in the ES kNN docs
}

In [None]:
# Run the kNN search; `source` lists the fields to return (the stored vector is left out).
res = es_client.search(index=index_name, knn=query, source=['text', 'section', 'question', 'course'])

In [None]:
# Source fields of the first hit.
res.body['hits']['hits'][0]['_source']

# video 3.2.2 - advanced semantic search

- Remember: using the user's input directly for search makes it a **Keyword search** rather than a semantic one.
- To make a semantic search you need to do embeddings first

In [None]:
# Same kNN clause as in the basic search: 5 nearest neighbours on "text_vector".
knn_query = {
    'field': 'text_vector',
    'query_vector': vector_search_term,
    'k': 5,
    'num_candidates': 10000
}

# Send a regular `query` on the course together with the kNN clause.
res = es_client.search(
    index=index_name,
    query={
        'match': {
            'course': 'data-engineering-zoomcamp'
            # In the advanced search, we target a single course.
            # NOTE(review): a top-level `query` next to `knn` appears to be combined with the
            # kNN hits (scores added) rather than acting as a strict filter — confirm; a
            # `filter` inside the knn clause may be needed to exclude other courses.
        },
    },
    knn=knn_query,
    size=5,
    source=['text', 'section', 'question', 'course']
)

In [None]:
# First hit, including its metadata (such as the score) and the selected source fields.
res['hits']['hits'][0]

Take a look at the score: it's greater than 1.
This is because ES does not necessarily bound scores between 0 and 1 once several search clauses are combined — here, the score of the `match` query and the score of the kNN search are added together.

By setting `explain=True`, ES will show exactly how the scores are calculated

In [None]:
# Same combined search as before, with explain=True so that every hit also
# carries an `_explanation` describing how its score was computed.
res = es_client.search(
    index=index_name,
    query={
        'match': {
            'course': 'data-engineering-zoomcamp'
            # In the advanced search, we target a single course.
            # NOTE(review): a top-level `query` next to `knn` appears to be combined with the
            # kNN hits (scores added) rather than acting as a strict filter — confirm.
        },
    },
    knn=knn_query,
    size=5,
    source=['text', 'section', 'question', 'course'],
    explain=True
)

In [None]:
# Score breakdown of the first hit (present because the search used explain=True).
res['hits']['hits'][0]['_explanation']

# video 3.3.1 - evaluation metrics for retrieval

The key question is: *"What is the best retrieval technique?"* (e.g. vector search, keyword search, etc.)

The answer is that it depends. And there are techniques to evaluate performance.

# Video 3.3.2 - ground truth dataset generation for retrieval evaluation

A ground truth dataset basically looks like this:

- Query: Blablabla, blabla blabla?
- Relevant docs: doc1, doc10, doc11

Then repeat the above for many different queries.

In our case, we will generate just 1 doc for each query:

```text
for each record in FAQ:
    generate 5 questions
    use the respective record as "relevant doc"

1000 records => 5000 queries
```

In [None]:
# Look at one record before IDs are added (it still carries the 'text_vector' attached earlier).
documents[0]

In [None]:
n = len(documents)

# Use each record's position in the list as a provisional ID and drop the
# embedding, since it was not part of the original document.
for i, record in enumerate(documents):
    record['id'] = i
    record.pop('text_vector', None)

In [None]:
# Same record again, now with a positional 'id' and without 'text_vector'.
documents[0]

Positional IDs are not ideal: whenever documents are added, removed, or reordered, the IDs change too. Let's derive each ID from a hash of the document's content instead.

In [None]:
import hashlib

def generate_document_id(doc):
    """Build a short, content-derived ID for a FAQ record.

    The ID is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>", so it stays the same
    for as long as those three pieces do not change.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [None]:
# Overwrite each document's 'id' with the content-based hash ID.
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [None]:
# The record's 'id' is now the hash-based ID.
documents[0]

In [None]:
# let's find collisions

In [None]:
from collections import defaultdict

In [None]:
# Maps each generated ID to the list of documents that produced it.
hashes = defaultdict(list)

In [None]:
# Group the documents by their generated ID.
for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [None]:
# Number of distinct IDs vs. number of documents; a gap means some IDs are shared.
len(hashes), len(documents)

In [None]:
# List every ID shared by more than one document, together with the count.
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [None]:
# Inspect the documents behind one of the shared IDs.
# .get() is used because indexing a defaultdict with an absent key would
# silently insert an empty list for that key.
hashes.get('593f7569', [])

The records above are duplicates of each other; for this example that is acceptable, so we leave them in.

In [None]:
import json

# Persist the documents, now carrying their IDs, as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

Now, with the help of an LLM, we will create our ground truth dataset from the above documents.

In [None]:
# Prompt sent to the LLM for each FAQ record. {section}, {question} and {text}
# are filled in from the record via str.format; the reply is requested as a
# JSON array of 5 questions.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record.

The record:
section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
from openai import OpenAI

In [None]:
# Read the .env file one directory up, line by line; the API key is taken from it.
# NOTE(review): the path is relative to the notebook's working directory — confirm.
with open('./../.env', 'rt') as f_in:
    env = f_in.readlines()

In [None]:
# Show only the variable names found in .env. Displaying the raw lines would
# write the secret values into the notebook output, where they can leak when
# the notebook is shared or committed.
[line.split('=', 1)[0].strip() for line in env if '=' in line]

In [None]:
# Pull the API key out of the .env lines. The whole file is searched for the
# OPENAI_API_KEY entry instead of assuming it sits on the first line;
# commented-out lines are ignored.
_key_marker = 'OPENAI_API_KEY='
_key_lines = [
    line for line in env
    if _key_marker in line and not line.lstrip().startswith('#')
]
if not _key_lines:
    raise KeyError('OPENAI_API_KEY not found in .env')
OPENAI_API_KEY = _key_lines[0].split(_key_marker)[1].replace('\n', '')

In [None]:
# OpenAI client authenticated with the key read from .env.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Pick a single document to try the prompt on.
doc = documents[2]

In [None]:
def generate_questions(doc):
    """Ask the LLM for questions a student could ask about one FAQ record.

    The record's fields are substituted into ``prompt_template`` and sent as a
    single user message to the ``gpt-4o`` chat model. Returns the raw text of
    the first choice; the calling cells parse it as JSON.
    """
    user_message = {'role': 'user', 'content': prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message]
    )
    return completion.choices[0].message.content

In [None]:
# Try the generator on the single sample document first.
res = generate_questions(doc)

In [None]:
# The prompt asks for a JSON array, so parse the reply as JSON.
questions = json.loads(res)

In [None]:
# Inspect the parsed questions.
questions

In [None]:
from tqdm import tqdm

In [None]:
# Generated questions keyed by document ID.
# NOTE: re-running this cell discards everything collected so far.
results = {}

In [None]:
# Generate questions for every document. Documents whose ID is already in
# `results` are skipped, so duplicates cost nothing and an interrupted run can
# be resumed by re-running this cell.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    try:
        questions = json.loads(generate_questions(doc))
    except json.JSONDecodeError as e:
        # The model's reply is free text and may not be valid JSON. Skip the
        # record instead of aborting the whole run; it stays out of `results`,
        # so re-running the cell retries it.
        print(f"Skipping {doc_id}: response is not valid JSON ({e})")
        continue

    results[doc_id] = questions

In [None]:
import pickle

In [None]:
# Save the generated questions so the LLM calls do not have to be repeated.
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out, pickle.HIGHEST_PROTOCOL)

In [None]:
# Read results from file
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.

with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [None]:
# Look up documents by ID. Documents sharing an ID collapse into one entry (the last one wins).
doc_index = {d['id']: d for d in documents}

In [None]:
# Flatten {doc_id: [questions]} into (question, course, doc_id) rows; the
# course is looked up from the document the questions were generated for.
final_results = []

for doc_id, questions in results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [None]:
import pandas as pd

In [None]:
# One row per generated question: the question, its course, and the ID of the source document.
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Save the ground-truth dataset without the DataFrame index column.
df.to_csv('ground-truth-data.csv', index=False)

In [None]:
# Peek at the first lines of the exported CSV (relies on the shell's `head` command).
!head ground-truth-data.csv