In [1]:
import json

# From the ground_truth.ipynb, we created dataset with ids on it.
# Read the file explicitly as UTF-8 so the load does not depend on the platform's
# default text encoding (the documents contain non-ASCII quote characters).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [2]:
documents[:5]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

In [3]:
pip install sentence_transformers==2.7.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sentence_transformers import SentenceTransformer 

In [5]:
# Load a pretrained Sentence Transformer model by name. The next cells show that
# model.encode(...) on a single string returns a vector of length 384.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1") # cos similarity

In [6]:
vector = model.encode("Can I still join the course?")

In [7]:
len(vector)

384

#### Now we want to create the embeddings.

In [11]:
pip install elasticsearch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

In [13]:
# Index definition for the FAQ documents: the original fields plus one
# dense-vector field per embedding we want to run kNN search on.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # "course" is the field the kNN search filters on with a term query.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # dims is 384 to match the length of the vectors returned by model.encode.
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            # Presumably meant for the embedding of question + text combined.
            # NOTE(review): documents must store that embedding under this exact key
            # for this mapping to apply — verify against the embedding loop.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Drop any previous copy of the index first so this cell can be re-run from scratch.
es_client.indices.delete(index=index_name, ignore_unavailable=True) 
es_client.indices.create(index=index_name, body=index_settings) # (re)create the index with the settings above

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [14]:
from tqdm.auto import tqdm

In [15]:
# Add three embeddings to every document (mutates the dicts in `documents` in place):
# one for the question, one for the answer text, and one for both combined.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    # Vector embeddings. The keys must match the dense_vector fields declared in the
    # index mapping (question_vector, text_vector, question_text_vector); the combined
    # embedding used to be stored as 'qt_vector', which the mapping does not define.
    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)

  0%|          | 0/948 [00:00<?, ?it/s]

In [44]:
doc

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp',
 'id': '886d1617',
 'question_vector': array([ 8.83039832e-02,  1.84535477e-02,  1.00397998e-02, -1.63603481e-02,
         8.40308517e-02, -9.60045010e-02, -2.63261230e-04, -5.26764691e-02,
         2.60165613e-03,  4.39167395e-02,  1.14915548e-02,  3.99805792e-02,
         7.17471093e-02, -3.84425484e-02, -1.78289809e-03,  4.48449235e-03,
        -4.33246046e-03,  3.55190295e-03, -1.40589215e-02,  2.84716450e-02,
         4.25044522e-02,  1.72751918e-02, -3.62396985e-02,  1.57562196e-02,
         7.97225684e-02,  7.66049651e-03, -4.24633622e-02, -1.26894545

In [16]:
# Send every document (original fields plus the embeddings added to it) to the index,
# one request per document.
# NOTE(review): no explicit id is passed, so re-running only this cell without recreating
# the index presumably stores every document a second time — confirm before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [17]:
query = "Can I still join the course?"
vector_search_term = model.encode(query)

In [18]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a kNN search on one vector field, restricted to a single course.

    Args:
        field: Name of the vector field to search (e.g. "question_vector").
        vector: Query embedding to compare against ``field``.
        course: Value the ``course`` field must equal (applied as a term filter).
        k: Number of nearest neighbours requested. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        response lists them. Only the text, section, question, course and id
        fields are requested, so the stored vectors are not returned.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    # Elastic search
    es_resp = es_client.search(index=index_name, body=search_query)

    # Keep only the document payload of each hit.
    return [hit['_source'] for hit in es_resp['hits']['hits']]

#### Ground truth dataset

In [22]:
import pandas as pd

In [23]:
df_gt = pd.read_csv('ground-truth-data.csv')

In [24]:
ground_truth = df_gt.to_dict(orient='records')

In [45]:
ground_truth[:11]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where can I find the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'How do I check the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'Where are the course prerequisites listed?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520c

#### To wrap the function

`question_vector_knn` takes a single element of `ground_truth` (a dict with `question`, `course` and `document` keys) as its parameter.

In [39]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [27]:
def question_vector_knn(q):
    """Search the ``question_vector`` field with the embedding of a ground-truth question.

    Args:
        q: Mapping with at least the keys ``question`` (the query text) and
            ``course`` (used to filter the search).

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn("question_vector", v_q, course)

In [36]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [35]:
question_vector_knn(ground_truth[0])

[{'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'id': 'c02e79ef'},
 {'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the p

In [33]:
def text_vector_knn(q):
    """Search the ``text_vector`` field with the embedding of a ground-truth question.

    Args:
        q: Mapping with the keys ``question`` (the query text) and ``course``
            (used to filter the search).

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    query_text, course_name = q['question'], q['course']
    return elastic_search_knn("text_vector", model.encode(query_text), course_name)

In [34]:
text_vector_knn(ground_truth[0])

[{'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'id': 'c02e79ef'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continu

#### Now we can evaluate the elastic search results:

In [28]:
def mrr(relevance_total):
    """Compute the mean reciprocal rank over a list of relevance lists.

    Args:
        relevance_total: One list per query; each inner list holds one boolean
            per returned result, in rank order (True means relevant).

    Returns:
        The average over all queries of 1 / rank of the first relevant result,
        counting 0 for queries with no relevant result. Returns 0.0 when
        ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Only the first relevant result counts; without this, a document
                # returned twice would push the score for one query above 1.
                break

    return score / total

In [29]:
def hit_rate(relevance_total):
    """Compute the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One list per query; each inner list holds one boolean
            per returned result (True means relevant).

    Returns:
        Number of inner lists containing True divided by the number of inner
        lists. Returns 0.0 when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [40]:
def evaluate(ground_truth, search_func):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each has a ``document`` key holding
            the id of the document that should be retrieved for it.
        search_func: Callable that takes one record and returns a sequence of
            results, each with an ``id`` key.

    Returns:
        A dict with the keys ``hit_rate`` and ``mrr`` computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        retrieved = search_func(record)
        # One boolean per returned result: does it match the expected document?
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

For each query in the ground truth dataset, the system compares the IDs of the documents returned by the search function with the correct document ID (doc_id) from the ground truth data.



In [31]:
evaluate(ground_truth, question_vector_knn)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.773071104387292, 'mrr': 0.6666810748505158}

With Elasticsearch:
The search function `question_vector_knn` takes a ground-truth record and returns the 5 documents of the same course whose `question_vector` is most similar to the encoded question. The index is configured to use cosine similarity.


- The query string from q['question'] is encoded into a vector v_q using the preloaded Sentence Transformer model 

- The encoded vector v_q is passed to a function elastic_search_knn

- The function elastic_search_knn performs a k-nearest-neighbor search in the Elasticsearch index on the given vector field, filtered by course, and returns the 5 documents most similar to the query vector v_q.

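Then, for each ground-truth record, `evaluate` compares the `id` of every document returned by Elasticsearch with the record's `document` id: each comparison gives True when the ids match and False otherwise. `hit_rate` and `mrr` are computed from these lists of booleans.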