In [1]:
import json
import pickle
import pandas as pd
import hashlib
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## 1. load json file

In [2]:
with open('documents.json', 'rt') as f:
    doc_raw = json.load(f)

In [3]:
documents = []

for i in doc_raw:
    course_name = i['course']
    for j in i['documents']:
        j['course'] = course_name
        documents.append(j)

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## 2. add id in documents

In [5]:
hashlib.md5?

[1;31mSignature:[0m [0mhashlib[0m[1;33m.[0m[0mmd5[0m[1;33m([0m[0mstring[0m[1;33m=[0m[1;34mb''[0m[1;33m,[0m [1;33m*[0m[1;33m,[0m [0musedforsecurity[0m[1;33m=[0m[1;32mTrue[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Returns a md5 hash object; optionally initialized with a string
[1;31mType:[0m      builtin_function_or_method

In [6]:
# check data with the same hash id
from collections import defaultdict


In [7]:
def hash_code(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:20]}"
    hash_object = hashlib.md5(combined.encode())

    return hash_object.hexdigest()[:8]
    
    

In [8]:
for doc in documents:
    doc['id'] = hash_code(doc)

In [9]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [10]:
dummy_file = defaultdict(list)

In [11]:
for item in documents:
    item_id = item.get('id')  # Replace 'id' with the actual key for ID in your JSON
    if item_id is not None:
        dummy_file[item_id].append(item)
    
    # Filter out IDs that have more than one occurrence
duplicates = {item_id: items for item_id, items in dummy_file.items() if len(items) > 1}

In [12]:
for item_id, items in duplicates.items():
    print(f"ID: {item_id}")
    for item in items:
        print(f"  {item}")

ID: 4fcdc430
  {'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?', 'course': 'machine-learning-zoomcamp', 'id': '4fcdc430'}
  {'text': "They both do the same, it's just less typing from the script.", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?', 'course': 'machine-learning-zoomcamp', 'id': '4fcdc430'}


In [13]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [14]:
# save as json file
with open('documents_id.json', 'wt') as f_out:
    json.dump(documents, f_out)

## 3. Set prompt

In [64]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import sentencepiece
import transformers
import huggingface_hub

In [71]:
prompt_template = """
The record:

section: {section}
question: {question}
answer: {text}

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

Provide the output in parsable JSON without using code blocks:
["question1", "question2", ..., "question5"]
and fillin the questions above. Do not reply ["question1", "question2", "question3", "question4", "question5"]
""".strip()

In [72]:
doc = documents[1]
doc

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '870d0954'}

In [73]:
prompt = prompt_template.format(**doc)

In [74]:
print(prompt)

The record:

section: General course-related questions
question: Course - What are the prerequisites for this course?
answer: GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

Provide the output in parsable JSON without using code blocks:
["question1", "question2", ..., "question5"]
and fillin the questions above. Do not reply ["question1", "question2", "question3", "question4", "question5"]


## 4. Generate questions

In [77]:
import google.generativeai as genai

In [79]:
modelg = genai.GenerativeModel()

In [82]:
resg = modelg.generate_content(prompt_template.format(**documents[2]))

In [88]:
resg.text

'["Can I join the course if I register after it has started?", "Can I submit homeworks without registering?", "Do I have to submit the homeworks before the final project deadline?", "Can I join the course even if it has started?", "What happens if I submit the homeworks after the final project deadline?"]'

In [93]:
def gen_prompt(doc):
    prompt = prompt_template.format(**doc)
    json_res = modelg.generate_content(prompt)

    return json_res.text

In [107]:
import time

In [171]:
for doc in tqdm(documents):
    try:
        doc_id = doc['id']
        if doc_id == '5cd4ddbc':
            continue
        if doc_id in result:
            continue
        question = gen_prompt(doc)
        result[doc_id] = question
    except Exception as e:
        if '429' in str(e):
            print(e)
            time.sleep(10)
        else:
            raise

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:03<00:00, 276.15it/s]


In [331]:
parsed_r = {}

In [332]:

for doc_id, json_q in result.items():
    if doc_id in parsed_r:
        continue
    try:
        parsed_r[doc_id] = json.loads(json_q) # json.load: load json file; json.loads: load json format string
    except json.JSONDecodeError as e:
        prompt_j1 = promp_j.format(result[doc_id])
        json_resj = modelg.generate_content(prompt_j1)
        parsed_r[doc_id] = json.loads(json_resj.text)
        
    

In [327]:
parsed_r['doc_id']

['What\'s the solution to the error message \'curl: (6) Could not resolve host: output.csv\' when using \'os.system(f\\"curl {url} --output {csv_name}\\")\'?',
 'How do I resolve host errors when using curl?',
 "Why am I getting a 'Could not resolve host' error when trying to download a file with curl?",
 "How can I fix the 'Could not resolve host' error in Python using the 'os.system' function?",
 'What does the --output flag do in the curl command?']

In [330]:
result

{'c86136cf': '["When will the course start?", "What day and time will the course start?", "How to register the course?", "What is the link to register before the course starts?", "What is the link to join the course Telegram channel?"]',
 '870d0954': '["What are the entry requirements for this course?", "Does this course have any entry requirements?", "What qualifications do I need to take this course?", "Are there any skills I need to have acquired before I can take this course?", "What skills should I have acquired before I can take this course?"]',
 '03e9e63a': '["Can I join late?", "Is registration required?", "Can homework be submitted after the deadline?", "Will the final project have a deadline?", "Is there a time limit to complete everything?"]',
 '0bf8fc38': '["When will I receive a confirmation email for the Data Engineering Bootcamp?", "Can I start learning and submitting homework without registering?", "Why is registration not checked against any registered list?", "What is

In [318]:
json_q

'["Why is mlflow\'s pyfunc.load_model raising an error in my lambda function?", "How can I increase the memory of a lambda function?", "What does the error message \'AttributeError(\\\'module \\\'dataclasses\\\' has no attribute \\\'\\\'__version__\\\'\\\')\' mean?", "Why is it important to set the logging level to DEBUG when encountering errors in mlflow?", "What is the recommended solution to the error raised by mlflow\'s pyfunc.load_model in a lambda function?"]'

In [319]:
json_q =["Why is mlflow's pyfunc.load_model raising an error in my lambda function?", "How can I increase the memory of a lambda function?", "What does the error message 'AttributeError(\'module \'dataclasses\' has no attribute \'\'__version__\'\')' mean?", "Why is it important to set the logging level to DEBUG when encountering errors in mlflow?", "What is the recommended solution to the error raised by mlflow's pyfunc.load_model in a lambda function?"]

In [320]:
doc_id

'9d587502'

In [321]:
json.dumps(json_q)

'["Why is mlflow\'s pyfunc.load_model raising an error in my lambda function?", "How can I increase the memory of a lambda function?", "What does the error message \'AttributeError(\'module \'dataclasses\' has no attribute \'\'__version__\'\')\' mean?", "Why is it important to set the logging level to DEBUG when encountering errors in mlflow?", "What is the recommended solution to the error raised by mlflow\'s pyfunc.load_model in a lambda function?"]'

In [322]:
result[doc_id] = json.dumps(json_q)

In [297]:
print(json_q)

["What do I need to change to fix the error \"Credentials in profile \"PROFILE_NAME\", target: 'dev', invalid: '5432'is not of type 'integer'\"?"]


In [None]:
[\\"What\'s the solution to the error message \'curl: (6) Could not resolve host: output.csv\' when using \'os.system(f\\"curl {url} --output {csv_name}\\")\'?\\", \\"How do I resolve host errors when using curl?\\", \\"Why am I getting a \'Could not resolve host\' error when trying to download a file with curl?\\", \\"How can I fix the \'Could not resolve host\' error in Python using the \'os.system\' function?\\", \\"What does the --output flag do in the curl command?\\"]

In [291]:
promp_j = '''
I want to load the below string by json.loads fucntion.
Please help to correct the string to correct format for json.loads.
Only reply the json string for me to copy. No explanation or output needed and do not change the content.
{}
'''

In [309]:
i = 0
for jid, jq in result.items():
    print('9d587502\n', result['9d587502'])
    prompt_j1 = promp_j.format(result['9d587502'])
    json_resj = modelg.generate_content(prompt_j1)
    print(json_resj.text)
    break

9d587502
 ["Why is mlflow's pyfunc.load_model raising an error in my lambda function?", "How can I increase the memory of a lambda function?", "What does the error message 'AttributeError(\'module \'dataclasses\' has no attribute \'\'__version__\'\')' mean?", "Why is it important to set the logging level to DEBUG when encountering errors in mlflow?", "What is the recommended solution to the error raised by mlflow's pyfunc.load_model in a lambda function?"]
["Why is mlflow's pyfunc.load_model raising an error in my lambda function?", "How can I increase the memory of a lambda function?", "What does the error message 'AttributeError(\'module \'dataclasses\' has no attribute \'\'__version__\'\')' mean?", "Why is it important to set the logging level to DEBUG when encountering errors in mlflow?", "What is the recommended solution to the error raised by mlflow's pyfunc.load_model in a lambda function?"]


In [305]:
type(json_resj.text)

str

In [274]:
promp_j = '''
I want to load the below string by json.loads fucntion.
Please help to correct the string to correct format for json.loads.
{}
'''

In [276]:
prompt_j1 = promp_j.format(result['c86136cf'])
prompt_j1

'\nI want to load the below string by json.loads fucntion.\nPlease help to correct the string to correct format for json.loads.\n["When will the course start?", "What day and time will the course start?", "How to register the course?", "What is the link to register before the course starts?", "What is the link to join the course Telegram channel?"]\n'

In [336]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [339]:
doc_index = {d['id']:d for d in documents}

In [344]:
doc_index['c86136cf']

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [343]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [346]:
parsed_r

{'c86136cf': ['When will the course start?',
  'What day and time will the course start?',
  'How to register the course?',
  'What is the link to register before the course starts?',
  'What is the link to join the course Telegram channel?'],
 '870d0954': ['What are the entry requirements for this course?',
  'Does this course have any entry requirements?',
  'What qualifications do I need to take this course?',
  'Are there any skills I need to have acquired before I can take this course?',
  'What skills should I have acquired before I can take this course?'],
 '03e9e63a': ['Can I join late?',
  'Is registration required?',
  'Can homework be submitted after the deadline?',
  'Will the final project have a deadline?',
  'Is there a time limit to complete everything?'],
 '0bf8fc38': ['When will I receive a confirmation email for the Data Engineering Bootcamp?',
  'Can I start learning and submitting homework without registering?',
  'Why is registration not checked against any regist

In [352]:
final_doc = []

for d_id, q in parsed_r.items():
    course = doc_index[d_id]['course']
    for qj in q:
        final_doc.append((qj, course, d_id))

In [13]:
df = pd.read_csv('val_question.csv')

In [417]:
df

Unnamed: 0,Questions,Course,Doc_id
0,When will the course start?,data-engineering-zoomcamp,c86136cf
1,What day and time will the course start?,data-engineering-zoomcamp,c86136cf
2,How to register the course?,data-engineering-zoomcamp,c86136cf
3,What is the link to register before the course...,data-engineering-zoomcamp,c86136cf
4,What is the link to join the course Telegram c...,data-engineering-zoomcamp,c86136cf
...,...,...,...
4688,How can I remove pytest after completing a test?,mlops-zoomcamp,2e88aaa5
4689,Where can I find the .vscode folder?,mlops-zoomcamp,2e88aaa5
4690,Why am I unable to reconfigure pytest after a ...,mlops-zoomcamp,2e88aaa5
4691,What happens if I remove the .vscode folder?,mlops-zoomcamp,2e88aaa5


In [14]:
ground_truth = df.to_dict(orient='records') # orient = records -> return [{column: value}, {column: value}]

In [15]:
ground_truth[:3]

[{'Questions': 'When will the course start?',
  'Course': 'data-engineering-zoomcamp',
  'Doc_id': 'c86136cf'},
 {'Questions': 'What day and time will the course start?',
  'Course': 'data-engineering-zoomcamp',
  'Doc_id': 'c86136cf'},
 {'Questions': 'How to register the course?',
  'Course': 'data-engineering-zoomcamp',
  'Doc_id': 'c86136cf'}]

## 5. elastic search

In [2]:
from elasticsearch import Elasticsearch

In [3]:
es_client = Elasticsearch('http://localhost:9200')

In [4]:
index_setting = {
    'settings': {
        'number_of_shards':1,
        'number_of_replicas': 0
    },
    'mappings': {
        'properties': {
            'text': {'type': 'text'},
            'section': {'type': 'text'},
            'question': {'type': 'text'},
            'course': {'type': 'keyword'},
            'id': {'type': 'keyword'}
        }
    }
}

index_name = 'course-question'

In [5]:
es_client.info()

ObjectApiResponse({'name': 'c94a9fa56e92', 'cluster_name': 'docker-cluster', 'cluster_uuid': '87OVNLdITq6vU8NS6bsMBw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
es_client.indices.delete(index= index_name, ignore_unavailable= True)
es_client.indices.create(index= index_name, body= index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-question'})

In [8]:
with open('documents_id.json', 'rt') as f:
    documents_id = json.load(f)

In [9]:
for doc in tqdm(documents_id):
    es_client.index(index= index_name, document= doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:56<00:00, 16.91it/s]


In [10]:
def es_query(query, course):
    search_query = {
        'size': 5,
        'query': {
            'bool': {
                'must': {
                    'multi_match': {
                        'query': query,
                        'fields': ['text', 'section', 'question^3'],
                        'type': 'best_fields'
                    }
                },
                'filter': {
                    'term': {
                        'course': course
                    }
                }
            }
        }
    }

    response = es_client.search(index='course-question', body= search_query)

    result = []
    for hit in response['hits']['hits']:
        result.append(hit['_source'])

    return result


In [11]:
es_query(
    query = 'I just discovered this course. Can I still enroll?',
    course='data-engineering-zoomcamp'
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '03e9e63a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '597ce7be'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

## 6. Relevance

In [16]:
re_total = []

for q in tqdm(ground_truth):
    '''
    result: from the original file (doc) find the 5 most relevant context given the question from ground truth.
    relevance: see if the id is the same as the id which we used to create those questions.
    '''
    doc_id = q['Doc_id']
    result = es_query(query=q['Questions'], course= q['Course'])
    relevance = [d['id'] == doc_id for d in result]
    re_total.append(relevance)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4693/4693 [04:11<00:00, 18.64it/s]


In [53]:
test_result1 = es_query(ground_truth[0]['Questions'], ground_truth[0]['Course'])

In [56]:
test_result1[0]['id']

'c86136cf'

In [17]:
relevance

[False, False, True, False, False]

In [18]:
es_query('When will the course start?','data-engineering-zoomcamp' )

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c86136cf'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'cours

In [19]:
len(re_total)

4693

## 7. Hit rate

Count as 1 if there's at least 1 True in a record.


In [20]:
def hit(relevance_total):
    cnt = 0
    for i in relevance_total:
        if True in i:
            cnt +=1
    return cnt/len(relevance_total)

In [21]:
hit(re_total)

0.5959940336671639

## 8. mrr
Since the list is ordered by its score. Mrr rank higher if True come out earlier within the list.
* [T,F,F,F,F] -> 1/1
* [F,T,F,F,F] -> 1/2
* [F,F,T,F,F] -> 1/3
* [F,F,F,T,F] -> 1/4
* [F,F,F,F,T] -> 1/5

In [22]:
def mrr(relevance_total):
    score = 0.0

    for i in relevance_total:
        for j in range(len(i)):
            if i[j] is True:
                score += 1/(j+1)
    return score/len(relevance_total)
            

In [23]:
mrr(re_total)

0.47373748135520977

## 8 Vector Search

In [24]:
from sentence_transformers import SentenceTransformer

In [25]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')



In [26]:
len(model.encode('Hi'))

384

In [27]:
es_client2 = Elasticsearch('http://localhost:9200')

In [28]:
es_client2.info()

ObjectApiResponse({'name': 'c94a9fa56e92', 'cluster_name': 'docker-cluster', 'cluster_uuid': '87OVNLdITq6vU8NS6bsMBw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [29]:
index_setting2 = {
    'settings':{
        'number_of_shards':1,
        'number_of_replicas': 0
    },
    'mappings':{
        'properties':{
            'text': {'type':'text'},
            'section': {'type':'text'},
            'question': {'type':'text'},
            'course': {'type':'keyword'},
            'id': {'type':'keyword'},

            'vtext': {'type': 'dense_vector',
                     'index': True,
                      'dims': 384,
                      'similarity':'cosine'
                     },
            'vquestion': {'type': 'dense_vector',
                     'index': True,
                     'dims': 384,
                      'similarity':'cosine'
                     },
            'vqt': {'type': 'dense_vector',
                     'index': True,
                     'dims': 384,
                      'similarity':'cosine'
                     }
        }
    }
}

index_name2 = 'vector_search'

In [30]:
es_client2.indices.create(index= index_name2, body= index_setting2)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_search'})

In [33]:
documents_id[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c86136cf'}

In [34]:
documents_v = []

for doc in tqdm(documents_id):
    #tq = doc['question'] + ' ' + doc['text']
    doc['vtext'] = model.encode(doc['text'])
    doc['vquestion'] = model.encode(doc['question'])
    doc['vqt'] = model.encode(doc['question'] + ' ' + doc['text'])
    documents_v.append(doc)
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [02:04<00:00,  7.60it/s]


In [35]:
for doc in tqdm(documents_v):
    es_client2.index(index= index_name2, body= doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:57<00:00, 16.54it/s]


In [38]:
query1 = 'I just discoved this couse. Can Ｉ still join it?'
query_v = model.encode(query1)

In [39]:
search_query = {
    'field': 'vquestion',
    'query_vector' : query_v,
    'k' :5,
    'num_candidates': 10000
}

In [41]:
resultv = es_client2.search(index= index_name2, knn = search_query, source=['text', 'section', 'question', 'id', 'course'])

In [42]:
resultv['hits']['hits']

[{'_index': 'vector_search',
  '_id': 'bJFPcZEBuMBkxYRmPQWz',
  '_score': 0.75742096,
  '_source': {'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
   'id': '62adf4e7'}},
 {'_index': 'vector_search',
  '_id': 'rZFOcZEBuMBkxYRm0QO1',
  '_score': 0.71342003,
  '_source': {'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': "Yes, even if you don't register, you're still eligi

In [80]:
def es_query2(query, course):
    search_query = {
        'knn' :{
            'field': 'vquestion',
            'query_vector' : model.encode(query),
            'k':5,
            'num_candidates': 10000,
            'filter': {
                'term':
                {
                    'course': course
                }
            }
        },
        '_source':['text', 'section', 'question', 'id', 'course']
    }

    response = es_client2.search(index= index_name2, body=search_query)

    result = []
    for hit in response['hits']['hits']:
        result.append(hit['_source'])

    
    return result

In [81]:
es_query2(ground_truth[0]['Questions'], ground_truth[0]['Course'])

[{'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'id': 'c86136cf'},
 {'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the p

In [87]:
re_total2 = []

for q in tqdm(ground_truth):
    
    '''
    result: from the original file (doc) find the 5 most relevant context given the question from ground truth.
    relevance: see if the id is the same as the id which we used to create those questions.
    '''
    doc_id = q['Doc_id']
    result2 = es_query2(query=q['Questions'], course= q['Course'])
    relevance2 = [d['id'] == doc_id for d in result2]
    re_total2.append(relevance2)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4693/4693 [05:40<00:00, 13.78it/s]


In [88]:
hit(re_total2)

0.6260387811634349

In [89]:
mrr(re_total2)

0.5178244193479646

In [90]:
def es_query3(field, query, course):
    search_query = {
        'knn' :{
            'field': field,
            'query_vector' : model.encode(query),
            'k':5,
            'num_candidates': 10000,
            'filter': {
                'term':
                {
                    'course': course
                }
            }
        },
        '_source':['text', 'section', 'question', 'id', 'course']
    }

    response = es_client2.search(index= index_name2, body=search_query)

    result = []
    for hit in response['hits']['hits']:
        result.append(hit['_source'])

    
    return result

In [117]:
ground_truth[0]

{'Questions': 'When will the course start?',
 'Course': 'data-engineering-zoomcamp',
 'Doc_id': 'c86136cf'}

In [114]:
es_query3('vtext', ground_truth[0]['Questions'], ground_truth[0]['Course'])

[{'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'id': 'c86136cf'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continu

In [125]:
def evaluate(esquery_method, *args):
    
    re_total = []
    
    for q in tqdm(ground_truth):
        
        '''
        result: from the original file (doc) find the 5 most relevant context given the question from ground truth.
        relevance: see if the id is the same as the id which we used to create those questions.
        '''
        doc_id = q['Doc_id']
        result = esquery_method(query = q['Questions'], course = q['Course'], *args)
        relevance = [d['id'] == doc_id for d in result]
        re_total.append(relevance)

    return {'hit_rate':hit(re_total), 'mrr':mrr(re_total)}

In [126]:
evaluate(es_query3, 'vtext')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4693/4693 [05:37<00:00, 13.90it/s]


{'hit_rate': 0.7443000213083315, 'mrr': 0.6113502379430362}

In [128]:
evaluate(es_query3, 'vqt')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4693/4693 [05:32<00:00, 14.12it/s]


{'hit_rate': 0.8090773492435542, 'mrr': 0.6835180055401675}

In [129]:
def es_query4(field, query, course):
    search_query = {
        'knn' :{
        'field': field,
            'query_vector' : model.encode(query),
            'k':5,
            'num_candidates': 10000,
            'filter': {
                'term':
                {
                    'course': course
                }
            }
        },
        '_source':['text', 'section', 'question', 'id', 'course']
    }
    response = es_client2.search(index= index_name2, body=search_query)

    result = []
    for hit in response['hits']['hits']:
        result.append(hit['_source'])

    
    return result