In [1]:
import json
import pandas as pd
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
# Name of the sentence-transformers checkpoint loaded below; the index mapping
# later in this notebook declares 384-dimensional vectors for its embeddings.
embed_model = 'multi-qa-MiniLM-L6-cos-v1'

In [4]:
# Shared encoder used for both the document embeddings and the query embeddings.
model = SentenceTransformer(embed_model)



In [72]:
# Load the FAQ records (a list of dicts with text/section/question/course/id).
# The texts contain non-ASCII punctuation such as curly quotes, so the encoding
# is set explicitly instead of relying on the platform default.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f:
    doc = json.load(f)

In [73]:
# Peek at the first record to see which keys each document carries.
doc[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [74]:
# Attach three embeddings to every record in place:
#   questionv - the question alone
#   textv     - the answer text alone
#   vqt       - question and text joined by a single space
for record in tqdm(doc):
    question, text = record['question'], record['text']
    question_and_text = ' '.join((question, text))
    record['questionv'] = model.encode(question)
    record['textv'] = model.encode(text)
    record['vqt'] = model.encode(question_and_text)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:53<00:00,  8.34it/s]


In [8]:
# Client for the local Elasticsearch node and the name of the index that every
# cell below creates, fills and queries.
esclient = Elasticsearch('http://localhost:9200')
es_index = 'search06'

In [101]:
# Connectivity check: displays the cluster name and server version.
esclient.info()

ObjectApiResponse({'name': '665708c9977e', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'WnEHTYf-RpWzVRr-pw_r8A', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [102]:
# Index definition: a single shard without replicas, full-text fields for the
# FAQ content, keyword fields for exact filtering, and one indexed 384-dim
# cosine dense_vector field per embedding stored on the documents.
es_setting = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 0,
    },
    'mappings': {
        'properties': {
            'text': {'type': 'text'},
            'section': {'type': 'text'},
            'question': {'type': 'text'},
            'course': {'type': 'keyword'},
            'id': {'type': 'keyword'},
            # The three vector fields share exactly the same mapping.
            **{
                vector_field: {
                    'type': 'dense_vector',
                    'dims': 384,
                    'index': True,
                    'similarity': 'cosine',
                }
                for vector_field in ('questionv', 'textv', 'vqt')
            },
        }
    },
}

In [103]:
# Drop any previous copy of the index before creating it with the mapping above,
# so the cell can be re-run; ignore_unavailable=True is passed so that a missing
# index is not treated as an error on the first run.
esclient.indices.delete(index=es_index, ignore_unavailable=True)
esclient.indices.create(index= es_index, body = es_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'search06'})

In [104]:
# Send the enriched records to the index one request per document; no explicit
# `id` argument is passed to the index call.
for record in tqdm(doc):
    esclient.index(index=es_index, document=record)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:26<00:00, 35.36it/s]


# Hybrid search

In [13]:
# Sample question, the course it is restricted to, and the question's embedding
# produced by the same encoder that embedded the documents.
query_sample = 'I just found the course. Can I join it?'
course = 'data-engineering-zoomcamp'
query_v = model.encode(query_sample)

In [78]:
# kNN clause of the hybrid request: search the `textv` vectors with the embedded
# sample question, keep only documents of the selected course, and weight this
# clause with a boost of 0.5.
knn_search = {
    'field': 'textv',
    'query_vector' :query_v,
    'k':5,
    'num_candidates': 10000,
    'boost':.5,
    'filter':{
                'term':{
                    'course': course
                }
            }
}

In [79]:
# Keyword clause of the hybrid request: a best_fields multi_match of the sample
# question over text/question/section, filtered to the selected course and
# weighted with the same 0.5 boost as the kNN clause.
keyword_search = {
    'bool':{
        'must':{
            'multi_match':{
                'query': query_sample,
                'fields': ['text', 'question', 'section'],
                'type':'best_fields',
                'boost': .5,
            }
        },
        'filter':{
            'term':{
                'course': course
            }
        }
    }
}

In [105]:
# Send both clauses in a single search request and keep the top five hits.
response = esclient.search(index= es_index, query = keyword_search, knn = knn_search, size=5)

In [106]:
# Show the id and the answer text of every hit in the combined response.
for hit in response["hits"]["hits"]:
    source = hit['_source']
    print(source['id'], source['text'])

7842b56a Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.
a482086d Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.
63394d91 You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terraform
Git
Look over the prerequisites and syllabus to see if you are comfortable with these subjects.
eb56ae98 Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questio

In [59]:
def hybrid_search(field, query, course):
    """Run one combined kNN + keyword request and return the top five sources.

    Args:
        field: Name of the dense-vector field the kNN clause searches.
        query: Free-text question. It is embedded with ``model`` for the kNN
            clause and passed verbatim to the ``multi_match`` clause.
        course: Value both clauses require in the ``course`` field.

    Returns:
        A list with the ``_source`` dict of each hit, in response order,
        limited to the ``id``, ``course``, ``section``, ``question`` and
        ``text`` fields.
    """
    embedded_query = model.encode(query)

    # Both clauses carry the same boost (0.5) and the same course filter.
    request = {
        'knn': {
            'field': field,
            'query_vector': embedded_query,
            'k': 5,
            'num_candidates': 10000,
            'boost': 0.5,
            'filter': {'term': {'course': course}},
        },
        'query': {
            'bool': {
                'must': {
                    'multi_match': {
                        'query': query,
                        'fields': ['text', 'question', 'section'],
                        'type': 'best_fields',
                        'boost': 0.5,
                    }
                },
                'filter': {'term': {'course': course}},
            }
        },
        'size': 5,
        '_source': ['id', 'course', 'section', 'question', 'text'],
    }

    reply = esclient.search(index=es_index, body=request)
    return [hit['_source'] for hit in reply['hits']['hits']]
    

In [107]:
# Try the helper on one question, using the question embeddings for the kNN clause.
hybrid_search('questionv', 'How can I get the course schedule?', 'data-engineering-zoomcamp')

[{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced mode?',
  'course': 'data-engineering-zoomcamp',
  'id': 'eb56ae98'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-en

In [35]:
# Ground-truth set: each row pairs a question and its course with the id of the
# document that is expected to be retrieved for it.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [36]:
# Preview the first rows of the ground-truth table.
df_ground_truth.head()

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef


In [37]:
# Convert the table to a list of per-row dicts, the shape the evaluation loop
# iterates over (it reads the 'question', 'course' and 'document' keys).
groud_truth = df_ground_truth.to_dict(orient='records')

In [39]:
# Inspect one ground-truth record.
groud_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [40]:
def hit_rate(data):
    """Return the fraction of queries whose results contain a relevant document.

    Args:
        data: One relevance list per query. Each list holds one boolean per
            returned document, ``True`` where the document is the expected one.

    Returns:
        A float between 0 and 1. An empty ``data`` yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    if len(data) == 0:
        return 0.0

    queries_with_hit = sum(1 for relevance in data if any(relevance))
    return queries_with_hit / len(data)

In [57]:
def mrr(data):
    """Return the mean reciprocal rank over a set of queries.

    For each query only the FIRST relevant position counts: a query whose first
    relevant document is at 1-based position ``p`` contributes ``1 / p``, and a
    query with no relevant document contributes 0.

    Args:
        data: One relevance list per query. Each list holds one boolean per
            returned document, in ranking order.

    Returns:
        A float between 0 and 1. An empty ``data`` yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    if len(data) == 0:
        return 0.0

    score = 0.0
    for relevance in data:
        for position, is_relevant in enumerate(relevance, start=1):
            if is_relevant:
                score += 1 / position
                # Stop at the first relevant result; adding later matches too
                # would let a single query contribute more than 1.
                break

    return score / len(data)

In [84]:
def evaluation(field, ground_truth, search_func):
    """Score a search function against the ground-truth questions.

    Args:
        field: Vector field name forwarded to ``search_func`` as its first
            argument.
        ground_truth: Iterable of dicts with the keys ``question``, ``course``
            and ``document`` (the id of the expected document).
        search_func: Callable invoked as ``search_func(field, question, course)``
            that returns a list of dicts, each with an ``id`` key.

    Returns:
        A dict with the keys ``hit_rate`` and ``mrr`` computed over all queries.
    """
    relevance_per_query = []

    for entry in tqdm(ground_truth):
        expected_id = entry['document']
        retrieved = search_func(field, entry['question'], entry['course'])
        # One flag per returned document: is it the expected one?
        relevance_per_query.append([expected_id == hit['id'] for hit in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_per_query)
    metrics['mrr'] = mrr(relevance_per_query)
    return metrics
    

In [85]:
# Hybrid search with the kNN clause on the answer-text embeddings.
evaluation('textv', groud_truth, hybrid_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:46<00:00, 43.52it/s]


{'hit_rate': 0.9234925437648585, 'mrr': 0.8461710251422809}

In [86]:
# Hybrid search with the kNN clause on the question embeddings.
evaluation('questionv', groud_truth, hybrid_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:46<00:00, 43.45it/s]


{'hit_rate': 0.9234925437648585, 'mrr': 0.8481665586052878}

In [87]:
# Hybrid search with the kNN clause on the combined question + text embeddings.
evaluation('vqt', groud_truth, hybrid_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:46<00:00, 43.34it/s]


{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}

# Rank

Elasticsearch 8.9.0 or above is required to use the `rank` search parameter.

In [92]:
def hybrid_search_rrf(field, query, course):
    """Hybrid kNN + keyword search whose request also asks for ``rank: rrf``.

    The request is the same as a plain hybrid search (kNN clause on ``field``
    plus a ``multi_match`` clause, both filtered by course), with an added
    ``rank`` section containing an empty ``rrf`` object.

    Args:
        field: Name of the dense-vector field the kNN clause searches.
        query: Free-text question. It is embedded with ``model`` for the kNN
            clause and passed verbatim to the ``multi_match`` clause.
        course: Value both clauses require in the ``course`` field.

    Returns:
        A list with the ``_source`` dict of each hit, in response order,
        limited to the ``id``, ``course``, ``section``, ``question`` and
        ``text`` fields.
    """
    embedded_query = model.encode(query)

    vector_clause = {
        'field': field,
        'query_vector': embedded_query,
        'k': 5,
        'num_candidates': 10000,
        'boost': 0.5,
        'filter': {'term': {'course': course}},
    }

    text_clause = {
        'bool': {
            'must': {
                'multi_match': {
                    'query': query,
                    'fields': ['text', 'question', 'section'],
                    'type': 'best_fields',
                    'boost': 0.5,
                }
            },
            'filter': {'term': {'course': course}},
        }
    }

    request = {
        'knn': vector_clause,
        'query': text_clause,
        'size': 5,
        "rank": {"rrf": {}},
        '_source': ['id', 'course', 'section', 'question', 'text'],
    }

    reply = esclient.search(index=es_index, body=request)
    return [hit['_source'] for hit in reply['hits']['hits']]

## RRF

In [97]:
def compute_rrf(rank, k = 60):
    """Return the reciprocal-rank-fusion contribution of one ranked position.

    Args:
        rank: Position of a document within one result list.
        k: Constant added to the rank in the denominator; defaults to 60.

    Returns:
        ``1 / (k + rank)``.
    """
    denominator = k + rank
    return 1 / denominator

In [116]:
def elasticsearch_hybrid_rrf(field, query, course, k =60):
    """Hybrid search with Reciprocal Rank Fusion computed on the client side.

    The kNN search and the keyword search are sent as two separate requests.
    Every returned document is scored with the sum of ``1 / (k + rank)`` over
    the result lists it appears in (1-based ranks), and the five documents with
    the highest fused score are returned.

    Args:
        field: Name of the dense-vector field the kNN search runs on.
        query: Free-text question. It is embedded with ``model`` for the kNN
            search and passed verbatim to the ``multi_match`` search.
        course: Value both searches require in the ``course`` field.
        k: RRF constant passed to ``compute_rrf``. This is not the ``'k'`` of
            the kNN clause, which is fixed at 5.

    Returns:
        A list of up to five ``_source`` dicts, best fused score first. The
        sources are taken from the search hits themselves, so no per-document
        ``get`` request is issued.
    """
    query_v = model.encode(query)

    # NOTE(review): the kNN clause asks for 'k': 5 while the request below uses
    # size=10, so this leg presumably yields at most five hits. Left unchanged
    # to keep the recorded metrics reproducible; confirm whether both legs
    # should retrieve the same depth.
    knn_search_hybrid = {
        'field': field,
        'query_vector': query_v,
        'k': 5,
        'num_candidates': 10000,
        'boost': 0.5,
        'filter': {
            'term': {
                'course': course
            }
        }
    }

    keyword_search_hybrid = {
        'bool': {
            'must': {
                'multi_match': {
                    'query': query,
                    'fields': ['text', 'question', 'section'],
                    'type': 'best_fields',
                    'boost': 0.5
                }
            },
            'filter': {
                'term': {
                    'course': course
                }
            }
        }
    }

    knn_result = esclient.search(index=es_index, knn=knn_search_hybrid, size=10)['hits']['hits']
    keyword_result = esclient.search(index=es_index, query=keyword_search_hybrid, size=10)['hits']['hits']

    rrf_score = {}
    sources = {}

    # Accumulate the fused score per document. kNN hits are processed first so
    # that ties keep the same insertion order (the final sort is stable).
    for ranked_hits in (knn_result, keyword_result):
        for rank, hit in enumerate(ranked_hits, start=1):
            doc_id = hit['_id']
            rrf_score[doc_id] = rrf_score.get(doc_id, 0.0) + compute_rrf(rank, k)
            # Remember the source already delivered with the hit.
            sources.setdefault(doc_id, hit['_source'])

    reranked_docs = sorted(rrf_score.items(), key=lambda item: item[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:5]]

In [117]:
# Client-side RRF over the combined question + text embeddings.
evaluation('vqt', groud_truth, elasticsearch_hybrid_rrf)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [03:01<00:00, 25.55it/s]


{'hit_rate': 0.9546142208774584, 'mrr': 0.8727000936531963}