In [1]:
import json
import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [2]:
with open('documents-with-ids.json', 'rt') as f:
    documents = json.load(f)

In [3]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [4]:
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:56<00:00,  8.15it/s]


In [5]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 39.77it/s]


In [11]:
pip install langchain-community

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting numpy<2.0.0,>=1.26.0 (from langchain-community)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.16-py3-none-any.whl (2.3 MB)
[2K   [38;2

In [12]:
from langchain.embeddings import SentenceTransformerEmbeddings
from typing import Dict
from langchain_elasticsearch import ElasticsearchRetriever

In [13]:
es_url = 'http://localhost:9200'

In [14]:
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


In [35]:
def hybrid_search(query):
    query_v = embeddings.embed_query(query)

    return {
        'query':{
            'bool':{
                'must':{
                    'multi_match':{
                        'query': query,
                        'fields': ['question', 'text', 'section'],
                        'type':'best_fields',
                        'boost':0.5
                    }
                },
                'filter':{
                    'term': {
                        'course':course
                    }
                }
            }
        },
        'knn':{
            'field':'question_text_vector',
            'query_vector': query_v,
            'k':5,
            'num_candidates':10000,
            'boost':0.5,
            'filter':{
                'term':{
                    'course':course
                }
            }
        }, 
        'size':5
    }

In [36]:
hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_search,
    content_field='text',
    url=es_url,
)

In [37]:
query = 'I just discovered the course. Can I still join it?'
course = "data-engineering-zoomcamp"

In [38]:
result = hybrid_retriever.invoke(query)

In [40]:
for result in result:
    print(result.metadata['_source']['question'], result.metadata['_source']['course'], result.metadata['_score'])

Course - Can I still join the course after the start date? data-engineering-zoomcamp 12.559245
Course - Can I follow the course after it finishes? data-engineering-zoomcamp 9.39959
Course - What can I do before the course starts? data-engineering-zoomcamp 7.306914
Course - Can I get support if I take the course in the self-paced mode? data-engineering-zoomcamp 7.1085525
Course - When will the course start? data-engineering-zoomcamp 6.7513986


## pipeline

In [41]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [42]:
groud_truth = df_ground_truth.to_dict(orient='records')
groud_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [43]:
def hit_rate(data):
    cnt =0
    for i in data:
        if True in i:
            cnt+=1

    return cnt/len(data)

def mrr(data):
    score = 0.0

    for i in data:
        for j in range(len(i)):
            if i[j] == True:
                score += 1/(j+1)

    return score/len(data)

In [44]:
def evaluation(field, ground_truth, search_func):
    relevance_total=[]

    for q in tqdm(ground_truth):
        doc_id = q['document']
        result = search_func(field, q['question'], q['course'])
        relevance = [doc_id==d['id'] for d in result]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr':mrr(relevance_total)
        }
    

In [55]:
def hybrid_search(field, query, course):

    def hybrid_search(query):
        query_v = embeddings.embed_query(query)
    
        return {
            'query':{
                'bool':{
                    'must':{
                        'multi_match':{
                            'query': query,
                            'fields': ['question', 'text', 'section'],
                            'type':'best_fields',
                            'boost':0.5
                        }
                    },
                    'filter':{
                        'term': {
                            'course':course
                        }
                    }
                }
            },
            'knn':{
                'field':'question_text_vector',
                'query_vector': query_v,
                'k':5,
                'num_candidates':10000,
                'boost':0.5,
                'filter':{
                    'term':{
                        'course':course
                    }
                }
            }, 
            'size':5,
            'source':['id','couse','section','question','text']
        }

    hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_search,
    content_field='text',
    url=es_url,
    )
    response = hybrid_retriever.invoke(query)
        
    result = []
    for hits in response:
        result.append(hits.metadata['_source'])
    return result

In [56]:
groud_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [57]:
question = groud_truth[0]['question']
course = groud_truth[0]['course']

In [58]:
hybrid_search('question_text_vector',question,course)

[{'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'id': 'c02e79ef'},
 {'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'id': '7842b56a'},
 {'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'id': 'a482086d'},
 {'section': 'Module 1: Docker and Terraform',
  'question': 'PGCLI - error column c.relhasoids does not exist',
  'id': 'c91ad8f2'},
 {'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'id': '1f6520ca'}]

In [60]:
evaluation('question_text_vector',groud_truth, hybrid_search)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [02:06<00:00, 36.47it/s]


{'hit_rate': 0.9252215258266695, 'mrr': 0.8506663785029899}