In [1]:
import json
from elasticsearch import Elasticsearch

## 1. Read the JSON file

In [2]:
# Load the raw FAQ data. The encoding is passed explicitly because the answers
# contain non-ASCII punctuation (curly quotes); without it the file would be
# decoded with the platform's default encoding, which is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f:
    doc_raw = json.load(f)

In [26]:
# Flatten the per-course structure into one list of FAQ entries, each tagged
# with the name of the course it came from. Every entry is copied instead of
# being edited in place, so `doc_raw` stays exactly as loaded and the cell can
# be re-run without side effects.
documents = [
    {**doc, 'course': course['course']}
    for course in doc_raw
    for doc in course['documents']
]

In [27]:
# Peek at the first flattened entry to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## 2. Embedding model

In [29]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.34.0->sentence-transformers)
  Downloading regex-2024.7.24-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.34.0->sentence-transformers)
  Downloading safetensors-0.4.4-cp310-cp310-manylinux_2_17_x86_64.m

In [30]:
from sentence_transformers import SentenceTransformer

In [31]:
# Embedding model used below for both the FAQ texts and the search query.
model = SentenceTransformer('all-mpnet-base-v2')



In [33]:
# Length of a single embedding; the 'dims' value in the index mapping must match it.
len(model.encode('Hi'))

768

In [38]:
# Check what encode() returns; the vectors are converted to plain lists before indexing.
type(model.encode('Hi'))

numpy.ndarray

In [34]:
# Look at the first entry again before computing embeddings for every entry.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [39]:
# Build the records to index: every FAQ entry plus an embedding of its 'text'.
# A new dict is created per entry, so `documents` is left untouched and
# re-running this cell always starts from the same input.
operation = [
    {
        **doc,
        # encode() returns a NumPy array; store it as a plain list of floats.
        'text_vector': model.encode(doc['text']).tolist(),
    }
    for doc in documents
]

## 3. Elasticsearch

In [47]:
import elasticsearch

In [48]:
# Version of the installed `elasticsearch` Python package (the server version is reported by client.info()).
elasticsearch.VERSION

(8, 14, 0)

In [43]:
# Client for the Elasticsearch node expected at localhost:9200.
client = Elasticsearch("http://localhost:9200")

In [45]:
# Ask the node for its name, cluster and version details; this also confirms it is reachable.
client.info()

ObjectApiResponse({'name': '3657f72a5d01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'hyZJ4QPTR1OfoYOfmampIA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [49]:
# Index definition: one primary shard, no replicas, and the field mappings.
index_setting = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 0,
    },
    'mappings': {
        'properties': {
            # Full-text fields.
            'text': {'type': 'text'},
            'section': {'type': 'text'},
            'question': {'type': 'text'},
            # keyword: kept as one un-analyzed term, so it only matches the exact value.
            'course': {'type': 'keyword'},
            # Embedding of the 'text' field.
            'text_vector': {
                'type': 'dense_vector',
                'dims': 768,  # must equal the length of the model's embeddings
                # Needed for the field to be searchable with kNN. Set explicitly:
                # Elasticsearch releases before 8.11 default this to False.
                'index': True,
                'similarity': 'cosine',
            },
        },
    },
}

# Note: from Elasticsearch 8.11 on, 'dims', 'index' and 'similarity' are optional
# in a dense_vector mapping; older servers need them spelled out as above.

In [50]:
# Name of the index that stores the FAQ entries and their embeddings.
index_name = 'course-question'

In [51]:
# Start from a clean index so the notebook can be re-run from the top: creating
# an index that already exists raises an error, and indexing into a leftover
# index would store every document a second time.
# WARNING: this drops any data already stored under `index_name`.
client.indices.delete(index=index_name, ignore_unavailable=True)
client.indices.create(index=index_name, body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-question'})

## 4. Add documents to the index

In [64]:
# Send the enriched entries to Elasticsearch one by one. A failing entry is
# reported and skipped so the remaining entries are still indexed.
for doc in operation:
    try:
        client.index(index=index_name, document=doc)
    except Exception as err:
        print(err)

## 5. Create the query

In [53]:
# The user's question and its embedding, produced with the same model as the indexed vectors.
query = 'windows or mac?'
query_e = model.encode(query)

In [56]:
# Parameters of the kNN search. The variable name is kept exactly as spelled
# because the search cells refer to it.
quert_setting = {
    'field': 'text_vector',     # vector field to search
    'query_vector': query_e,    # embedding of the question
    'k': 5,                     # number of nearest neighbours to return
    'num_candidates': 10000,    # candidates examined per shard before picking the top k
}

In [65]:
# Run the kNN search and return only the listed fields of each hit.
res = client.search(
    index=index_name,
    knn=quert_setting,
    source=['question', 'section', 'text', 'course'],
)

In [67]:
# First hit of the response (presumably the best-scoring one, given default score ordering).
res['hits']['hits'][0]

{'_index': 'course-question',
 '_id': 'jEXZPZEBawL6flzDnd9m',
 '_score': 0.7147919,
 '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}}

In [68]:
# Same kNN search as before, but each hit carries only its answer text.
res1 = client.search(index=index_name, knn=quert_setting, source=['text'])
res1['hits']['hits'][0]

{'_index': 'course-question',
 '_id': 'jEXZPZEBawL6flzDnd9m',
 '_score': 0.7147919,
 '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}}