In [4]:
import json

from elasticsearch import Elasticsearch
from elasticsearch import helpers

## 1. Read JSON file

In [5]:
# Load the raw course FAQ data.
# The encoding is passed explicitly: the texts contain non-ASCII characters
# (curly quotes), and the platform default encoding is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f:
    doc_raw = json.load(f)

In [6]:
# Flatten the per-course structure into one list of documents, tagging every
# document with the name of the course it belongs to.
# Each record is a shallow copy, so the dicts inside `doc_raw` are left
# untouched and re-running the cell always gives the same result.
documents = [
    {**doc, 'course': course['course']}
    for course in doc_raw
    for doc in course['documents']
]

In [7]:
# Peek at the first flattened record to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## 2. Embedding model

In [8]:
pip install sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [10]:
# Load the sentence-embedding model by name; `model` is the encoder used by the
# cells below.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)



In [11]:
# Length of the embedding produced for a short sample string.
sample_vector = model.encode('Hi')
len(sample_vector)

768

In [12]:
# Python type of the object returned by encode() for a single string.
sample_embedding = model.encode('Hi')
type(sample_embedding)

numpy.ndarray

In [13]:
# Look at the first record again; its 'text' field is what gets embedded below.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
# Embed the 'text' field of every document and build the records to index.
# - All texts go through one batched encode() call instead of one model call
#   per document.
# - Each record is a copy of the document plus a 'text_vector' field, so the
#   dicts in `documents` are not modified and the cell can be re-run safely.
texts = [doc['text'] for doc in documents]
text_vectors = model.encode(texts, show_progress_bar=True)

operation = [
    # .tolist() turns the numpy row into a plain list of floats
    {**doc, 'text_vector': vector.tolist()}
    for doc, vector in zip(documents, text_vectors)
]

## 3. Elasticsearch

In [None]:
import elasticsearch

In [None]:
# Display the VERSION attribute of the imported elasticsearch package.
elasticsearch.VERSION

In [None]:
# Create the client for the Elasticsearch node at localhost:9200.
ES_URL = "http://localhost:9200"
client = Elasticsearch(ES_URL)

In [None]:
# Ask the server for its info, as a quick check that the client can talk to it.
client.info()

In [None]:
# Index definition: one primary shard, no replicas, plus the field mappings.
#   text / section / question -> 'text' fields
#   course                    -> 'keyword' field (a search must match the exact value)
#   text_vector               -> 768-dimensional dense vector compared by cosine similarity
#
# Note: from Elasticsearch 8.11 on, 'dims', 'index' and 'similarity' are
# optional in a dense_vector mapping.
text_vector_field = {
    'type': 'dense_vector',
    'dims': 768,
    'index': True,  # True is also the default; controls whether the field can be searched
    'similarity': 'cosine',
}

index_setting = {
    'settings': dict(number_of_shards=1, number_of_replicas=0),
    'mappings': {
        'properties': {
            **{name: {'type': 'text'} for name in ('text', 'section', 'question')},
            'course': {'type': 'keyword'},
            'text_vector': text_vector_field,
        },
    },
}

In [None]:
# Name of the Elasticsearch index used by the cells below.
index_name = "course-question"

In [None]:
# (Re)create the index from scratch so that this cell can be re-run: creating
# an index that already exists is rejected by Elasticsearch.
# NOTE: this removes everything previously stored under `index_name`.
client.indices.delete(index=index_name, ignore_unavailable=True)

# Settings and mappings are passed as typed keyword arguments, in line with
# the `document=` / `knn=` keyword style used by the other client calls.
client.indices.create(
    index=index_name,
    settings=index_setting['settings'],
    mappings=index_setting['mappings'],
)

## 4. Add docs to the index

In [None]:
# Index all records with the bulk helper: batched requests instead of one
# HTTP round-trip per document.
# Indexing stays best-effort for individual documents (raise_on_error=False):
# a rejected document does not abort the cell, but every failure is collected
# and reported together with a success count. Connection-level failures are
# no longer swallowed and will surface as exceptions.
actions = ({'_index': index_name, '_source': doc} for doc in operation)
indexed_count, index_errors = helpers.bulk(client, actions, raise_on_error=False)

print(f'indexed {indexed_count} of {len(operation)} documents')
for error in index_errors:
    print(error)

## 5. Create query

In [None]:
# Example user question and its embedding, computed with the same `model`
# that is used to embed the documents.
query = 'windows or mac?'
query_e = model.encode(query)

In [None]:
# Parameters of the kNN search: compare the query embedding against the
# 'text_vector' field and return the 5 nearest neighbours.
quert_setting = dict(
    field='text_vector',
    query_vector=query_e,
    k=5,                   # top-k nearest neighbours to return
    num_candidates=10000,  # size of the candidate pool
)

In [65]:
# Run the kNN search; only the listed fields are returned in each hit's _source.
returned_fields = ['question', 'section', 'text', 'course']
res = client.search(index=index_name, knn=quert_setting, source=returned_fields)

In [67]:
# Show the first hit of the search response.
res['hits']['hits'][0]

{'_index': 'course-question',
 '_id': 'jEXZPZEBawL6flzDnd9m',
 '_score': 0.7147919,
 '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}}

In [68]:
# Same kNN search, but only the 'text' field of each hit is returned.
res1 = client.search(index=index_name, knn=quert_setting, source=['text'])

# Show the first hit of that response.
first_hit = res1['hits']['hits'][0]
first_hit

{'_index': 'course-question',
 '_id': 'jEXZPZEBawL6flzDnd9m',
 '_score': 0.7147919,
 '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}}

In [None]:
# Run the kNN search once more, returning only the 'text' field of each hit.
res2 = client.search(
    index=index_name,
    knn=quert_setting,
    source=['text'],
)

# Show the first hit of this cell's own response (res2), so the cell does not
# depend on the result variable of a different cell.
res2['hits']['hits'][0]

Object `client.search` not found.
