In [2]:
import json

fpath = "/workspaces/llm-zoomcamp/01-intro/documents.json"

with open(fpath, 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc["course"] = course_dict["course"]
        documents.append(doc)

documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

## Create Embeddings

In [6]:
!pip install sentence_transformers==2.7.0

Collecting sentence_transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence_transformers==2.7.0)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub>=0.15.1 (from sentence_transformers==2.7.0)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.34.0->sentence_transformers==2.7.0)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.34.0->sentence_transformers==2.7.0)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Downloading huggingface_hub-0.30.1-py3-none-any.whl (481 kB)
Downloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
[2K 

In [7]:
from sentence_transformers import SentenceTransformer

In [9]:
MODEL_NAME = "all-mpnet-base-v2"
pretrained_model = SentenceTransformer(MODEL_NAME)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
operations = []

for doc in documents:
    doc["text_vector"] = pretrained_model.encode(doc["text"]).tolist()
    operations.append(doc)

In [13]:
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.041030403226614,
  0.025834161788225174,
  -0.036801841109991074,
  -0.020898321643471718,
  -0.020596304908394814,
  0.009353742003440857,
  -0.003331671468913555,
  -0.009491903707385063,
  0.030117977410554886,
  0.01908210851252079,
  0.012690035626292229,
  -0.017078785225749016,
  -0.0016324761090800166,
  0.12997251749038696,
  0.030969230458140373,
  -0.025823738425970078,
  0.0278230682015419,
  0.025159770622849464,
  -0.0808122381567955,
  -0.0036173474509269,
  -0.008902025409042835,
  0.003404824063181877,
  -0.0230092890560627,
  -0.03404529020190239,
  0.024598615244030952,
  0.013545555993914604,
  -0.025439025834202766,
  0.011951087042689323,
  -0.020540112629532814,
  -0.010077380575239658,
  0.020575348287820816,
  0.0

## Working with Elasticsearch

In [14]:
from elasticsearch import Elasticsearch

In [15]:
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': '0d7a7d9d108e', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Pv9ZR4FwRRuzh6wgX6FFcA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Create mappings and index with Elastic

In [17]:
# need to create a mapping first before creating index
index_name = "course-questions"

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {
                "type": "dense_vector",
                "dims": 768, 
                "index": True, 
                "similarity": "cosine"
            },
        }
    }
}


In [18]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

## Add documents into index

In [19]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

## Create search query

In [20]:
search_term = "Windows or Mac?"
vector_search_term = pretrained_model.encode(search_term)

In [22]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 2000
}

In [23]:
response = es_client.search(
    index=index_name,
    knn=query,
    source=["text", "section", "question", "course"]
)

In [26]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'zsybB5YB757NQWu_JYFA',
  '_score': 0.7147919,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': '4cybB5YB757NQWu_N4TH',
  '_score': 0.61347336,
  '_source': {'question': 'WSL instructions',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'If you wish to use WSL on your windows machine, here are the setup instructions:\nCommand: Sudo apt install wget\nGet Anaconda download address here. wget <download address>\nTurn on Docker Desktop WFree Download | AnacondaSL2\nCommand: git clone <github repository address>\nVSCODE on WSL\nJupyter: pip3 install jupyter\nAdded by Gregory Morris (gwm1980@gmail.com)\nAll in all softwares a

In [28]:
top_result = response["hits"]["hits"][0]
top_result

{'_index': 'course-questions',
 '_id': 'zsybB5YB757NQWu_JYFA',
 '_score': 0.7147919,
 '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}}

In [29]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [34]:
response_2 = es_client.search(
    index=index_name,
    query={
        "match": {
            "course": "data-engineering-zoomcamp"
        }
    },
    knn=knn_query,
    size=5,
    explain=True
)

In [35]:
response_2["hits"]["hits"][0]["_source"]

{'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
 'section': 'General course-related questions',
 'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.026965461671352386,
  -0.000626126304268837,
  -0.01662949100136757,
  0.05285150930285454,
  0.05476527288556099,
  -0.03133990615606308,
  0.029942581430077553,
  -0.04808562621474266,
  0.04467551037669182,
  0.005839474033564329,
  0.016233040019869804,
  0.012001154012978077,
  -0.031222281977534294,
  0.016600528731942177,
  -0.04886901378631592,
  -0.06496307998895645,
  0.046434223651885986,
  -0.009297756478190422,
  -0.0642528235912323,
  -0.01373267825692892,
  -0.015976183116436005,
  0.008629541844129562,
  -0.024478990584611893,
  -0.0059806122444570065,
  0.016313830390572548,
  -0.02634184993803501,
  -0.07652202248573303,
  0.010045071132481098,
  -0.018078546971082687,
