In [28]:
import requests
import json
import time

# Prerequisite: start the local Elasticsearch container before running any cell:
#   docker-compose up -d

# Base URL of the Elasticsearch HTTP API used by the Python cells in this notebook.
ES_URL = "http://localhost:9200"



## Check Elasticsearch is Running

In [None]:
# Fetch the cluster health summary and pretty-print the JSON body.
health_response = requests.get(f"{ES_URL}/_cluster/health")
health_report = health_response.json()
print(json.dumps(health_report, indent=2))

## Start Trial License

ML features require a trial or paid license.

In [30]:
# Ask the cluster to start a trial license and show its reply.
trial_url = f"{ES_URL}/_license/start_trial?acknowledge=true"
trial_reply = requests.post(trial_url).json()
print(f"Trial license: {trial_reply}")

# Fixed pause after the license change so the ML indices can initialize
# before the next cell starts polling.
ML_INIT_WAIT_SECONDS = 20
print("Waiting for ML indices to be ready...")
time.sleep(ML_INIT_WAIT_SECONDS)

Trial license: {'acknowledged': True, 'trial_was_started': True, 'type': 'trial'}
Waiting for ML indices to be ready...


## Wait for Cluster Ready

Wait for all shards to be active before importing models.

In [31]:
# Poll until the cluster reports green with no initializing shards and the
# ML inference recovery endpoint answers 200, or give up after the budget.
# (`time` and `requests` come from the imports cell at the top of the notebook.)
MAX_READY_CHECKS = 60     # number of polls before giving up
READY_POLL_SECONDS = 3    # pause between polls (about 3 minutes in total)

for _ in range(MAX_READY_CHECKS):
    response = requests.get(f"{ES_URL}/_cluster/health")
    health = response.json()
    # .get() keeps the loop polling when Elasticsearch answers with an error
    # body that lacks the health fields, instead of dying with a KeyError.
    status = health.get('status')
    initializing = health.get('initializing_shards')

    # Check for ML indices specifically.
    # NOTE(review): a wildcard index pattern may answer 200 even when no index
    # matches, which would make this check always pass — confirm.
    ml_response = requests.get(f"{ES_URL}/.ml-inference-*/_recovery")
    ml_ready = ml_response.status_code == 200

    if status == 'green' and initializing == 0 and ml_ready:
        print("Cluster and ML indices are ready!")
        break

    status_info = f"status={status}, initializing={initializing}, ml_indices={ml_ready}"
    print(f"Waiting for cluster... {status_info}")
    time.sleep(READY_POLL_SECONDS)
else:
    # for/else: reached only when every attempt was used without a `break`.
    print("Warning: Cluster may not be fully ready")

Cluster and ML indices are ready!


## Import Model from Hugging Face

Import the `sentence-transformers/msmarco-MiniLM-L12-cos-v5` model using Eland. This downloads the model, converts it to TorchScript, and uploads it to Elasticsearch.

**This may take several minutes on first run.**

In [32]:
%%sh
# Import the Hugging Face model into the Elasticsearch node at localhost:9200
# as a text_embedding model; --start also starts its deployment after upload.
# NOTE(review): the URL is hardcoded here and must match ES_URL in the config cell.
eland_import_hub_model \
  --url http://localhost:9200 \
  --hub-model-id sentence-transformers/msmarco-MiniLM-L12-cos-v5 \
  --task-type text_embedding \
  --start

2026-01-09 14:05:26,808 INFO : Establishing connection to Elasticsearch
2026-01-09 14:05:26,841 INFO : Connected to cluster named 'docker-cluster' (version: 8.18.0)
2026-01-09 14:05:26,842 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/msmarco-MiniLM-L12-cos-v5'
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask
2026-01-09 14:05:31,538 INFO : Creating model with id 'sentence-transformers__msmarco-minilm-l12-cos-v5'
2026-01-09 14:05:32,080 INFO : Uploading model definition
100%|██████████| 127/127 [00:05<00:00, 21.92 parts/s]
2026-01-09 14:05:37,874 INFO : Uploading model vocabulary
2026-01-09 14:05:37,934 INFO : Starting model deployment
2026-01-09 14:05:41,520 INFO : Model successfully imported with id 'sentence-transformers__msmarco-minilm-l12-cos-v5'


## Check Model Status

Verify the model was imported and started successfully.

In [33]:
# Model identifier reported by the Eland import log; reused when creating the inference endpoint.
model_id = "sentence-transformers__msmarco-minilm-l12-cos-v5"

# Fetch the trained-model statistics for that id.
stats = requests.get(f"{ES_URL}/_ml/trained_models/{model_id}/_stats").json()
model_entries = stats["trained_model_stats"] if "trained_model_stats" in stats else []

if len(model_entries) == 0:
    print("Model not found or not deployed")
else:
    # Report the first entry; missing deployment details fall back to defaults.
    model_stats = model_entries[0]
    print(f"Model ID: {model_stats['model_id']}")

    deployment = model_stats.get('deployment_stats', {})
    print(f"Deployment state: {deployment.get('state', 'not deployed')}")

    allocation = deployment.get('allocation_status', {})
    print(f"Allocations: {allocation.get('allocation_count', 0)}")

Model ID: sentence-transformers__msmarco-minilm-l12-cos-v5
Deployment state: started
Allocations: 1


## Create Inference Endpoint

Create an inference endpoint that uses our model.

In [34]:
# Endpoint definition: the "elasticsearch" service backed by the imported model,
# with a single allocation and a single thread.
service_settings = {"model_id": model_id, "num_allocations": 1, "num_threads": 1}
inference_config = {"service": "elasticsearch", "service_settings": service_settings}

# Register the configuration under the inference id "msmarco-embeddings".
endpoint_url = f"{ES_URL}/_inference/text_embedding/msmarco-embeddings"
json_headers = {"Content-Type": "application/json"}
response = requests.put(endpoint_url, headers=json_headers, json=inference_config)
print(response.json())

{'inference_id': 'msmarco-embeddings', 'task_type': 'text_embedding', 'service': 'elasticsearch', 'service_settings': {'num_allocations': 1, 'num_threads': 1, 'model_id': 'sentence-transformers__msmarco-minilm-l12-cos-v5', 'dimensions': 384, 'similarity': 'cosine', 'element_type': 'float'}, 'chunking_settings': {'strategy': 'sentence', 'max_chunk_size': 250, 'sentence_overlap': 1}}


## Create Index with Semantic Search

Create the `works-semantic-local` index from the mappings in `mappings.works_semantic.json`, which include the `semantic_text` fields.

In [35]:
# Read the index mappings from the JSON file (path is relative to the working directory).
with open('mappings.works_semantic.json', 'r') as mappings_file:
    mappings = json.load(mappings_file)

# Create the index with those mappings and show the response.
index_url = f"{ES_URL}/works-semantic-local"
index_body = {"mappings": mappings}
create_response = requests.put(
    index_url,
    headers={"Content-Type": "application/json"},
    json=index_body,
)

print(f"Status: {create_response.status_code}")
print(json.dumps(create_response.json(), indent=2))

Status: 200
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "works-semantic-local"
}


## Test with a Sample Document

Index a test document to verify everything works.

In [36]:
# The plain and *Semantic fields carry the same text, so define each value once.
sample_title = "A Brief History of Time"
sample_description = "Stephen Hawking explores the nature of the universe, black holes, and the theory of relativity."

test_doc = {
    "id": "test-001",
    "title": sample_title,
    "titleSemantic": sample_title,
    "description": sample_description,
    "descriptionSemantic": sample_description,
}

# Index the document under the explicit id test-001 and show the response.
doc_url = f"{ES_URL}/works-semantic-local/_doc/test-001"
index_response = requests.post(
    doc_url,
    headers={"Content-Type": "application/json"},
    json=test_doc,
)

print(f"Status: {index_response.status_code}")
print(json.dumps(index_response.json(), indent=2))

Status: 201
{
  "_index": "works-semantic-local",
  "_id": "test-001",
  "_version": 1,
  "result": "created",
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0
  },
  "_seq_no": 0,
  "_primary_term": 1
}


## Test Semantic Search

Query using semantic search.

In [37]:
# Make the freshly indexed document visible to search. An explicit refresh is
# deterministic, unlike sleeping and hoping the periodic refresh has already run.
refresh_response = requests.post(f"{ES_URL}/works-semantic-local/_refresh")
refresh_response.raise_for_status()

# Semantic query against the titleSemantic field.
query = {
    "query": {
        "semantic": {
            "field": "titleSemantic",
            "query": "books about physics and cosmology"
        }
    }
}

response = requests.post(
    f"{ES_URL}/works-semantic-local/_search",
    headers={"Content-Type": "application/json"},
    json=query
)

print(f"Status: {response.status_code}")
results = response.json()

PREVIEW_CHARS = 100  # maximum number of description characters shown per hit

if not response.ok:
    # Error bodies carry no 'hits' key; show the error instead of raising KeyError.
    print(json.dumps(results, indent=2))
else:
    print(f"\nFound {results['hits']['total']['value']} results")
    for hit in results['hits']['hits']:
        source = hit['_source']
        description = source['description']
        # Append the ellipsis only when the description was actually truncated.
        if len(description) > PREVIEW_CHARS:
            preview = f"{description[:PREVIEW_CHARS]}..."
        else:
            preview = description
        print(f"\nScore: {hit['_score']}")
        print(f"Title: {source['title']}")
        print(f"Description: {preview}")

Status: 200

Found 1 results

Score: 0.58017063
Title: A Brief History of Time
Description: Stephen Hawking explores the nature of the universe, black holes, and the theory of relativity....


## Next Steps

Now you're ready to:
1. Load your full works snapshot
2. Experiment with different queries
3. Compare semantic vs keyword search results
4. Try hybrid queries combining both approaches