# Serverless Elasticsearch Semantic Search

This notebook loads data into a serverless Elasticsearch project and experiments with semantic search using `semantic_text` fields.

## Install and Import Required Libraries

In [55]:
import json
import requests
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime
import boto3

# Get API key from AWS Secrets Manager
def get_elasticsearch_api_key():
    """Fetch the Elasticsearch API key stored in AWS Secrets Manager.

    Uses the ``platform-developer`` AWS profile and the ``eu-west-1`` region.

    Returns:
        The secret's ``SecretString`` value, or an empty string if the lookup
        raised (the error is printed rather than propagated).
    """
    secrets_client = boto3.session.Session(profile_name='platform-developer').client(
        service_name='secretsmanager',
        region_name="eu-west-1"
    )

    try:
        # Only the lookup itself is guarded; session/client creation errors propagate.
        lookup = secrets_client.get_secret_value(
            SecretId="agnes/elasticsearch/semantic-playground"
        )
        return lookup['SecretString']
    except Exception as err:
        print(f"Error retrieving secret: {err}")
        return ""

# get_elasticsearch_api_key() returns "" on failure, so only report success
# when a non-empty key actually came back.
API_KEY = get_elasticsearch_api_key()
if API_KEY:
    print("API key retrieved from AWS Secrets Manager")
else:
    print("WARNING: no API key retrieved; Elasticsearch requests will fail authentication")

API key retrieved from AWS Secrets Manager


## Configure Elasticsearch Connection

Configure your serverless Elasticsearch credentials. Update with your actual values from the Elasticsearch serverless console.

In [56]:
# Serverless Elasticsearch Configuration
# Base URL that every request in this notebook is sent to (no trailing slash).
ES_ENDPOINT = "https://semantic-playground-b28f61.es.eu-west-1.aws.elastic.cloud:443"

# Index configuration
# Name of the index that is created, populated and searched below.
INDEX_NAME = "works-semantic-v1"

# Setup headers for authentication
# API_KEY is read once, when this cell runs; re-run the cell if the key changes.
headers = {
    "Authorization": f"ApiKey {API_KEY}",
    "Content-Type": "application/json"
}

def es_request(method: str, endpoint: str, data: dict = None, timeout: float = 120) -> dict:
    """Make an authenticated request to Elasticsearch and return the parsed JSON.

    Args:
        method: HTTP verb; one of GET, POST, PUT or DELETE (case-insensitive).
        endpoint: Path appended to ``ES_ENDPOINT`` (no leading slash).
        data: JSON-serialisable body. Only sent for POST and PUT; ignored for
            GET and DELETE.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.

    Raises:
        ValueError: If ``method`` is not one of the supported verbs.
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
    """
    verb = method.upper()
    # Validate first: an unknown verb used to fall through every branch and
    # fail later with an unrelated UnboundLocalError.
    if verb not in ("GET", "POST", "PUT", "DELETE"):
        raise ValueError(f"Unsupported HTTP method: {method!r}")

    url = f"{ES_ENDPOINT}/{endpoint}"
    # Only POST and PUT carry a body.
    payload = data if verb in ("POST", "PUT") else None
    response = requests.request(verb, url, headers=headers, json=payload, timeout=timeout)

    response.raise_for_status()
    return response.json() if response.text else {}

# Smoke-test the connection by requesting the cluster root document
try:
    info = es_request("GET", "")
    es_version = info['version']['number']
    print(f"Connected to Elasticsearch {es_version}")
    print(f"Cluster: {info['cluster_name']}")
except Exception as conn_err:
    # Any failure (network, auth, unexpected payload) is reported, not raised
    print(f"Connection failed: {conn_err}")

Connected to Elasticsearch 8.11.0
Cluster: b28f616a820849ab8b809ead5ba160f1


## Create Inference Endpoint

Set up the semantic text inference endpoint using Elasticsearch's ELSER model.
This only needs to be run once for each embedding model.

In [None]:
# Register a sparse-embedding inference endpoint backed by ELSER
inference_config = dict(
    service="elser",
    service_settings=dict(num_allocations=1, num_threads=1),
)

try:
    result = es_request("PUT", "_inference/sparse_embedding/elser-embeddings", inference_config)
    print("ELSER inference endpoint created successfully")
    print(json.dumps(result, indent=2))
except requests.exceptions.HTTPError as http_err:
    error_body = http_err.response.text
    # An existing endpoint is expected on re-runs, so it is not treated as an error
    if "resource_already_exists_exception" not in str(error_body):
        print(f"Error creating inference endpoint: {http_err}")
        print(error_body)
    else:
        print("ELSER inference endpoint already exists")
 

# response
# {
#   "inference_id": "elser-embeddings",
#   "task_type": "sparse_embedding",
#   "service": "elasticsearch",
#   "service_settings": {
#     "num_allocations": 1,
#     "num_threads": 1,
#     "model_id": ".elser_model_2_linux-x86_64",
#     "adaptive_allocations": {
#       "enabled": true,
#       "min_number_of_allocations": 0,
#       "max_number_of_allocations": 1
#     }
#   },
#   "chunking_settings": {
#     "strategy": "sentence",
#     "max_chunk_size": 250,
#     "sentence_overlap": 1
#   }
# }

## Load Sample Data

Load the works.json file containing catalogue records.

In [77]:
# Configure sample size
# Passed to load_works() as `limit` further down in this cell.
SAMPLE_SIZE = 10000  # Set to None to load all works

def load_works(file_path: str, limit: int = None) -> List[Dict]:
    """Load works from a JSON file (handles both array and NDJSON formats).

    The format is detected from the first non-whitespace character: ``[`` means
    a single JSON array, anything else is read as newline-delimited JSON with
    blank lines skipped.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8; a leading
            byte-order mark is tolerated.
        limit: Maximum number of works to return. ``None`` (or any falsy value
            such as ``0``) means no limit.

    Returns:
        A list of work dicts, at most ``limit`` long when a limit is given.
    """
    works = []
    # Explicit encoding: the platform default may not be UTF-8, which would
    # corrupt or reject non-ASCII text. "utf-8-sig" also strips a BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        # Skip leading whitespace so a pretty-printed array that starts with a
        # blank line or indentation is still recognised as an array.
        first_char = f.read(1)
        while first_char and first_char.isspace():
            first_char = f.read(1)
        f.seek(0)

        if first_char == '[':
            # JSON array format
            all_works = json.load(f)
            works = all_works[:limit] if limit else all_works
        else:
            # NDJSON format: stream line by line so a limit avoids reading the whole file
            for line in f:
                if line.strip():
                    works.append(json.loads(line))
                    if limit and len(works) >= limit:
                        break

    return works

# Load works
works_file = Path("./works.json")
# Defined up front so later cells can safely test `if works:` even when the
# file is missing (previously that raised NameError on a fresh kernel).
works = []
if works_file.exists():
    works = load_works(str(works_file), SAMPLE_SIZE)
    print(f"Loaded {len(works)} works" + (f" (limited to {SAMPLE_SIZE})" if SAMPLE_SIZE else ""))
    if works:
        print(f"\nSample work keys: {list(works[0].keys())}")
        print(f"\nSample work: {works[0]}")
    else:
        # Guard against indexing into an empty list when the file holds no records
        print(f"\nNo works found in {works_file}")

else:
    print(f"Error: {works_file} not found")

Loaded 10000 works (limited to 10000)

Sample work keys: ['production', 'physicalDescription', 'subjects', 'items', 'designation', 'workType', 'identifiers', 'formerFrequency', 'alternativeTitles', 'id', 'languages', 'partOf', 'genres', 'notes', 'holdings', 'title', 'type', 'contributors', 'images', 'availabilities', 'parts']

Sample work: {'production': [{'label': 'New York : G. Braziller, 1979.', 'agents': [{'label': 'G. Braziller', 'type': 'Agent'}], 'dates': [{'label': '1979', 'type': 'Period'}], 'type': 'ProductionEvent', 'places': [{'label': 'New York', 'type': 'Place'}]}], 'physicalDescription': '118 pages : illustrations (some colour) ; 29 cm', 'subjects': [{'label': 'Ethiopian magic scrolls', 'concepts': [{'id': 'nfh36ejb', 'label': 'Ethiopian magic scrolls', 'identifiers': [{'value': 'sh85045154', 'type': 'Identifier', 'identifierType': {'id': 'lc-subjects', 'type': 'IdentifierType', 'label': 'Library of Congress Subject Headings (LCSH)'}}], 'type': 'Concept'}], 'identifiers'

## Create Index with Semantic Text Mapping

Create an index with `semantic_text` fields for title and description.

In [None]:
# Read the index mapping definition from disk
with open('mappings.works_semantic.json', 'r') as mapping_file:
    mappings = json.load(mapping_file)

index_config = {"mappings": mappings}

# Summarise each top-level field; fields without an explicit type are nested/object
print("Loaded mapping with fields:")
for field_name, field_spec in mappings['properties'].items():
    field_type = field_spec.get('type', 'nested/object')
    print(f"  - {field_name}: {field_type}")

# Delete index if it exists
# try:
#     es_request("DELETE", INDEX_NAME)
#     print(f"\nDeleted existing index: {INDEX_NAME}")
# except:
#     pass

# Create the index with the loaded mapping; failures are reported, not raised
try:
    result = es_request("PUT", INDEX_NAME, index_config)
    print(f"Created index: {INDEX_NAME}")
    print(json.dumps(result, indent=2))
except Exception as create_err:
    print(f"Error creating index: {create_err}")

## Amend mapping

In [42]:
# Extend the existing index with a combined title+description semantic field
title_description_field = {
    "type": "semantic_text",
    "inference_id": "elser-embeddings",
    "model_settings": {
        "service": "elasticsearch",
        "task_type": "sparse_embedding"
    }
}
new_field_mapping = {
    "properties": {"titleDescriptionSemantic": title_description_field}
}

try:
    result = es_request("PUT", f"{INDEX_NAME}/_mapping", new_field_mapping)
    print("Successfully added titleDescriptionSemantic field to index")
    print(json.dumps(result, indent=2))
except Exception as mapping_err:
    # Report and continue so the rest of the notebook can still run
    print(f"Error updating mapping: {mapping_err}")


Successfully added titleDescriptionSemantic field to index
{
  "acknowledged": true
}


## Index Documents with Bulk API

Transform and bulk index the works into Elasticsearch.

In [78]:
def transform_work(work: Dict) -> Dict:
    """Reduce a raw work record to the fields indexed for semantic search.

    Keeps the id (falling back to ``canonicalId``), title and description,
    trimmed-down subjects, contributors and production entries, and three
    semantic fields: title, description, and the two joined by a space.
    Missing values default to empty strings or empty lists.
    """
    work_title = work.get('title', '')
    work_description = work.get('description', '')

    def _slim_subject(raw_subject: Dict) -> Dict:
        # Keep the label, plus id/label of each concept when any are present
        slim = {'label': raw_subject.get('label', '')}
        slim_concepts = [
            {'id': raw_concept.get('id', ''), 'label': raw_concept.get('label', '')}
            for raw_concept in raw_subject.get('concepts', [])
        ]
        if slim_concepts:
            slim['concepts'] = slim_concepts
        return slim

    slim_subjects = [_slim_subject(entry) for entry in work.get('subjects', [])]

    # Contributors keep only the agent's label
    slim_contributors = [
        {'agent': {'label': entry.get('agent', {}).get('label', '')}}
        for entry in work.get('contributors', [])
    ]

    # Production events keep only their date labels; events without dates are dropped
    slim_production = []
    for event in work.get('production', []):
        date_labels = [
            {'label': raw_date.get('label', '')}
            for raw_date in event.get('dates', [])
        ]
        if not date_labels:
            continue
        slim_production.append({'dates': date_labels})

    return {
        'id': work.get('id', work.get('canonicalId', '')),
        'title': work_title,
        'description': work_description,
        'subjects': slim_subjects,
        'contributors': slim_contributors,
        'production': slim_production,
        'titleSemantic': work_title,
        'descriptionSemantic': work_description,
        # Title and description joined by a space, with outer whitespace trimmed
        'titleDescriptionSemantic': f"{work_title} {work_description}".strip()
    }

def bulk_index_works(works: List[Dict], batch_size: int = 25) -> Dict:
    """Bulk index works to Elasticsearch in batches.

    Each work is passed through ``transform_work`` and sent to the ``_bulk``
    endpoint as an ``index`` action keyed by the document id. A batch whose
    request raises is retried up to three times with a growing pause; after the
    last attempt the whole batch is counted as failed.

    Args:
        works: Raw work records to index.
        batch_size: Number of documents per bulk request; must be at least 1.

    Returns:
        ``{"total_successes": int, "total_failures": int}``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import time

    # A zero step makes range() raise and a negative one silently indexes
    # nothing, so reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    start_time = datetime.now()

    total_successes = 0
    total_failures = 0
    ok_statuses = (200, 201)
    max_retries = 3
    # Loop-invariant, so computed once rather than per batch
    total_batches = (len(works) + batch_size - 1) // batch_size

    # Process in batches
    for i in range(0, len(works), batch_size):
        batch = works[i:i + batch_size]
        batch_num = (i // batch_size) + 1

        # Build bulk request for this batch: one action line plus one document line each
        bulk_data = []
        for work in batch:
            doc = transform_work(work)
            bulk_data.append(json.dumps({"index": {"_index": INDEX_NAME, "_id": doc['id']}}))
            bulk_data.append(json.dumps(doc))

        # The bulk API requires the NDJSON body to end with a newline
        bulk_body = "\n".join(bulk_data) + "\n"

        # Send bulk request with retry logic
        url = f"{ES_ENDPOINT}/_bulk"
        for attempt in range(max_retries):
            try:
                response = requests.post(
                    url,
                    headers={**headers, "Content-Type": "application/x-ndjson"},
                    data=bulk_body,
                    timeout=120  # Increased timeout for ELSER processing
                )
                response.raise_for_status()
                result = response.json()

                # Count successes and failures
                items = result.get('items', [])
                successes = sum(1 for item in items if item['index']['status'] in ok_statuses)
                failures = len(items) - successes

                total_successes += successes
                total_failures += failures

                print(f"Batch {batch_num}/{total_batches}: {successes} indexed, {failures} failed")

                if failures > 0:
                    # Show the first failed item wherever it sits in the batch
                    # (previously only the first item was inspected).
                    first_failed = next(
                        (item['index'] for item in items
                         if item['index']['status'] not in ok_statuses),
                        None
                    )
                    if first_failed is not None:
                        print(f"  Sample error: {first_failed.get('error', 'Unknown error')}")

                break  # Success, exit retry loop

            except Exception as e:
                if attempt < max_retries - 1:
                    print(f"Batch {batch_num} attempt {attempt + 1} failed: {e}. Retrying...")
                    time.sleep(3 * (attempt + 1))  # Progressive backoff: 3s, then 6s
                else:
                    print(f"Batch {batch_num} failed after {max_retries} attempts: {e}")
                    total_failures += len(batch)

    elapsed = (datetime.now() - start_time).total_seconds()
    # Guard against a zero-length interval (e.g. nothing to index on a coarse clock)
    rate = total_successes / elapsed if elapsed > 0 else 0.0
    print(f"\n{'='*60}")
    print(f"Total: {total_successes} indexed, {total_failures} failed in {elapsed:.2f}s")
    print(f"Rate: {rate:.1f} docs/sec")
    print(f"{'='*60}")

    return {"total_successes": total_successes, "total_failures": total_failures}

# Run the bulk indexer, but only when there is at least one work loaded
if works:
    result = bulk_index_works(works, batch_size=25)
    print("\nBulk indexing completed")


Batch 1/400: 25 indexed, 0 failed
Batch 2/400: 25 indexed, 0 failed
Batch 3/400: 25 indexed, 0 failed
Batch 4/400: 25 indexed, 0 failed
Batch 5/400: 25 indexed, 0 failed
Batch 6/400: 25 indexed, 0 failed
Batch 7/400: 25 indexed, 0 failed
Batch 8/400: 25 indexed, 0 failed
Batch 9/400: 25 indexed, 0 failed
Batch 10/400: 25 indexed, 0 failed
Batch 11/400: 25 indexed, 0 failed
Batch 12/400: 25 indexed, 0 failed
Batch 13/400: 25 indexed, 0 failed
Batch 14/400: 25 indexed, 0 failed
Batch 15/400: 25 indexed, 0 failed
Batch 16/400: 25 indexed, 0 failed
Batch 17/400: 25 indexed, 0 failed
Batch 18/400: 25 indexed, 0 failed
Batch 19/400: 25 indexed, 0 failed
Batch 20/400: 25 indexed, 0 failed
Batch 21/400: 25 indexed, 0 failed
Batch 22/400: 25 indexed, 0 failed
Batch 23/400: 25 indexed, 0 failed
Batch 24/400: 25 indexed, 0 failed
Batch 25/400: 25 indexed, 0 failed
Batch 26/400: 25 indexed, 0 failed
Batch 27/400: 25 indexed, 0 failed
Batch 28/400: 25 indexed, 0 failed
Batch 29/400: 25 indexed, 0 f

## Verify Data Upload

Check index stats and document count.

In [None]:
# Force a refresh so the documents indexed above are visible to count and search
es_request("POST", f"{INDEX_NAME}/_refresh")

# How many documents does the index hold?
count_result = es_request("GET", f"{INDEX_NAME}/_count")
print(f"Total documents in index: {count_result['count']}")

# On-disk size, converted from bytes to megabytes
stats_result = es_request("GET", f"{INDEX_NAME}/_stats")
index_totals = stats_result['indices'][INDEX_NAME]['total']
store_size = index_totals['store']['size_in_bytes']
print(f"Index size: {store_size / 1024 / 1024:.2f} MB")

# Inference endpoint stats are best-effort: a failure is reported, not raised
# NOTE(review): confirm this stats path exists on the target cluster.
try:
    inference_stats = es_request("GET", "_inference/_services/sparse_embedding/elser-embeddings/_stats")
    print("\nInference stats:")
    print(json.dumps(inference_stats, indent=2))
except Exception as stats_err:
    print(f"Could not get inference stats: {stats_err}")

## Search Queries

Template semantic, keyword, and hybrid RRF queries.

In [None]:
def search_keyword(query: str, size: int = 10) -> Dict:
    """Run a keyword ``multi_match`` search over title (boosted x3) and description.

    Args:
        query: Search query string.
        size: Number of results to return.

    Returns:
        The search response; hits carry only id, title and description.
    """
    multi_match_clause = {
        "query": query,
        "fields": ["title^3", "description"]
    }
    request_body = {
        "size": size,
        "query": {"multi_match": multi_match_clause},
        "_source": ["id", "title", "description"]
    }
    return es_request("POST", f"{INDEX_NAME}/_search", request_body)


def search_semantic(query: str, fields: List[str] = None, size: int = 10) -> Dict:
    """
    Query one or more semantic_text fields with a ``match`` clause per field.

    The per-field clauses are combined as ``should`` clauses of a bool query.

    Args:
        query: Search query string
        fields: Semantic fields to search, e.g. 'titleSemantic', 'descriptionSemantic',
                'titleDescriptionSemantic'. When None, titleSemantic and
                descriptionSemantic are both searched.
        size: Number of results to return
    """
    target_fields = ["titleSemantic", "descriptionSemantic"] if fields is None else fields

    # One match clause per requested field
    should_clauses = [
        {"match": {field_name: {"query": query}}}
        for field_name in target_fields
    ]

    request_body = {
        "size": size,
        "query": {"bool": {"should": should_clauses}},
        "_source": ["id", "title", "description"]
    }
    return es_request("POST", f"{INDEX_NAME}/_search", request_body)


def rrf_retriever(query: str, size: int = 10) -> dict:
    """
    Hybrid search: fuse a keyword retriever and a semantic retriever with
    Reciprocal Rank Fusion (RRF) through the Elasticsearch retriever API.

    Args:
        query: Search query string.
        size: Number of results to return.
    """
    # Keyword leg: multi_match over title (boosted x3) and description
    keyword_leg = {
        "standard": {
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["title^3", "description"]
                }
            }
        }
    }
    # Semantic leg: match against the combined title+description semantic field
    semantic_leg = {
        "standard": {
            "query": {
                "match": {
                    "titleDescriptionSemantic": {"query": query}
                }
            }
        }
    }

    request_body = {
        "retriever": {"rrf": {"retrievers": [keyword_leg, semantic_leg]}},
        "size": size
    }
    return es_request("POST", f"{INDEX_NAME}/_search", request_body)


def filtered_semantic_rerank(query: str, filter: dict, size: int = 10, candidate_size: int = 100) -> dict:
    """
    Two-pass search: select candidates with nested filters, then score only
    those candidates against the combined semantic field.

    Args:
        query: The semantic search query string.
        filter: Mapping of field path to filter value. Recognised prefixes:
            - "contributors.": nested term filter on path "contributors"
            - "production.": nested range filter on path "production.dates"
            Keys with any other prefix are ignored.
        size: Number of final results to return.
        candidate_size: Number of candidates fetched by the filter pass.
    Returns:
        The search response of the second pass, or an empty hits structure
        when the filter pass matched nothing.
    """
    returned_fields = ["id", "title", "description"]
    search_path = f"{INDEX_NAME}/_search"

    # Pass 1: translate each supported filter entry into a nested clause
    nested_filters = []
    for field_path, value in filter.items():
        if field_path.startswith("contributors."):
            nested_path = "contributors"
            leaf_query = {"term": {field_path: value}}
        elif field_path.startswith("production."):
            nested_path = "production.dates"
            leaf_query = {"range": {field_path: value}}
        else:
            continue
        nested_filters.append({"nested": {"path": nested_path, "query": leaf_query}})

    candidate_response = es_request("POST", search_path, {
        "size": candidate_size,
        "query": {"bool": {"filter": nested_filters}},
        "_source": returned_fields
    })
    candidate_ids = [
        candidate["_source"]["id"]
        for candidate in candidate_response["hits"]["hits"]
    ]

    # Nothing passed the filters, so there is nothing to rerank
    if not candidate_ids:
        return {"hits": {"total": {"value": 0}, "hits": []}}

    # Pass 2: restrict to the candidate ids and let the semantic match set the score
    return es_request("POST", search_path, {
        "size": size,
        "query": {
            "bool": {
                "filter": [{"terms": {"id": candidate_ids}}],
                "should": [{"match": {"titleDescriptionSemantic": {"query": query}}}]
            }
        },
        "_source": returned_fields
    })


def print_results(results: Dict):
    """Print a search response as a numbered, human-readable list.

    Each hit shows its score, id, title and works URL, followed by the first
    150 characters of the description when one is present.
    """
    total_found = results['hits']['total']['value']
    print(f"\nFound {total_found} results\n")

    for rank, hit in enumerate(results['hits']['hits'], start=1):
        fields = hit['_source']
        relevance = hit['_score']
        doc_id = fields.get('id', 'No ID')
        heading = f"{rank}. [Score: {relevance:.3f}] [{doc_id}] {fields.get('title', 'No title')}"
        print(heading)
        print(f"   https://wellcomecollection.org/works/{doc_id}")
        summary = fields.get('description', '')
        if summary:
            # The ellipsis is appended even when the description is short
            print(f"   {summary[:150]}...")
        print()

## Compare Semantic Field Strategies

Compare semantic search results across different field strategies:
1. **KEYWORD BASELINE**: Uses keyword match, with a boost of 3 on the title
2. **SEPARATE TITLE AND DESCRIPTION FIELDS**: Use separate text vectors, title and description
3. **CONCAT TITLE + DESCRIPTION**: Use a vector that concats title and description 
4. **NATIVE RRF**: Use both keyword and semantic search, with Reciprocal Rank Fusion
5. **FILTER AND SEMANTIC RERANK**: Filters to return a list of relevant results then rescore based on semantic field

In [89]:
# Run the same query through every search strategy and print each result list
test_query = "botany"

print("\n" + "=" * 80)
print("KEYWORD BASELINE")
print("=" * 80)
keyword_results = search_keyword(test_query, size=10)
print_results(keyword_results)

print("\n" + "=" * 80)
print("SEPARATE TITLE AND DESCRIPTION FIELDS (titleSemantic + descriptionSemantic)")
print("=" * 80)
desc_results = search_semantic(test_query, fields=["titleSemantic", "descriptionSemantic"], size=10)
print_results(desc_results)

print("\n" + "=" * 80)
print("CONCAT TITLE + DESCRIPTION (titleDescriptionSemantic)")
print("=" * 80)
combined_results = search_semantic(test_query, fields=["titleDescriptionSemantic"], size=10)
print_results(combined_results)

print("\n" + "=" * 80)
print("NATIVE RRF (keyword + semantic)")
print("=" * 80)
# The helper is defined as rrf_retriever; the previous call used the misspelt
# name rff_retriever, which raises NameError on a fresh kernel.
rff_results = rrf_retriever(test_query, size=10)
print_results(rff_results)

print("\n" + "=" * 80)
print("FILTER AND SEMANTIC RERANK")
print("=" * 80)
reranked_results = filtered_semantic_rerank(
    test_query, 
    filter={"production.dates.label": {"gte": "1900", "lte": "1990"}}, 
    # filter={"contributors.agent.label": "Royal College of Surgeons of England"}, 
    size=10
)
print_results(reranked_results)



KEYWORD BASELINE

Found 12 results

1. [Score: 30.536] [rr4gxybm] Environmental and experimental botany
   https://wellcomecollection.org/works/rr4gxybm

2. [Score: 27.989] [b2eyqe9f] Aquatic botany
   https://wellcomecollection.org/works/b2eyqe9f

3. [Score: 18.975] [p5vcv6uj] A synoptical compend of British botany ... arranged after the Linnean system / [John Kingston Galpine].
   https://wellcomecollection.org/works/p5vcv6uj

4. [Score: 18.752] [x33hq337] Cultivating women, cultivating science : Flora's daughters and botany in England, 1760-1860 / Ann B. Shteir.
   https://wellcomecollection.org/works/x33hq337

5. [Score: 17.778] [pcbzat7b] Percival, John (1865-1949), botanist, Emeritus Professor of Agriculture (Botany) at Reading University
   https://wellcomecollection.org/works/pcbzat7b
   1 autograph letter, signed, with envelope....

6. [Score: 17.085] [duz67tp2] The development of biochemistry in England through botany and the brewing industry 1840-1890 / Neil Davies Morgan.
