In [10]:
import requests
import json
import time
from typing import List, Dict, Any

# Base URL of the Elasticsearch node; the request URLs below are built from it
ES_URL = "http://localhost:9200"
# Index that works are bulk-indexed into and that the search/stats requests target
INDEX_NAME = "works-semantic-local"

## Load and Transform Works Data

Load works from your JSON file and transform them to match the semantic search index schema.

In [11]:
import ijson
from pathlib import Path

def _value_or(value: Any, fallback: Any) -> Any:
    """Return ``fallback`` when ``value`` is ``None``, otherwise ``value`` unchanged."""
    return fallback if value is None else value


def transform_work(source_work: Dict[str, Any]) -> Dict[str, Any]:
    """Transform a source work document to match semantic search index schema.

    The result always contains ``id``, ``title``, ``description`` and the two
    semantic copies ``titleSemantic`` / ``descriptionSemantic``. The keys
    ``subjects``, ``contributors`` and ``production`` are emitted only when the
    corresponding key is present in ``source_work``.

    ``dict.get(key, default)`` only applies the default when the key is
    absent, so an explicit JSON ``null`` would otherwise leak ``None`` into a
    text field or raise when iterated. Every nullable value is therefore
    coalesced: null text becomes ``""``, null lists become ``[]`` and a null
    ``agent`` becomes ``{}``.

    Args:
        source_work: One work as decoded from the source JSON.

    Returns:
        A new dict holding only the fields copied below; ``source_work`` is
        not modified.
    """
    title = _value_or(source_work.get("title"), "")
    description = _value_or(source_work.get("description"), "")

    # Key order mirrors the original: basic fields first, then semantic copies
    transformed: Dict[str, Any] = {
        "id": source_work.get("id"),
        "title": title,
        "description": description,
        "titleSemantic": title,
        "descriptionSemantic": description,
    }

    # Transform subjects if present
    if "subjects" in source_work:
        transformed["subjects"] = [
            {
                "label": _value_or(subject.get("label"), ""),
                "concepts": [
                    {
                        "id": concept.get("id"),
                        "label": _value_or(concept.get("label"), ""),
                    }
                    for concept in _value_or(subject.get("concepts"), [])
                ],
            }
            for subject in _value_or(source_work["subjects"], [])
        ]

    # Transform contributors if present
    if "contributors" in source_work:
        transformed["contributors"] = [
            {
                "agent": {
                    # A contributor whose agent is null still yields an empty label
                    "label": _value_or(
                        _value_or(contributor.get("agent"), {}).get("label"), ""
                    )
                }
            }
            for contributor in _value_or(source_work["contributors"], [])
        ]

    # Transform production dates if present
    if "production" in source_work:
        transformed["production"] = [
            {
                "dates": [
                    {"label": _value_or(date.get("label"), "")}
                    for date in _value_or(prod.get("dates"), [])
                ]
            }
            for prod in _value_or(source_work["production"], [])
        ]

    return transformed

def load_works(file_path: str, max_docs: int = 1000) -> List[Dict[str, Any]]:
    """Load and transform works from JSON file (handles both array and NDJSON).

    The format is chosen from the first non-whitespace character of the file:
    ``[`` selects the JSON-array branch (streamed with ``ijson``), anything
    else is treated as newline-delimited JSON.

    Args:
        file_path: Path of the works file. Text is read as UTF-8 rather than
            with the platform default encoding.
        max_docs: Maximum number of successfully transformed works to return.
            Blank lines and records that fail to parse/transform are skipped
            and do not count against this limit.

    Returns:
        The transformed works, in file order.
    """
    works: List[Dict[str, Any]] = []

    # Check file format: skip leading whitespace so "\n[" still counts as an array
    with open(file_path, 'r', encoding='utf-8') as f:
        first_char = f.read(1)
        while first_char and first_char.isspace():
            first_char = f.read(1)
    is_json_array = first_char == '['

    if is_json_array:
        # JSON array format - stream parse with ijson
        print(f"Loading from JSON array (max {max_docs} docs)...")
        with open(file_path, 'rb') as f:
            for i, work in enumerate(ijson.items(f, 'item')):
                if len(works) >= max_docs:
                    break
                try:
                    works.append(transform_work(work))
                except Exception as e:
                    # Best effort: report the bad record and keep going
                    print(f"  Warning: Failed to transform work {i}: {e}")
                    continue
                # Progress reflects works actually loaded, not input position
                if len(works) % 100 == 0:
                    print(f"  Loaded {len(works)} works...")
    else:
        # Newline-delimited JSON format
        print(f"Loading from NDJSON (max {max_docs} docs)...")
        with open(file_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if len(works) >= max_docs:
                    break
                if not line.strip():
                    # Blank separator/trailing lines are not documents
                    continue
                try:
                    works.append(transform_work(json.loads(line)))
                except Exception as e:
                    # Best effort: report the bad line and keep going
                    print(f"  Warning: Failed to parse line {i}: {e}")
                    continue
                if len(works) % 100 == 0:
                    print(f"  Loaded {len(works)} works...")

    print(f"\nSuccessfully loaded and transformed {len(works)} works")
    return works

In [12]:
# Where the works dump lives and how many documents to pull from it
WORKS_FILE = "./works.json"
MAX_WORKS = 100

# Read the dump and reshape every work for the semantic index
works = load_works(WORKS_FILE, max_docs=MAX_WORKS)

# Preview the first transformed work when at least one was loaded
if len(works) > 0:
    sample_work = works[0]
    print("\nSample transformed work:")
    print(json.dumps(sample_work, indent=2))

Loading from NDJSON (max 100 docs)...
  Loaded 100 works...

Successfully loaded and transformed 100 works

Sample transformed work:
{
  "id": "wpe7g7wx",
  "title": "Ethiopian magic scrolls / by Jacques Mercier ; [translated from the French by Richard Pevear].",
  "description": "",
  "titleSemantic": "Ethiopian magic scrolls / by Jacques Mercier ; [translated from the French by Richard Pevear].",
  "descriptionSemantic": "",
  "subjects": [
    {
      "label": "Ethiopian magic scrolls",
      "concepts": [
        {
          "id": "nfh36ejb",
          "label": "Ethiopian magic scrolls"
        }
      ]
    },
    {
      "label": "Illumination of books and manuscripts, Ethiopian",
      "concepts": [
        {
          "id": "z9k738zt",
          "label": "Illumination of books and manuscripts, Ethiopian"
        }
      ]
    },
    {
      "label": "Illumination of books and manuscripts",
      "concepts": [
        {
          "id": "suze5c8w",
          "label": "Illuminatio

## Bulk Index Works

Index the transformed works into Elasticsearch.

In [13]:
# Announce how many works (the list produced by the load cell) are about to be
# sent to the Elasticsearch bulk API
print(f"Indexing {len(works)} works using bulk API...")

def bulk_index_works(works: List[Dict[str, Any]], batch_size: int = 500):
    """Index works in batches using the bulk API.

    Each batch is serialised as NDJSON (one ``index`` action line followed by
    the document line, using the work's ``id`` as ``_id``) and POSTed to
    ``{ES_URL}/{INDEX_NAME}/_bulk``.

    Args:
        works: Documents to index; each must have an ``"id"`` key.
        batch_size: Number of documents per bulk request (must be >= 1).

    Returns:
        ``(total_indexed, total_failed)``. A non-200 response counts the whole
        batch as failed; a 200 response with ``errors`` set is counted item by
        item.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently index nothing
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total_indexed = 0
    total_failed = 0

    for start in range(0, len(works), batch_size):
        batch = works[start:start + batch_size]

        # Build bulk request body: action line, then document line, per work
        bulk_lines = []
        for work in batch:
            bulk_lines.append(json.dumps({"index": {"_id": work["id"]}}))
            bulk_lines.append(json.dumps(work))

        # The bulk API requires the body to end with a newline
        bulk_data = "\n".join(bulk_lines) + "\n"

        response = requests.post(
            f"{ES_URL}/{INDEX_NAME}/_bulk",
            headers={"Content-Type": "application/x-ndjson"},
            # Send explicit UTF-8 bytes instead of relying on how a str body is encoded
            data=bulk_data.encode("utf-8"),
        )

        if response.status_code != 200:
            print(f"✗ Bulk request failed: {response.status_code}")
            # Show (a prefix of) the server's explanation instead of discarding it
            print(f"  Response: {response.text[:500]}")
            total_failed += len(batch)
            continue

        result = response.json()
        if result.get("errors"):
            # At least one item failed: count per item and remember the first failure
            batch_failed = 0
            first_failure = None
            for item in result.get("items", []):
                outcome = item.get("index", {})
                if outcome.get("status") in (200, 201):
                    total_indexed += 1
                else:
                    batch_failed += 1
                    if first_failure is None:
                        first_failure = outcome
            total_failed += batch_failed

            if first_failure is not None:
                error = first_failure.get("error")
                reason = error.get("reason", error) if isinstance(error, dict) else error
                print(
                    f"  ✗ {batch_failed} item(s) rejected; "
                    f"first error (_id={first_failure.get('_id')}): {reason}"
                )
        else:
            total_indexed += len(batch)

        print(f"  Processed {min(start + batch_size, len(works))}/{len(works)} works...")

    return total_indexed, total_failed

# Perform bulk indexing
indexed, failed = bulk_index_works(works, batch_size=500)

# Refresh index to make documents searchable; keep the response so the
# summary below reports what actually happened instead of assuming success
refresh_response = requests.post(f"{ES_URL}/{INDEX_NAME}/_refresh")

# Only show the check mark when every document was accepted
summary_mark = "✓" if failed == 0 else "✗"
print(f"\n{summary_mark} Indexing complete:")
print(f"  Successful: {indexed}")
print(f"  Failed: {failed}")
if refresh_response.ok:
    print("  Index refreshed - documents ready for search")
else:
    print(f"  ✗ Index refresh failed: {refresh_response.status_code}")

Indexing 100 works using bulk API...
  Processed 100/100 works...

✓ Indexing complete:
  Successful: 100
  Failed: 0
  Index refreshed - documents ready for search


## Semantic Search Query

Test semantic search with natural language queries.

In [None]:
def semantic_search(query_text: str, field: str = "titleSemantic", size: int = 5) -> Dict[str, Any]:
    """Run a ``semantic`` query against the index and return the decoded response.

    Args:
        query_text: Natural-language text to search for.
        field: Name of the semantic field the query targets.
        size: Maximum number of hits requested.

    Returns:
        The JSON body of the ``_search`` response as a dict. The HTTP status
        is not inspected here.
    """
    semantic_clause = {"field": field, "query": query_text}
    request_body = {"query": {"semantic": semantic_clause}, "size": size}

    search_url = f"{ES_URL}/{INDEX_NAME}/_search"
    json_headers = {"Content-Type": "application/json"}

    return requests.post(search_url, headers=json_headers, json=request_body).json()

def print_results(results: Dict[str, Any], query: str):
    """Pretty print search results.

    Prints a banner with the query, the total hit count, and for each hit its
    id, title, score and the first 100 characters of its description.

    Unlike a plain ``hit['_source']['description']`` lookup, this tolerates
    hits with a missing or null ``description``/``title``/``_score``, and it
    reports an error payload (a top-level ``"error"`` key) explicitly instead
    of presenting it as "Found 0 results".

    Args:
        results: Decoded JSON body of a ``_search`` response.
        query: The query text, shown in the banner only.
    """
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    # A failed search carries "error" instead of "hits"; surface it
    error = results.get('error')
    if error:
        reason = error.get('reason', error) if isinstance(error, dict) else error
        print(f"Search failed: {reason}\n")
        return

    hits_section = results.get('hits') or {}
    hits = hits_section.get('hits') or []

    # Accept both the object form ({"value": N, ...}) and a bare integer total
    total = hits_section.get('total', 0)
    if isinstance(total, dict):
        total = total.get('value', 0)

    print(f"Found {total} results\n")

    for i, hit in enumerate(hits, 1):
        source = hit.get('_source') or {}
        doc_id = source.get('id', hit.get('_id'))
        title = source.get('title') or ''
        description = source.get('description') or ''

        score = hit.get('_score')
        score_text = f"{score:.4f}" if isinstance(score, (int, float)) else "n/a"

        # Only mark the snippet as truncated when text was actually cut off
        snippet = description[:100] + ("..." if len(description) > 100 else "")

        print(f"{i}. [{doc_id}] {title}")
        print(f"   Score: {score_text}")
        print(f"   {snippet}")
        print()

## Experiment: Natural Language Queries

Try different natural language queries to see how semantic search understands intent.

In [None]:
# Query 1: Looking for evolution/biology content
# (the query text is held once so the search and the printed banner cannot drift apart)
evolution_query = "books about how species evolve and adapt"
results = semantic_search(evolution_query, field="descriptionSemantic")
print_results(results, evolution_query)

In [None]:
# Query 2: Looking for physics/cosmology content
# (the query text is held once so the search and the printed banner cannot drift apart)
cosmology_query = "understanding the universe and physics"
results = semantic_search(cosmology_query, field="descriptionSemantic")
print_results(results, cosmology_query)

In [None]:
# Query 3: Searching by concept rather than exact terms
# (the query text is held once so the search and the printed banner cannot drift apart)
heredity_query = "genetic information and heredity"
results = semantic_search(heredity_query, field="descriptionSemantic")
print_results(results, heredity_query)

## Compare: Semantic vs Keyword Search

Compare semantic search with traditional keyword matching.

In [None]:
def keyword_search(query_text: str, field: str = "description", size: int = 5) -> Dict[str, Any]:
    """Run a ``match`` (keyword) query against the index and return the decoded response.

    Args:
        query_text: Text to match.
        field: Name of the field the ``match`` query targets.
        size: Maximum number of hits requested.

    Returns:
        The JSON body of the ``_search`` response as a dict. The HTTP status
        is not inspected here.
    """
    match_clause = {field: query_text}
    request_body = {"query": {"match": match_clause}, "size": size}

    search_url = f"{ES_URL}/{INDEX_NAME}/_search"
    json_headers = {"Content-Type": "application/json"}

    return requests.post(search_url, headers=json_headers, json=request_body).json()

# Run the same query through both search styles and print the results one after the other
test_query = "genetic inheritance"
banner = "#" * 80

# Semantic pass: targets the descriptionSemantic field
print("\n" + banner)
print("SEMANTIC SEARCH")
print(banner)
semantic_results = semantic_search(test_query, field="descriptionSemantic")
print_results(semantic_results, test_query)

# Keyword pass: targets the plain description field
print("\n" + banner)
print("KEYWORD SEARCH")
print(banner)
keyword_results = keyword_search(test_query, field="description")
print_results(keyword_results, test_query)

## Experiment: Hybrid Search

Combine semantic and keyword search to get the best of both worlds.

In [None]:
def hybrid_search(query_text: str, semantic_field: str = "descriptionSemantic", 
                  keyword_field: str = "description", size: int = 5) -> Dict[str, Any]:
    """Run a ``bool``/``should`` query that mixes a semantic and a keyword clause.

    Args:
        query_text: Text used by both clauses.
        semantic_field: Field targeted by the ``semantic`` clause.
        keyword_field: Field targeted by the ``match`` clause.
        size: Maximum number of hits requested.

    Returns:
        The JSON body of the ``_search`` response as a dict. The HTTP status
        is not inspected here.
    """
    semantic_clause = {
        "semantic": {"field": semantic_field, "query": query_text}
    }
    keyword_clause = {
        "match": {
            keyword_field: {
                "query": query_text,
                "boost": 0.5,  # Lower weight for keyword matching
            }
        }
    }
    request_body = {
        "query": {"bool": {"should": [semantic_clause, keyword_clause]}},
        "size": size,
    }

    search_url = f"{ES_URL}/{INDEX_NAME}/_search"
    json_headers = {"Content-Type": "application/json"}

    return requests.post(search_url, headers=json_headers, json=request_body).json()

# Try the hybrid query; the banner label is tagged "(hybrid)" to tell it apart
# from the pure semantic runs
hybrid_query = "space and time"
results = hybrid_search(
    hybrid_query,
    semantic_field="descriptionSemantic",
    keyword_field="description",
)
print_results(results, "space and time (hybrid)")

## Custom Query Experiments

Try your own queries here!

In [None]:
# Your custom query: replace the placeholder string below with your own
# natural-language query before running this cell. As written, the placeholder
# text itself is what gets sent to the search.
my_query = "TODO: Enter your query here"

# Search the descriptionSemantic field and print the hits
results = semantic_search(my_query, field="descriptionSemantic")
print_results(results, my_query)

## Index Statistics

Check the index status and document count.

In [None]:
# Get index stats
response = requests.get(f"{ES_URL}/{INDEX_NAME}/_stats")
stats = response.json()

# An error payload (e.g. the index does not exist) has no "indices" entry for
# this index; report that instead of failing with a KeyError
index_entry = stats.get('indices', {}).get(INDEX_NAME)
if not response.ok or index_entry is None:
    print(f"Could not read stats for index '{INDEX_NAME}' (HTTP {response.status_code})")
else:
    index_stats = index_entry['total']
    print("Index Statistics:")
    print(f"  Documents: {index_stats['docs']['count']:,}")
    print(f"  Deleted: {index_stats['docs']['deleted']:,}")
    print(f"  Store size: {index_stats['store']['size_in_bytes'] / (1024**2):.2f} MB")

# Get a sample document; an empty index returns no hits, so guard the [0] lookup
response = requests.get(f"{ES_URL}/{INDEX_NAME}/_search?size=1")
sample_hits = response.json().get('hits', {}).get('hits', [])
if sample_hits:
    sample = sample_hits[0]['_source']
    print("\nSample document:")
    print(json.dumps(sample, indent=2))
else:
    print("\nNo sample document available (index is empty or the search failed).")