> **_NOTE:_**  **This notebook must be run on a notebook instance that has access to an OpenSearch cluster!**

## Step 1

Initialize a client to access the OpenSearch cluster.

In [1]:
from opensearchpy import OpenSearch

# Connection options for the cluster at localhost:9200 (plain HTTP);
# certificate verification and the SSL warning are both switched off.
connection_options = {
    "hosts": "http://localhost:9200",
    "verify_certs": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**connection_options)

# Display the cluster health line as a quick connectivity check.
client.cat.health()

'1717666616 09:36:56 integTest yellow 1 1 true 7 7 0 0 3 0 - 70.0%\n'

## Step 2

Register and deploy the neural sparse model.

In [2]:
# Persistent ml_commons plugin settings applied before registering the model:
# only_run_on_ml_node is set to "false" and native_memory_threshold to "99".
ml_commons_settings = {
    "only_run_on_ml_node": "false",
    "native_memory_threshold": "99",
}
cluster_settings = {
    "persistent": {"plugins": {"ml_commons": ml_commons_settings}},
}

client.transport.perform_request("PUT", "/_cluster/settings", body=cluster_settings)

{'acknowledged': True,
 'persistent': {'plugins': {'ml_commons': {'only_run_on_ml_node': 'false',
    'native_memory_threshold': '99'}}},
 'transient': {}}

In [3]:
## bi-encoder
# Register the pretrained sparse encoding model and deploy it in the same
# request (deploy=true). The response contains the id of the async task.
bi_encoder_spec = {
    "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-v1",
    "version": "1.0.1",
    "model_format": "TORCH_SCRIPT",
}
client.transport.perform_request(
    "POST",
    "/_plugins/_ml/models/_register?deploy=true",
    body=bi_encoder_spec,
)

{'task_id': '21Po7I8BVxEYKIR0uDoX', 'status': 'CREATED'}

In [None]:
## for doc-only
# Registers the two models used by the doc-only setup: a tokenizer for
# queries and an encoder for documents. Both requests are asynchronous and
# each response carries its own task id.
register_and_deploy_path = "/_plugins/_ml/models/_register?deploy=true"

## tokenizer for query
tokenizer_task = client.transport.perform_request("POST", register_and_deploy_path, body={
    "name": "amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1",
    "version": "1.0.1",
    "model_format": "TORCH_SCRIPT"
})

## model to encode doc
doc_encoder_task = client.transport.perform_request("POST", register_and_deploy_path, body={
    "name": "amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1",
    "version": "1.0.1",
    "model_format": "TORCH_SCRIPT"
})

# A cell only displays its last expression, so show both responses together;
# otherwise the tokenizer's task id would be silently dropped.
{"tokenizer": tokenizer_task, "doc_encoder": doc_encoder_task}

In [4]:
# Fetch the status of the model registration task. The task id in the URL is
# the one returned by the bi-encoder registration call in this recorded run;
# substitute the id from your own run. The response reports 'state' and, in
# the recorded run, the resulting 'model_id'.
client.transport.perform_request("GET","/_plugins/_ml/tasks/21Po7I8BVxEYKIR0uDoX")

{'model_id': '3FPo7I8BVxEYKIR0ujrn',
 'task_type': 'REGISTER_MODEL',
 'function_name': 'SPARSE_ENCODING',
 'state': 'COMPLETED',
 'worker_node': ['mGYbq15vTMulWEFNMp6Duw'],
 'create_time': 1717666625559,
 'last_update_time': 1717666645867,
 'is_async': True}

In [5]:
# Model id copied from the task status response of this recorded run; replace
# it with the 'model_id' reported by your own cluster. It is used by the
# sparse_encoding ingest processor defined later.
model_id = "3FPo7I8BVxEYKIR0ujrn"

## Step 3

Set up the ingest processors and index

In [6]:
# First processor: split the "text" field into fixed-token-length chunks and
# store them in "text_chunk".
chunking_processor = {
    "text_chunking": {
        "algorithm": {
            "fixed_token_length": {
                "token_limit": 10,
                "overlap_rate": 0.2,
                "tokenizer": "standard",
            },
        },
        "field_map": {"text": "text_chunk"},
    },
}

# Second processor: run the registered model over "text_chunk" and write the
# result to "text_sparse".
sparse_encoding_processor = {
    "sparse_encoding": {
        "model_id": model_id,
        "field_map": {"text_chunk": "text_sparse"},
    },
}

client.transport.perform_request(
    "PUT",
    "/_ingest/pipeline/chunking-sparse-pipeline",
    body={
        "description": "A text chunking ingest pipeline",
        "processors": [chunking_processor, sparse_encoding_processor],
    },
)


{'acknowledged': True}

In [9]:
index_name = "test"

# Three shards, with every indexed document routed through the chunking +
# sparse-encoding pipeline by default.
index_settings = {
    "index.number_of_shards": 3,
    "default_pipeline": "chunking-sparse-pipeline",
}

# "text_sparse" is a nested field whose "sparse_encoding" sub-field is mapped
# as rank_features; "text" and "text_chunk" are plain text fields.
field_mappings = {
    "text": {"type": "text"},
    "text_chunk": {"type": "text"},
    "text_sparse": {
        "type": "nested",
        "properties": {
            "sparse_encoding": {"type": "rank_features"},
        },
    },
}

client.indices.create(
    index=index_name,
    body={
        "settings": index_settings,
        "mappings": {"properties": field_mappings},
    },
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'}

In [10]:
# Index one sample document; the index's default pipeline chunks and encodes
# it. refresh=True makes the document searchable as soon as the call returns,
# so the search cells that follow do not depend on the timing of the periodic
# index refresh when the notebook is run top to bottom.
client.index(
    index=index_name,
    body={"text": "1 2 3 4 5 6 7 8 9 " * 5},
    refresh=True,
)

{'_index': 'test',
 '_id': '3lPq7I8BVxEYKIR0NTr-',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [11]:
# match all: search the index without a query body to inspect the stored
# document, including the text_chunk and text_sparse fields it was given
client.search(index=index_name)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'test',
    '_id': '3lPq7I8BVxEYKIR0NTr-',
    '_score': 1.0,
    '_source': {'text': '1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 ',
     'text_chunk': ['1 2 3 4 5 6 7 8 9 1 ',
      '9 1 2 3 4 5 6 7 8 9 ',
      '8 9 1 2 3 4 5 6 7 8 ',
      '7 8 9 1 2 3 4 5 6 7 ',
      '6 7 8 9 1 2 3 4 5 6 ',
      '5 6 7 8 9 '],
     'text_sparse': [{'sparse_encoding': {'nine': 0.28217945,
        'numbers': 0.84184736,
        'rating': 0.27807358,
        'dice': 0.6914691,
        'seven': 0.8882875,
        'arithmetic': 0.053770408,
        'alphabet': 1.0231282,
        'three': 0.45908993,
        'todd': 0.07154571,
        'multi': 0.26249123,
        'frequency': 0.26369756,
        'division': 0.29245317,
        'number': 0.95898205,
        'score

In [12]:
# search using neural_sparse query
# The query supplies raw token weights (query_tokens) and targets the
# "sparse_encoding" sub-field under the nested path "text_sparse";
# score_mode is "max".
token_weights = {"numbers": 1.0}
sparse_clause = {
    "neural_sparse": {
        "text_sparse.sparse_encoding": {"query_tokens": token_weights},
    },
}
nested_clause = {
    "nested": {
        "score_mode": "max",
        "path": "text_sparse",
        "query": sparse_clause,
    },
}

client.search(index=index_name, body={"query": nested_clause})

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.8886719,
  'hits': [{'_index': 'test',
    '_id': '3lPq7I8BVxEYKIR0NTr-',
    '_score': 0.8886719,
    '_source': {'text': '1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 1 2 3 4 5 6 7 8 9 ',
     'text_chunk': ['1 2 3 4 5 6 7 8 9 1 ',
      '9 1 2 3 4 5 6 7 8 9 ',
      '8 9 1 2 3 4 5 6 7 8 ',
      '7 8 9 1 2 3 4 5 6 7 ',
      '6 7 8 9 1 2 3 4 5 6 ',
      '5 6 7 8 9 '],
     'text_sparse': [{'sparse_encoding': {'nine': 0.28217945,
        'numbers': 0.84184736,
        'rating': 0.27807358,
        'dice': 0.6914691,
        'seven': 0.8882875,
        'arithmetic': 0.053770408,
        'alphabet': 1.0231282,
        'three': 0.45908993,
        'todd': 0.07154571,
        'multi': 0.26249123,
        'frequency': 0.26369756,
        'division': 0.29245317,
        'number': 0.95898205,
  