In [None]:
# Install the Hugging Face `datasets` package (quiet mode) used to download SQuAD v2.
!pip install -q datasets
# Install the AWS / OpenSearch client packages.
# NOTE(review): argparse is part of the Python standard library, so installing it
# from PyPI is probably unnecessary — confirm before removing it from this list.
!pip3 install boto3 requests requests_aws4auth argparse opensearch-py

### 1. 下载测试数据squad_v2

In [11]:
from datasets import load_dataset

# Name of the dataset to download; other names can be looked up on the Hugging Face datasets site
dataset_name = "squad_v2"

# Download the dataset
dataset = load_dataset(dataset_name)

# Print the dataset summary (splits, features, row counts)
print(dataset)

# Access one example from the training split
sample = dataset["train"][0]
print(sample)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "C

### 2. 创建用于测试的AOS索引 & 模型

- 创建sparse model
  + 进入OpenSearch的Integration页面，由于OpenSearch集群部署在VPC中，所以选择“Configure VPC Domain”，随后会弹出一个CloudFormation模板供填写。
    ![integration_1.png](./integration_1.png)<br>
    + vpc请选择OpenSearch所在的vpc，security group选择为OpenSearch同一个
    + 子网subnet请选择对应的Private subnet 
    <br>
  + 验证部署的neural-sparse模型
    进入CloudFormation对应的stack，切换到Outputs，获取modelId、ConnectorId以及SageMaker endpoint
    ![nerual-sparse.png](./nerual-sparse.png)

- 创建cohere模型以及ingest的pipeline

In [21]:
# Host name of the OpenSearch domain; passed to the setup script and to get_aos_client().
aos_endpoint='vpc-domain66ac69e0-2m4jji7cweof-4fefsofiqdzu3hxammxwq5hth4.us-west-2.es.amazonaws.com'
# sparse_model_id=<sparse_model_id> # look this value up in the CloudFormation stack
# ID of the deployed sparse model, used by the setup script and the sparse search cells.
sparse_model_id="EVNZh4wBpwn7Z6ncaEtm"
# Name of the index that documents are ingested into and searched against.
index_name="aos-retrieval"

In [None]:
# Run the setup script with the endpoint, sparse model ID and index name configured above.
# NOTE(review): per the notes in this notebook it creates the Cohere model and the ingest
# pipeline — the script itself is not shown here, confirm what it provisions.
!python3 setup_model_and_pipeline.py --aos_endpoint {aos_endpoint} --sparse_model_id {sparse_model_id} --index_name {index_name}

### 3. 执行数据摄入

In [29]:
import json
from setup_model_and_pipeline import get_aos_client

def ingest_data(aos_client, index_name, content):
    """Index a single document whose only field is `content`.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the target index.
        content: Text stored under the document's `content` field.

    Returns:
        Whatever `perform_request` returns for the POST to `/<index_name>/_doc`.
    """
    payload = json.dumps({"content": content})
    doc_url = f"/{index_name}/_doc"

    return aos_client.transport.perform_request(
        method="POST",
        url=doc_url,
        body=payload,
    )

# Client for the OpenSearch domain; reused by the search cells further down.
aos_client = get_aos_client(aos_endpoint)

# Ingest rows 0..1000 of the training split, keeping at most 2000 characters of each context.
for idx, item in enumerate(dataset["train"]):
    try:
        response = ingest_data(aos_client, index_name, item['context'][:2000])
    except Exception as e:
        # Best-effort ingestion: report the failing document and carry on with the next one.
        print(e)
        print(item['context'])
    else:
        if idx % 200 == 0:
            print(f"{idx}-th ingested.")
    # The stop check lives outside the try block so that a failure on the last
    # row cannot skip it and let the loop run over the whole training split.
    if idx == 1000:
        break

0-th ingested.
200-th ingested.
400-th ingested.
600-th ingested.
800-th ingested.
1000-th ingested.


### 4. 执行查询

In [42]:
def search_by_bm25(aos_client, index_name, query, topk=4):
    """Run a lexical `match` query on the `content` field and return the hit texts.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the index to search.
        query: Query text matched against `content`.
        topk: Value sent as the request's `size`.

    Returns:
        List with the `_source.content` of every returned hit, in response order.
    """
    match_clause = {"match": {"content": query}}
    payload = json.dumps({"size": topk, "query": match_clause})

    result = aos_client.transport.perform_request(
        method="GET",
        url=f"/{index_name}/_search",
        body=payload,
    )

    contents = []
    for hit in result["hits"]["hits"]:
        contents.append(hit["_source"]["content"])
    return contents

def search_by_dense(aos_client, index_name, query, dense_model_id, topk=4):
    """Run a `neural` (dense embedding) query and return the hit texts.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the index to search.
        query: Query text sent as `query_text`.
        dense_model_id: ID of the dense embedding model sent as `model_id`.
        topk: Number of documents to return; also used as the neural query's `k`.

    Returns:
        List with the `_source.content` of every returned hit, in response order.
    """
    request_body = {
        # Cap the page size explicitly: without it the default page size applies
        # rather than `topk`, so results are not comparable with search_by_bm25.
        "size": topk,
        "query": {
            "neural": {
                "dense_embedding": {
                    "query_text": query,
                    "model_id": dense_model_id,
                    "k": topk
                }
            }
        }
    }

    response = aos_client.transport.perform_request(
        method="GET",
        url=f"/{index_name}/_search",
        body=json.dumps(request_body)
    )

    return [hit["_source"]["content"] for hit in response["hits"]["hits"]]

def search_by_sparse(aos_client, index_name, query, sparse_model_id, topk=4):
    """Run a `neural_sparse` query and return the hit texts.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the index to search.
        query: Query text sent as `query_text`.
        sparse_model_id: ID of the sparse encoding model sent as `model_id`.
        topk: Number of documents to return (request `size`).

    Returns:
        List with the `_source.content` of every returned hit, in response order.
    """
    request_body = {
        # `size` is a top-level search parameter; nested inside `query` it is
        # parsed as an (unknown) query clause and the request is rejected.
        "size": topk,
        "query": {
            "neural_sparse": {
                "sparse_embedding": {
                    "query_text": query,
                    "model_id": sparse_model_id,
                    "max_token_score": 3.5
                }
            }
        }
    }

    response = aos_client.transport.perform_request(
        method="GET",
        url=f"/{index_name}/_search",
        body=json.dumps(request_body)
    )

    return [hit["_source"]["content"] for hit in response["hits"]["hits"]]

def search_by_dense_sparse(aos_client, index_name, query, sparse_model_id, dense_model_id, topk=4):
    """Run a `hybrid` query combining neural-sparse and dense retrieval.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the index to search.
        query: Query text sent as `query_text` to both sub-queries.
        sparse_model_id: ID of the sparse encoding model.
        dense_model_id: ID of the dense embedding model.
        topk: Number of documents to return (request `size`).

    Returns:
        List with the `_source.content` of every returned hit, in response order.
    """
    request_body = {
        # `size` is a top-level search parameter; nested inside `query` it is
        # parsed as an (unknown) query clause and the request is rejected.
        "size": topk,
        "query": {
            "hybrid": {
                "queries": [
                    {
                        "neural_sparse": {
                            "sparse_embedding": {
                                "query_text": query,
                                "model_id": sparse_model_id,
                                "max_token_score": 3.5
                            }
                        }
                    },
                    {
                        "neural": {
                            "dense_embedding": {
                                "query_text": query,
                                "model_id": dense_model_id,
                                "k": 10
                            }
                        }
                    }
                ]
            }
        }
    }

    # NOTE(review): the pipeline name is spelled "hybird" here; it is kept verbatim
    # because it has to match the name the setup script registered — confirm there.
    response = aos_client.transport.perform_request(
        method="GET",
        url=f"/{index_name}/_search?search_pipeline=hybird-search-pipeline",
        body=json.dumps(request_body)
    )

    return [hit["_source"]["content"] for hit in response["hits"]["hits"]]

def search_by_dense_bm25(aos_client, index_name, query, dense_model_id, topk=4):
    """Run a `hybrid` query combining a lexical `match` with dense retrieval.

    Args:
        aos_client: Client object exposing `transport.perform_request`.
        index_name: Name of the index to search.
        query: Query text used by both sub-queries.
        dense_model_id: ID of the dense embedding model.
        topk: Number of documents to return (request `size`).

    Returns:
        List with the `_source.content` of every returned hit, in response order.
    """
    request_body = {
        "size": topk,
        "query": {
            "hybrid": {
                "queries": [
                    {
                        # Each entry of `queries` is itself a query clause, so the
                        # `match` goes here directly (no extra "query" wrapper) and
                        # targets the indexed `content` field, as search_by_bm25 does.
                        "match": {
                            "content": query
                        }
                    },
                    {
                        "neural": {
                            "dense_embedding": {
                                "query_text": query,
                                "model_id": dense_model_id,
                                "k": 10
                            }
                        }
                    }
                ]
            }
        }
    }

    # NOTE(review): the pipeline name is spelled "hybird" here; it is kept verbatim
    # because it has to match the name the setup script registered — confirm there.
    response = aos_client.transport.perform_request(
        method="GET",
        url=f"/{index_name}/_search?search_pipeline=hybird-search-pipeline",
        body=json.dumps(request_body)
    )

    return [hit["_source"]["content"] for hit in response["hits"]["hits"]]

In [40]:
import time

# Hit-rate benchmark for BM25: a query counts as a hit when the context it was
# written for is among the returned documents.
hit_cnt = 0
miss_cnt = 0
start = time.time()
for idx, item in enumerate(dataset["train"]):
    query = item['question']
    # Documents were ingested as context[:2000], so compare against the same
    # truncated text; otherwise contexts longer than 2000 characters can never hit.
    content = item['context'][:2000]
    if idx % 200 == 0:
        print(f"{idx}-th searched.")
    if idx == 1000:
        break
    results = search_by_bm25(aos_client, index_name, query)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

elpase_time = time.time() - start
print(f"[search_by_bm25] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
200-th searched.
400-th searched.
600-th searched.
800-th searched.
1000-th searched.
[search_by_bm25] hit:605, miss:395, elpase_time:33.366294384002686


In [None]:
import time

# Hit-rate benchmark for dense retrieval: a query counts as a hit when the
# context it was written for is among the returned documents.
hit_cnt = 0
miss_cnt = 0
start = time.time()
dense_model_id = "GLthh4wBsY2vwfNenYNZ"
for idx, item in enumerate(dataset["train"]):
    query = item['question']
    # Documents were ingested as context[:2000], so compare against the same
    # truncated text; otherwise contexts longer than 2000 characters can never hit.
    content = item['context'][:2000]
    if idx % 200 == 0:
        print(f"{idx}-th searched.")
    if idx == 1000:
        break
    results = search_by_dense(aos_client, index_name, query, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

elpase_time = time.time() - start
print(f"[search_by_dense] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

In [None]:
# Hit-rate benchmark for sparse retrieval: a query counts as a hit when the
# context it was written for is among the returned documents.
hit_cnt = 0
miss_cnt = 0
# sparse_model_id is the value set in the configuration cell near the top of the
# notebook; the `<sparse_model_id>` placeholder that stood here was not valid Python.
for idx, item in enumerate(dataset["train"]):
    query = item['question']
    # Documents were ingested as context[:2000], so compare against the same
    # truncated text; otherwise contexts longer than 2000 characters can never hit.
    content = item['context'][:2000]
    if idx % 200 == 0:
        print(f"{idx}-th searched.")
    # Only rows 0..1000 were ingested; stop at the same point as the BM25 and
    # dense benchmarks so the counts stay comparable.
    if idx == 1000:
        break
    results = search_by_sparse(aos_client, index_name, query, sparse_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

print(f"[search_by_sparse] hit:{hit_cnt}, miss:{miss_cnt}")

In [None]:
# Hit-rate benchmark for the dense + sparse hybrid search: a query counts as a hit
# when the context it was written for is among the returned documents.
hit_cnt = 0
miss_cnt = 0
# sparse_model_id comes from the configuration cell and dense_model_id from the
# dense benchmark cell above; the `<...>` placeholders that stood here were not
# valid Python.
for idx, item in enumerate(dataset["train"]):
    query = item['question']
    # Documents were ingested as context[:2000], so compare against the same
    # truncated text; otherwise contexts longer than 2000 characters can never hit.
    content = item['context'][:2000]
    if idx % 200 == 0:
        print(f"{idx}-th searched.")
    # Only rows 0..1000 were ingested; stop at the same point as the BM25 and
    # dense benchmarks so the counts stay comparable.
    if idx == 1000:
        break
    results = search_by_dense_sparse(aos_client, index_name, query, sparse_model_id, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

print(f"[search_by_dense_sparse] hit:{hit_cnt}, miss:{miss_cnt}")

In [None]:
# Hit-rate benchmark for the dense + BM25 hybrid search: a query counts as a hit
# when the context it was written for is among the returned documents.
hit_cnt = 0
miss_cnt = 0
# dense_model_id is the value set in the dense benchmark cell above; the
# `<dense_model_id>` placeholder that stood here was not valid Python.
for idx, item in enumerate(dataset["train"]):
    query = item['question']
    # Documents were ingested as context[:2000], so compare against the same
    # truncated text; otherwise contexts longer than 2000 characters can never hit.
    content = item['context'][:2000]
    if idx % 200 == 0:
        print(f"{idx}-th searched.")
    # Only rows 0..1000 were ingested; stop at the same point as the BM25 and
    # dense benchmarks so the counts stay comparable.
    if idx == 1000:
        break
    results = search_by_dense_bm25(aos_client, index_name, query, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

print(f"[search_by_dense_bm25] hit:{hit_cnt}, miss:{miss_cnt}")