In [None]:
!pip install -q datasets
!pip3 install boto3 requests requests_aws4auth argparse opensearch-py

### 1. Download Experiment Data - squad_v2 (下载实验数据squad_v2）

In [11]:
from datasets import load_dataset

dataset_name = "squad_v2"

dataset = load_dataset(dataset_name)

print(dataset)

sample = dataset["train"][0]
print(sample)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "C

### 2. Setup OpenSearch Index & Model For Experiment(创建用于测试的AOS索引 & 模型) 

- Setup Sparse vector model(创建Sparse Vector模型)
  + 进入OpenSearch的Integration页面, 由于咱们OpenSearch集群是部署在VPC中的，所以选择“Configure VPC Domain”，会弹出一个Cloudformation模版填写。
    ![integration_1.png](./integration_1.png)<br>
    + vpc请选择OpenSearch所在的vpc，security group选择为OpenSearch同一个
    + 子网subnet请选择对应的Private subnet 
    <br>
  + 验证部署的nerual-sparse模型
    进入cloudformation对应stack，切换到output, 获取modelId, ConnecterId 以及Sagemaker endpoint
    ![nerual-sparse.png](./nerual-sparse.png)

- Setup Cohere Multilingual Model & ingestion pipeline(创建cohere模型以及ingest的pipeline)

In [21]:
aos_endpoint='vpc-domain66ac69e0-2m4jji7cweof-4fefsofiqdzu3hxammxwq5hth4.us-west-2.es.amazonaws.com'
# sparse_model_id=<sparse_model_id> # look for sparse_model_id in the output of Cloudformation
index_name="aos-retrieval"

In [None]:
!python3 setup_model_and_pipeline.py --aos_endpoint {aos_endpoint} --sparse_model_id {sparse_model_id} --index_name {index_name}

- Extarct dense_model_id for query embedding from Output of previous cell(根据上个Cell的输出提取query向量化的模型ID)

In [None]:
dense_model_id=<dense_model_id>

### 3. Ingest Data（执行数据摄入）

In [29]:
import json
from setup_model_and_pipeline import get_aos_client

def ingest_data(aos_client, index_name, content):    
    request_body = {
        "content": content
    }

    response = aos_client.transport.perform_request(
        method="POST",
        url=f"/{index_name}/_doc",
        body=json.dumps(request_body)
    )

    return response

aos_client = get_aos_client(aos_endpoint)

for idx, item in enumerate(dataset["train"].select(range(1000))):
    try:
        response = ingest_data(aos_client, index_name, item['context'][:2000])
        if idx % 50 == 0:
            print(f"{idx}-th ingested.")
    except Exception as e:
        print(e)
        print(item['context'])

0-th ingested.
200-th ingested.
400-th ingested.
600-th ingested.
800-th ingested.
1000-th ingested.


### 4. Search benchmark （查询性能测试）

In [69]:
from search_func import search_by_bm25, search_by_dense, search_by_sparse, search_by_dense_sparse, search_by_dense_bm25

In [63]:
import time

hit_cnt = 0
miss_cnt = 0
start = time.time()
for idx, item in enumerate(dataset["train"].select(range(1000))):
    query = item['question']
    content = item['context']
    if idx % 50 == 0:
        print(f"{idx}-th searched.")
    results = search_by_bm25(aos_client, index_name, query)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1
        
elpase_time = time.time() - start
print(f"[search_by_bm25] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
200-th searched.
400-th searched.
600-th searched.
800-th searched.
[search_by_bm25] hit:605, miss:395, elpase_time:31.39031982421875


In [71]:
import time

hit_cnt = 0
miss_cnt = 0
start = time.time()
for idx, item in enumerate(dataset["train"].select(range(1000))):
    query = item['question']
    content = item['context']
    if idx % 50 == 0:
        print(f"{idx}-th searched.")
    results = search_by_dense(aos_client, index_name, query, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1
        
elpase_time = time.time() - start
print(f"[search_by_dense] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
200-th searched.
400-th searched.
600-th searched.
800-th searched.
[search_by_dense] hit:772, miss:228, elpase_time:391.7816433906555


In [72]:
hit_cnt = 0
miss_cnt = 0
sparse_model_id = 'EVNZh4wBpwn7Z6ncaEtm'
start = time.time()
for idx, item in enumerate(dataset["train"].select(range(1000))):
    query = item['question']
    content = item['context']
    if idx % 50 == 0:
        print(f"{idx}-th searched.")
    results = search_by_sparse(aos_client, index_name, query, sparse_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

elpase_time = time.time() - start
print(f"[search_by_sparse] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
200-th searched.
400-th searched.
600-th searched.
800-th searched.
[search_by_sparse] hit:718, miss:282, elpase_time:459.3156998157501


In [77]:
hit_cnt = 0
miss_cnt = 0
start = time.time()
for idx, item in enumerate(dataset["train"].select(range(1000))):
    query = item['question']
    content = item['context']
    if idx % 50 == 0:
        print(f"{idx}-th searched.")
    results = search_by_dense_sparse(aos_client, index_name, query, sparse_model_id, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

elpase_time = time.time() - start
print(f"[search_by_dense_sparse] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
50-th searched.
100-th searched.
150-th searched.
200-th searched.
250-th searched.
300-th searched.
350-th searched.
400-th searched.
450-th searched.
500-th searched.
550-th searched.
600-th searched.
650-th searched.
700-th searched.
750-th searched.
800-th searched.
850-th searched.
900-th searched.
950-th searched.
[search_by_dense_sparse] hit:840, miss:160, elpase_time:471.84466671943665


In [82]:
hit_cnt = 0
miss_cnt = 0
dense_model_id = "GLthh4wBsY2vwfNenYNZ"
start = time.time()
for idx, item in enumerate(dataset["train"].select(range(1000))):
    query = item['question']
    content = item['context']
    if idx % 50 == 0:
        print(f"{idx}-th searched.")
    results = search_by_dense_bm25(aos_client, index_name, query, dense_model_id)
    if content in results:
        hit_cnt += 1
    else:
        miss_cnt += 1

elpase_time = time.time() - start
print(f"[search_by_dense_sparse] hit:{hit_cnt}, miss:{miss_cnt}, elpase_time:{elpase_time}")

0-th searched.
50-th searched.
100-th searched.
150-th searched.
200-th searched.
250-th searched.
300-th searched.
350-th searched.
400-th searched.
450-th searched.
500-th searched.
550-th searched.
600-th searched.
650-th searched.
700-th searched.
750-th searched.
800-th searched.
850-th searched.
900-th searched.
950-th searched.
[search_by_dense_sparse] hit:725, miss:275, elpase_time:409.31755113601685
