### Setup

In [1]:
import json
import os

import nltk
import numpy as np
import openai
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import RawVectorQuery
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Tokenizer models / stopword lists used by the nltk helpers imported above.
nltk.download('punkt')
nltk.download('stopwords')

# Service-specific settings. They are read from environment variables so the
# endpoint and API key never have to be pasted into (and saved with) the
# notebook. When a variable is unset the value falls back to an empty string.
# Make sure your user has access to these resources, or use keys for auth.
#AZURE_OPENAI_URL = "<<your openai service url>>" # your Azure OpenAI instance
AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE", "")
AZURE_SEARCH_APIKEY = os.environ.get("AZURE_SEARCH_APIKEY", "")

# Key-based credential shared by the index and search clients created later.
creds = AzureKeyCredential(AZURE_SEARCH_APIKEY)

[nltk_data] Downloading package punkt to /home/U4VN/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/U4VN/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from numpy import dot
from numpy.linalg import norm
def cosine_similarity(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        a: first vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept).
        b: second vector, same length as ``a``.

    Returns:
        The similarity value, or ``0.0`` when either vector has zero norm
        (the plain formula would divide 0 by 0 and yield ``nan``).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of nan.
        return 0.0
    return dot(a, b) / denominator

### Vector representations

In [4]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used for every encode() call in this notebook
# (presumably fetched by name on first use — confirm for offline runs).
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [5]:
# Small demo batch. NOTE: the name `sentences` is reassigned later, when the
# dataset is loaded, so these strings are only valid for this section.
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

# Sentences are encoded by calling model.encode(); the result's second axis
# is read in the next cell to obtain the embedding size.
embeddings = model.encode(sentences)



In [6]:
# Embedding dimensionality, read from the demo embeddings' second axis; used
# as the vector field dimension when the search index is defined.
EMB_SIZE = embeddings.shape[1]

### Document similarity modeled as cosine similarity

In [8]:
# The same reference sentence, compared against three different sentences.
sentences1 = ['The new movie is awesome'] * 3

sentences2 = [
    'The dog plays in the garden',
    'This recent movie is so good',
    'The new movie is awesome',
]

# Encode each sentence on its own, one model call per sentence.
embeddings1 = list(map(model.encode, sentences1))
embeddings2 = list(map(model.encode, sentences2))

# Print each pair side by side with its cosine-similarity score.
for i, (left, right) in enumerate(zip(sentences1, sentences2)):
    score = util.cos_sim(embeddings1[i], embeddings2[i])[0][0]
    print("{} \t\t {} \t\t Score: {}".format(left, right, score))

The new movie is awesome 		 The dog plays in the garden 		 Score: 0.08427257835865021
The new movie is awesome 		 This recent movie is so good 		 Score: 0.6956300139427185
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0


### Create Dataset

In [7]:
def load_squad_dataset(input_file):
    """Load a JSON Lines dataset file whose first line is a header.

    Args:
        input_file: path to a file with one JSON document per line. The first
            line is treated as a header and discarded.

    Returns:
        A list with one parsed JSON object per remaining non-blank line
        (empty when the file has no lines after the header).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        # Skip the header line (no-op on an empty file).
        next(file, None)
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            data.append(json.loads(line))
    return data

def get_contexts_questions(squad_data, sentence_tokenizer=None):
    """Flatten the dataset into sentences and questions, linking each question
    to the sentence(s) in which one of its answer spans starts.

    Args:
        squad_data: iterable of records. Each record has a 'context' string
            and a 'qas' list; each qa has a 'question' and 'detected_answers',
            of which only the first entry's 'char_spans' are read (element 0
            of every span is used as the answer's start offset).
        sentence_tokenizer: optional callable mapping a context string to a
            list of sentence strings. Defaults to nltk's ``sent_tokenize``.

    Returns:
        A tuple ``(sentences, questions, q2s)``:
        * sentences: all context sentences, in record order.
        * questions: all questions, in record order.
        * q2s: dict mapping a question's index in ``questions`` to the sorted
          list of indices into ``sentences`` containing an answer start. Each
          index appears once; the list is empty when nothing matches.
    """
    if sentence_tokenizer is None:
        sentence_tokenizer = sent_tokenize

    sentences = []
    questions = []
    q2s = {}
    for data in squad_data:
        first_sent_idx = len(sentences)
        context = data['context']
        context_sentences = sentence_tokenizer(context)
        sentences.extend(context_sentences)

        # Locate every sentence once per context. Searching from the end of
        # the previous sentence keeps a repeated sentence from being mapped
        # back onto its first occurrence.
        sentence_spans = []
        cursor = 0
        for s in context_sentences:
            start_index = context.find(s, cursor)
            if start_index < 0:
                # Fall back to a whole-context search for out-of-order output.
                start_index = context.find(s)
                if start_index < 0:
                    # Not a verbatim substring: empty span that never matches.
                    sentence_spans.append((-1, -1))
                    continue
            else:
                cursor = start_index + len(s)
            sentence_spans.append((start_index, start_index + len(s)))

        for qa in data['qas']:
            answer_starts = [span[0] for span in qa['detected_answers'][0]['char_spans']]
            q_idx = len(questions)
            questions.append(qa['question'])
            q2s[q_idx] = [
                first_sent_idx + idx
                for idx, (start_index, end_index) in enumerate(sentence_spans)
                if any(start_index <= answer_start < end_index for answer_start in answer_starts)
            ]

    return sentences, questions, q2s

In [8]:
# Load the DROP dev split from the working directory and report the number
# of records read.
squad_data = load_squad_dataset('./DROP-dev.jsonl')
print(len(squad_data))
# Flatten into sentences / questions and the question -> sentence mapping.
# NOTE: this rebinds `sentences`, replacing the demo list defined earlier.
sentences, questions, q2s = get_contexts_questions(squad_data)
print(len(sentences), len(questions), len(q2s))


281
2514 1503 1503


### Create a vector index

In [9]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *

In [11]:
# Name of the search index that is created, filled and queried below.
AZURE_SEARCH_INDEX = "dropqa" 

In [28]:


# Index schema: a string key, the sentence text, and a vector field holding
# the sentence embedding (EMB_SIZE dimensions), searched with an HNSW
# algorithm configured for the cosine metric.
# NOTE(review): the queries later in this notebook pass
# semantic_configuration_name="default", but no semantic configuration is
# declared here — presumably it was added to the index out of band; confirm.
index = SearchIndex(
    name=AZURE_SEARCH_INDEX,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(name="content", type=SearchFieldDataType.String),
        SearchField(name="embedding",
                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True,
                    vector_search_dimensions=EMB_SIZE,
                    vector_search_profile="vprofile")
    ],
    vector_search=VectorSearch(
        algorithms=[HnswVectorSearchAlgorithmConfiguration(name="algo", parameters=HnswParameters(metric="cosine"))],
        profiles=[VectorSearchProfile(name="vprofile", algorithm="algo")]
    )
)

index_client = SearchIndexClient(AZURE_SEARCH_SERVICE, credential=creds)
# create_or_update_index keeps this cell re-runnable: create_index fails once
# the index already exists, which breaks "Restart & Run All".
index_client.create_or_update_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x2b5b47fe7490>

### Insert the dataset

In [12]:
# Show which service endpoint and index the following cells will target.
print(AZURE_SEARCH_SERVICE, AZURE_SEARCH_INDEX)

https://sesame.search.windows.net dropqa


In [13]:
# Client bound to the target index; used below for uploads and queries.
search_client = SearchClient(AZURE_SEARCH_SERVICE, AZURE_SEARCH_INDEX, credential=creds)

In [30]:
# Embed every dataset sentence in one call, with a progress bar.
embs = model.encode(sentences, show_progress_bar=True)

Batches: 100%|██████████| 1955/1955 [28:38<00:00,  1.14it/s] 


In [31]:
# One search document per sentence: its position as the string key, the
# embedding converted to a plain list, and the sentence text.
docs = [
    {"id": str(row), "embedding": embs[row].tolist(), "content": text}
    for row, text in enumerate(sentences)
]

In [32]:


batch_size = 1000
doc_batches = [docs[i:i+batch_size] for i in range(0, len(docs), batch_size)]

# Upload documents in batches. upload_documents returns one result per
# document; collect the keys that were not indexed instead of discarding
# the results, so a partially filled index cannot silently skew the benchmark.
failed_keys = []
for batch in doc_batches:
    upload_results = search_client.upload_documents(documents=batch)
    failed_keys.extend(result.key for result in upload_results if not result.succeeded)

# Every batch is attempted before failing, so the error reports the full count.
if failed_keys:
    raise RuntimeError(
        f"{len(failed_keys)} of {len(docs)} documents failed to index; first ids: {failed_keys[:10]}"
    )

In [33]:
# Marker that the upload cell above has completed.
print('Indexing finished')

Indexing finished


In [None]:
# Sanity check: number of documents prepared for upload.
print(len(docs))

### Search using vector similarity

#### Hybrid retrieval

#### Hybrid + Semantic Reranking

In [15]:
# Pick one benchmark question and show it.
idx = 111
q = questions[idx]
print(f'Question: {q}')
#print(f'Correct Sentence: {sentences[q2s[idx]]}')
print()

# Hybrid query: the question text plus its embedding as a vector query over
# the "embedding" field, with semantic reranking requested on top.
query_vector = RawVectorQuery(vector=model.encode(q), k=50, fields="embedding")
r = search_client.search(
    q,
    top=5,
    vector_queries=[query_vector],
    query_type="semantic",
    semantic_configuration_name="default",
    query_language="en-us",
)

# One line per hit, with both the search score and the reranker score.
for doc in r:
    print(f"id: {doc['id']}, {doc['content']}, score: {doc['@search.score']}, reranker: {doc['@search.reranker_score']}")

Question: Which mobile phone operator has more users, Mtel or Vivacom?

id: 148, Currently there are three active mobile phone operators—Mobiltel, Telenor and Vivacom, Mtel is the largest one with 5.2 million users as of 2010, Telenor has 3,9 million as of 2007 and Vivacom over 1 million., score: 0.03333333507180214, reranker: 3.6415441036224365
id: 2047, This is compared with 859.9 in 2008 and 964.7 in 1990., score: 0.012345679104328156, reranker: 0.9217217564582825
id: 1053, The population density was ., score: 0.012048192322254181, reranker: 0.515778124332428
id: 2057, Privatization of the state-owned telecommunications firm Český Telecom took place in 2005., score: 0.014492753893136978, reranker: 0.24234060943126678
id: 1817, Chinese in Spain number over 166,000., score: 0.013333333656191826, reranker: 0.18412983417510986


### Run Benchmark

In [17]:
# Number of results requested per benchmark query (passed as `top`), and the
# cutoff shown in the reported "MRR@..." label.
TOP_K = 100

In [None]:
import concurrent.futures

# Reciprocal rank of every scored question; filled by the benchmark loop.
reciprocal_ranks = []
# Thread-pool size, i.e. the upper bound on questions processed concurrently.
MAX_CONCURRENT_TASKS = 8

def process_question(question, idx, q2s, model, search_client):
    """Run one benchmark query and return its reciprocal rank.

    Args:
        question: the question text, used as the full-text query and embedded
            for the vector query.
        idx: key of this question in ``q2s``.
        q2s: mapping from question index to the ids of the correct sentences.
        model: object whose ``encode(text, show_progress_bar=False)`` returns
            the question embedding.
        search_client: object whose ``search(...)`` yields hits with an 'id'.

    Returns:
        ``None`` when the question has no correct sentence (missing or empty
        entry) and must be left out of the average; otherwise ``1 / rank`` of
        the first hit whose id is a correct sentence, or ``0.0`` when none of
        the returned hits is correct.
    """
    idx_correct = q2s[idx]
    # q2s values are lists, so an unanswerable question shows up as an empty
    # list rather than None. Checked before encoding to avoid wasted work.
    if not idx_correct:
        return None

    question_embedding = model.encode(question, show_progress_bar=False)
    results = search_client.search(question, top=TOP_K,
                                   vector_queries=[RawVectorQuery(vector=question_embedding, k=50, fields="embedding")],
                                   query_type="semantic", semantic_configuration_name="default", query_language="en-us")

    gold_ids = set(idx_correct)
    for rank, hit in enumerate(results, start=1):
        if int(hit['id']) in gold_ids:
            return 1 / rank
    return 0.0

# Fan every question out to a thread pool, then gather reciprocal ranks as
# the tasks finish (completion order, not submission order).
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_CONCURRENT_TASKS) as executor:
    futures = [
        executor.submit(process_question, question, idx, q2s, model, search_client)
        for idx, question in enumerate(questions)
    ]

    completed = concurrent.futures.as_completed(futures)
    for future in tqdm(completed, total=len(questions)):
        rr = future.result()
        if rr is None:
            # Question was skipped by process_question; keep it out of the mean.
            continue
        reciprocal_ranks.append(rr)

print(f'MRR@{TOP_K} for {AZURE_SEARCH_INDEX} = {np.mean(reciprocal_ranks)}')


 59%|█████▉    | 891/1503 [04:42<03:04,  3.31it/s]