In [None]:
import os
import json
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

from tqdm import tqdm

# Turn off parallelism in the `tokenizers` library for this process.
# NOTE(review): presumably set to silence the Hugging Face fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Location of the ctags `tags` file inside the downloaded example repository.
ctags_path = './repo/langchain/libs/langchain/tags'
# Stop early with a hint when the tags file has not been generated yet.
# NOTE(review): the script name in the message looks like a typo for
# `download_example_repo.sh` — confirm the real file name before changing the string.
assert os.path.isfile(ctags_path), "Please run `zsh download_example_rpo.sh` first"

In [None]:
# Azure AI Search settings, read from the process environment (load_dotenv() is called
# in the first cell, so they may come from a .env file). A missing variable raises KeyError.
# AZURE_SEARCH_SERVICE is passed as the first argument to both search clients below;
# the two *_INDEX values are index names.
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_TINY_INDEX = os.environ["AZURE_SEARCH_TINY_INDEX"]
AZURE_SEARCH_BIGGER_INDEX = os.environ["AZURE_SEARCH_BIGGER_INDEX"]

## Helper functions

In [None]:
def read_tags_file(file_path: str) -> list[dict]:
    """Parse a ctags ``tags`` file into a list of tag records.

    Every tag line is split on tab characters. The first three fields are taken
    as the tag name, the file name and the search pattern. Lines that start
    with ``!`` (metadata) and lines with fewer than four tab-separated fields
    are skipped.

    Args:
        file_path: Path to the ctags file to read.

    Returns:
        One dict per accepted line, in file order, with the keys
        ``tag_name``, ``file_name`` and ``pattern``.
    """
    tags = []
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding; bytes that cannot be decoded are dropped (errors='ignore').
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        # Stream the file line by line instead of loading it all with readlines().
        for line in file:
            if line.startswith('!'):  # Skip metadata lines
                continue
            parts = line.split('\t')
            # Requiring a fourth field also guarantees that the pattern (third
            # field) never carries the line's trailing newline.
            if len(parts) >= 4:
                tag_name, file_name, pattern = parts[:3]
                tags.append(dict(tag_name=tag_name, file_name=file_name, pattern=pattern))

    return tags

In [None]:
# Sentence-transformers model used for every embedding in this notebook (vector length 384).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
# Read the vector size from the wrapped client instead of hard-coding it.
embedding_dimension = embeddings.client.get_sentence_embedding_dimension()


def get_embeddings(text: str) -> list:
    """Return the vector that the module-level `embeddings` model produces for `text`."""
    vector = embeddings.embed_query(text)
    return vector

## Init Azure client

In [None]:
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
# from azure.search.documents.indexes.models import SimpleField, SearchField, SearchFieldDataType, SearchIndex
from azure.search.documents.indexes.models import (
    HnswParameters,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)

from azure.search.documents.models import (
    VectorQuery,
    # VectorizedQuery, # work in 11.4.0b10
    RawVectorQuery # work in 11.4.0
)


# The imports below only work with azure-search-documents==11.4.0b10

# from azure.search.documents.indexes.models import (
#     HnswParameters,
#     SearchableField,
#     SearchField,
#     SearchFieldDataType,
#     SearchIndex,
#     SimpleField,
#     VectorSearch,
#     VectorSearchAlgorithmKind,
#     VectorSearchProfile,
#     HnswAlgorithmConfiguration
# )

In [None]:
# One credential object is shared by both clients.
creds = DefaultAzureCredential()

# Client for index-level operations on the search service.
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_SERVICE, credential=creds)

# Client bound to the tiny index; used below for uploading and querying documents.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE,
    index_name=AZURE_SEARCH_TINY_INDEX,
    credential=creds,
)

### Create index

In [None]:
# SimpleField has a "hidden" argument that defaults to False; hidden=False means retrievable=True

# fields=[
#     SimpleField(name="id", type=SearchFieldDataType.String, key=True),
#     SimpleField(name="metadata", type=SearchFieldDataType.String, key=True),
#     SearchField(
#         name=f"{AZURE_SEARCH_TINY_INDEX}_vector", 
#         type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
#         searchable=True,
#         vector_search_dimensions=embedding_dimension,
#         vector_search_profile="vprofile"
#     )
# ]

In [None]:
# vector_search=VectorSearch(
#     algorithms=[
#         HnswAlgorithmConfiguration(
#             name="hnsw_algo_config",
#             kind=VectorSearchAlgorithmKind.HNSW,
#             parameters=HnswParameters(metric="cosine"),
#         )
#     ],
#     profiles=[
#         VectorSearchProfile(
#             name="vprofile",
#             algorithm_configuration_name="hnsw_algo_config",
#         ),
#     ],
# )

In [None]:
# example: https://github.com/Azure-Samples/azure-search-openai-demo/blob/87d15fc021a2a84c4cbcbec56b2fe0560af05d52/scripts/prepdocslib/searchmanager.py#L93-L121
# index = SearchIndex(
#     name=AZURE_SEARCH_TINY_INDEX, 
#     fields=fields,
#     vector_search=vector_search
# )

# await index_client.create_index(index)

### Read ctags, embed them, and upload to Azure

In [None]:
ctags_root_path = os.path.dirname(ctags_path)

# Only the first 1000 tags are embedded and uploaded.
tags = read_tags_file(ctags_path)[:1000]

# Build one search document per tag: a sequential string id, the tag serialized
# as JSON, and the embedding of "<file name> | <tag name> ".
documents = []
for idx, tag in enumerate(tqdm(tags)):
    embedding_input = f"{tag['file_name']} | {tag['tag_name']} "
    documents.append(
        {
            "id": str(idx),
            "metadata": json.dumps(tag),
            "poc_vector": get_embeddings(embedding_input),
        }
    )

In [None]:
# Send the prepared documents to the index that `search_client` is bound to.
# As the last expression of the cell, the call's return value is shown as the cell output.
search_client.upload_documents(documents=documents)

### Search using vector similarity

In [None]:
text = "What is langchain?"

# Name of the field that holds the embedding in each uploaded document. The vector
# query has to target this field; the index name is not a field of the documents.
vector_field = "poc_vector"

# use VectorizedQuery in 11.4.0
# NOTE(review): the import cell's comments give the opposite version mapping for
# RawVectorQuery / VectorizedQuery — confirm against the installed SDK version.
r = search_client.search(
    None,  # no text query: results are ranked by vector similarity only
    vector_queries=[
        RawVectorQuery(vector=get_embeddings(text), k=3, fields=vector_field)
    ],
)
for doc in r:
    print(f"id: {doc['id']}, score: {doc['@search.score']}")