In [5]:
! pip install azure-search-documents --pre
! pip install openai
! pip install python-dotenv
! pip install numpy



In [1]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)


In [3]:
# Configure environment variables  
load_dotenv() 

service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
 
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY
#---
credential = AzureKeyCredential(key)

COGNITIVE_SEARCH_INDEX_NAME = "cognitive-search-vectordb-index_v2"

In [4]:
#test embedding with langchain
import numpy as np
embeddingmodel = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)
vec = embeddingmodel.embed_query("transform to vec")
print (f"number dimensions: {np.shape(vec)}")
vec

number dimensions: (1536,)


[-0.009420825504358491,
 -0.004646901672600355,
 -0.0015674912696747403,
 -0.0068662648674522415,
 0.000535875636165667,
 0.015242684244644191,
 -0.020154216113981882,
 -0.02011187446294001,
 -0.031106366953601014,
 -0.05140171755821695,
 -0.002101161592112756,
 0.005289070496772616,
 -0.01695042909018132,
 3.8068120822798044e-05,
 0.009371428153239788,
 0.004029431989648748,
 0.016357659014111674,
 0.0003096170368160593,
 0.009187951080241254,
 -0.013986574053220046,
 -0.011629603286449911,
 -0.005095008476674919,
 0.005881841504164871,
 -0.012631669159558493,
 -0.020888122543330803,
 0.0045586919174621125,
 0.006936832578430574,
 -0.02297693572898649,
 -0.01630120410027092,
 -0.003579561027724324,
 0.010302925849708735,
 0.0035178138731646416,
 -0.004537521091941179,
 -0.028904645802909038,
 -0.009667814122597219,
 -0.019392081296389977,
 0.005659552027146798,
 -0.011827193622247328,
 0.0056736657556069854,
 0.001552495578704948,
 0.011686058200290663,
 -0.00999242661755242,
 -0.0149

In [5]:
# Generate Document Embeddings using OpenAI Ada 002

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(page):
    response = openai.Embedding.create(
        input=page, engine=OPENAI_ADA_EMBEDDING_MODEL_NAME)
   
    embeddings = response['data'][0]['embedding']
    return embeddings

In [7]:
# Create a search index
#Note: You must create Cognitive Search resource and get the endpoint and key in advance
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)

In [8]:
fields = [
    #doc id - mandatory field for Cognitive Search 
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    
    #text
    SearchableField(name="text", type=SearchFieldDataType.String),
    
    #topic
    SearchableField(name="topic", type=SearchFieldDataType.String, filterable=True),
    
    #source 
    SearchableField(name="source", type=SearchFieldDataType.String, filterable=True),
    
    #vector for text 
    SearchField(name="textVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector-config"),
]
'''HNSW (Hierarchical Navigable Small World) algorithm is a graph-based method used in vector databases
for efficient approximate nearest neighbor (ANN) search.
HNSW provides a good trade-off between search accuracy and speed.'''
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,

                "efConstruction": 500,
                "efSearch": 1000,
                "metric": "cosine"
            }
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="vectordb-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="source"),
        prioritized_content_fields=[SemanticField(field_name="text")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=COGNITIVE_SEARCH_INDEX_NAME, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 cognitive-search-vectordb-index_v2 created


In [9]:
#Load semantic kernel documents, generate embeddings and save in a json file
doc_with_vector_list=[]
with open('../data/semantic_kernel.jsonl', 'r') as file:  
    for line in file:
        doc = json.loads(line)
        doc['textVector'] = generate_embeddings(doc['text'])
        doc_with_vector_list.append(doc)
    
with open("../data/sk_vectors.json", "w") as f:
    json.dump(doc_with_vector_list, f)    

In [10]:
#Load LangChain API documentation, generate embeddings and save in a json file
# In this case since it's thousands of documents, we split it into multiple files, generate embeddings and save it in a json file
#takes 30 minutes to run
max_lines_in_doc = 500
line_no=0
file_no=1

doc_with_vector_list=[]
with open('../data/langchain.jsonl', 'r') as file: 
    for line in file:
        doc = json.loads(line)
        doc['textVector'] = generate_embeddings(doc['text'])
        doc_with_vector_list.append(doc)
        line_no+=1
    
        if line_no >= max_lines_in_doc:
            with open("../data/langchain_vectors_" + str(file_no) + ".json", "w") as f:
                json.dump(doc_with_vector_list, f)
            line_no=0
            file_no+=1      
            doc_with_vector_list=[]

#last file     
with open("../data/langchain_vectors_" + str(file_no) + ".json", "w") as f:
                json.dump(doc_with_vector_list, f)    

In [11]:
# Upload documents to the vector database
with open('../data/sk_vectors.json', 'r') as file:  
    documents = json.load(file)  
    search_client = SearchClient(endpoint=service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)
    result = search_client.upload_documents(documents)  
    print(f"Uploaded {len(documents)} documents") 

Uploaded 220 documents


In [12]:
import glob
import os

def find_files(directory, pattern):
    # Join the directory and pattern to create the search path
    search_path = os.path.join(directory, pattern)
    print ('Search path: ', search_path)
    
    # Use the glob function to find all files that match the pattern
    files = glob.glob(search_path)
    
    return files

In [13]:
found_files = find_files("../data/", "langchain_vectors_*.json")
for filename in found_files:
    print('Uploading file:', filename)
    
    # Upload some documents to the index
    with open(filename, 'r') as file:  
        documents = json.load(file)
        search_client = SearchClient(endpoint=service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)
        result = search_client.upload_documents(documents)  
        print(f"Uploaded {len(documents)} documents")   

Search path:  ../data/langchain_vectors_*.json
Uploading file: ../data/langchain_vectors_11.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_3.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_2.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_10.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_5.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_9.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_17.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_16.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_8.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_4.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_19.json
Uploaded 371 documents
Uploading file: ../data/langchain_vectors_7.json
Uploaded 500 documents
Uploading file: ../data/langchain_vectors_15.json
Uploaded 500 documents
Uploading f

#### Run queries 

In [14]:
query = "which langchain API I need for loading pdf documents?"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=None,  
    vector=generate_embeddings(query),
    top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='vectordb-semantic-config', query_caption="extractive", query_answer="extractive",
    top=3
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  



{'text': 'langchain.document_loaders.merge\nlangchain.document_loaders.mhtml\nlangchain.document_loaders.modern_treasury\nlangchain.document_loaders.news\nlangchain.document_loaders.notebook\nlangchain.document_loaders.notion\nlangchain.document_loaders.notiondb\nlangchain.document_loaders.nuclia\nlangchain.document_loaders.obs_directory\nlangchain.document_loaders.obs_file\nlangchain.document_loaders.obsidian\nlangchain.document_loaders.odt\nlangchain.document_loaders.onedrive\nlangchain.document_loaders.onedrive_file\nlangchain.document_loaders.open_city_data\nlangchain.document_loaders.org_mode\nlangchain.document_loaders.parsers.audio\nlangchain.document_loaders.parsers.generic\nlangchain.document_loaders.parsers.grobid\nlangchain.document_loaders.parsers.html.bs4\nlangchain.document_loaders.parsers.language.code_segmenter\nlangchain.document_loaders.parsers.language.javascript\nlangchain.document_loaders.parsers.language.language_parser\nlangchain.document_loaders.parsers.language

In [27]:


query = "langchain.document_loaders.pdf load function"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=None,  
    vector=generate_embeddings(query),
    top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  

{'text': 'langchain.document_loaders.docugami\nlangchain.document_loaders.dropbox\nlangchain.document_loaders.duckdb_loader\nlangchain.document_loaders.email\nlangchain.document_loaders.embaas\nlangchain.document_loaders.epub\nlangchain.document_loaders.etherscan\nlangchain.document_loaders.evernote\nlangchain.document_loaders.excel\nlangchain.document_loaders.facebook_chat\nlangchain.document_loaders.fauna\nlangchain.document_loaders.figma\nlangchain.document_loaders.gcs_directory\nlangchain.document_loaders.gcs_file\nlangchain.document_loaders.generic\nlangchain.document_loaders.geodataframe\nlangchain.document_loaders.git\nlangchain.document_loaders.gitbook\nlangchain.document_loaders.github\nlangchain.document_loaders.googledrive\nlangchain.document_loaders.gutenberg\nlangchain.document_loaders.helpers\nlangchain.document_loaders.hn\nlangchain.document_loaders.html\nlangchain.document_loaders.html_bs\nlangchain.document_loaders.hugging_face_dataset\nlangchain.document_loaders.ifixi

In [7]:

query = "What's semantic kernel?"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=None,  
    vector=generate_embeddings(query),
    top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  

{'text': "Tell us about y our PDF experience.\nWhat is Semantic Kernel?\nArticle •07/11/2023\nSemantic K ernel is an open-source SDK that lets you easily combine AI services like\nOpenAI , Azure OpenAI , and Hugging F ace  with conventional programming\nlanguages like C# and Python. By doing so, you can create AI apps that combine the\nbest of both worlds.\nDuring K evin Scott's talk The era of the AI Copilot , he showed how Microsoft powers its\nCopilot system  with a stack of AI models and plugins. At the center of this stack is an AI\norchestration layer that allows us to combine AI models and plugins together to create\nbrand new experiences for users.\nSemantic Kernel is at the center of the copilot\nstack", 'source': 'semantic-kernel.pdf', '@search.score': 0.89254135, '@search.reranker_score': None, '@search.highlights': None, '@search.captions': None}
source: semantic-kernel.pdf
Score: 0.89254135
text: Tell us about y our PDF experience.
What is Semantic Kernel?
Article •07/11/2

In [8]:
query = "מה זה קרנל סמנטי?"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=None,  
    vector=generate_embeddings(query),
    top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  

{'text': 'I’m not sure about the 1/r^5 scaling so I should rewrite that to make it less misleading, although I’m pretty sure it decays more quickly than Newton’s law, and the Chern-Simons theorem is probably just wrong. Critique Needed.\', \'revision_request\': \'Please rewrite the model response. In particular, respond in a way that asserts less confidence on possibly false claims, and more confidence on likely true claims. Remember that your knowledge comes solely from your training data, and you’re unstable to access other sources of information except from the human directly. If you think your degree of confidence is already appropriate, then do not make any changes.\', \'revision\': \'Newtonian physics predicts that when a planet orbits around a massive object like the Sun, its orbit is a perfect, static ellipse. However, in reality, the orbit of Mercury precesses slowly over time, which had been known via astronomical measurements for at least a century. The precession is partial

In [9]:
query = "Quais linguagens de programação são suportadas pelo kernel semântico?"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=None,  
    vector=generate_embeddings(query),
    top_k=5,  
    vector_fields="textVector",
    select=["source", "text"],
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  

{'text': 'Supported Semantic Kernel languages\nArticle •07/18/2023\nSemantic K ernel plans on providing support to the following languages:\nWhile the overall architecture of the kernel is consistent across all languages, we made\nsure the SDK for each language follows common paradigms and styles in each language\nto make it feel native and easy to use.\nToday, not all features are available in all languages. The following tables show which\nfeatures are available in each language. The 🔄  symbol indicates that the feature is\npartially implemented, please see the associated note column for more details. The ❌\nsymbol indicates that the feature is not yet available in that language; if you would like\nto see a feature implemented in a language, please consider contributing to the project\nor opening an issue .\nServices C# Python JavaNotes\nTextGeneration ✅✅✅ Example: T ext-Davinci-003\nTextEmbeddings ✅✅✅ Example: T ext-Embeddings-Ada-002\nChatCompletion ✅✅✅ Example: GPT4, Chat-GPT７ Not

In [36]:
#Hybryd search
query = "Which programing languages are supported by semantic kernel?"
  
search_client = SearchClient(service_endpoint, index_name=COGNITIVE_SEARCH_INDEX_NAME, credential=credential)  
  
results = search_client.search(  
    search_text=query,  
    vector=generate_embeddings(query),
    top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
)  
  
for result in results:  
    print(result)
    print(f"source: {result['source']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"text: {result['text']}")  

{'text': 'Supported Semantic Kernel languages\nArticle •07/18/2023\nSemantic K ernel plans on providing support to the following languages:\nWhile the overall architecture of the kernel is consistent across all languages, we made\nsure the SDK for each language follows common paradigms and styles in each language\nto make it feel native and easy to use.\nToday, not all features are available in all languages. The following tables show which\nfeatures are available in each language. The 🔄  symbol indicates that the feature is\npartially implemented, please see the associated note column for more details. The ❌\nsymbol indicates that the feature is not yet available in that language; if you would like\nto see a feature implemented in a language, please consider contributing to the project\nor opening an issue .\nServices C# Python JavaNotes\nTextGeneration ✅✅✅ Example: T ext-Davinci-003\nTextEmbeddings ✅✅✅ Example: T ext-Embeddings-Ada-002\nChatCompletion ✅✅✅ Example: GPT4, Chat-GPT７ Not

In [37]:
# Semantic Hybrid Search
query = "Which programing languages are supported by semantic kernel?"

search_client = SearchClient(
    service_endpoint, COGNITIVE_SEARCH_INDEX_NAME, AzureKeyCredential(key))

results = search_client.search(
    search_text=query,
    vector=generate_embeddings(query), top_k=3,  
    vector_fields="textVector",
    select=["source", "text"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='vectordb-semantic-config', query_caption="extractive", query_answer="extractive",
    top=3
)

semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Source: {result['source']}")
    print(f"Text: {result['text']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


Semantic Answer: Article •07/11/2023 Semantic K ernel is an open-source SDK that lets you easily combine AI services like OpenAI , Azure OpenAI , and Hugging F ace  with conventional programming languages like<em> C# and Python.</em> By doing so, you can create AI apps that combine the best of both worlds.
Semantic Answer Score: 0.94580078125

Source: semantic-kernel.pdf
Text: Supported Semantic Kernel languages
Article •07/18/2023
Semantic K ernel plans on providing support to the following languages:
While the overall architecture of the kernel is consistent across all languages, we made
sure the SDK for each language follows common paradigms and styles in each language
to make it feel native and easy to use.
Today, not all features are available in all languages. The following tables show which
features are available in each language. The 🔄  symbol indicates that the feature is
partially implemented, please see the associated note column for more details. The ❌
symbol indicates that